SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
StreamingVwFeatures.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2009 Yahoo! Inc. All rights reserved. The copyrights
3  * embodied in the content of this file are licensed under the BSD
4  * (revised) open source license.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Written (W) 2011 Shashwat Lal Das
12  * Adaptation of Vowpal Wabbit v5.1.
13  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society.
14  */
15 
17 
18 using namespace shogun;
19 
21 {
22  init();
24 }
25 
27  bool is_labelled, int32_t size)
29 {
30  init(file, is_labelled, size);
32 }
33 
35  bool is_labelled, int32_t size)
37 {
38  init(file, is_labelled, size);
40 }
41 
43 {
44  if (parser.is_running())
45  parser.end_parser();
46  SG_UNREF(env);
47 }
48 
50 {
51  return new CStreamingVwFeatures(*this);
52 }
53 
55 {
56  parser.set_read_vector(&CStreamingFile::get_vector);
57 }
58 
60 {
61  parser.set_read_vector_and_label(&CStreamingFile::get_vector_and_label);
62 }
63 
65 {
67  {
69  parser.exit_parser();
70  parser.init(working_file, has_labels, parser.get_ring_size());
71  parser.set_free_vector_after_release(false);
72  parser.start_parser();
73  }
74  else
75  SG_ERROR("The input cannot be reset! Please use 1 pass.\n")
76 }
77 
79 {
80  return env;
81 }
82 
84 {
85  if(vw_env!=env)
86  {
87  SG_REF(vw_env);
88  SG_UNREF(env);
89  env = vw_env;
90  }
91 }
92 
94 {
95  int32_t dim = 1 << env->num_bits;
96  if (dim > len)
97  {
98  vec = SG_REALLOC(float32_t, vec, len, dim);
99  memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
100  len = dim;
101  }
102 }
103 
105 {
106  int32_t dim = 1 << env->num_bits;
107  if (dim > len)
108  {
109  vec = SG_REALLOC(float64_t, vec, len, dim);
110  memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
111  len = dim;
112  }
113 }
114 
116 {
117  float32_t wprime = 0;
118  if (gravity < fabsf(w))
119  wprime = CMath::sign(w)*(fabsf(w) - gravity);
120  return wprime;
121 }
122 
124 {
125  return current_length;
126 }
127 
129 {
130  if (current_example)
131  return 1;
132  else
133  return 0;
134 }
135 
137 {
138  return F_DREAL;
139 }
140 
141 void CStreamingVwFeatures::init()
142 {
143  working_file=NULL;
144  seekable=false;
145  current_length=-1;
146  current_example=NULL;
147  env=NULL;
148 
149  example_count = 0;
150 }
151 
152 void CStreamingVwFeatures::init(CStreamingVwFile* file, bool is_labelled, int32_t size)
153 {
154  init();
155  has_labels = is_labelled;
156  working_file = file;
157  parser.init(file, is_labelled, size);
158  parser.set_free_vector_after_release(false);
159  seekable=false;
160 
161  // Get environment from the StreamingVwFile
162  env = ((CStreamingVwFile*) file)->get_env();
163  SG_REF(env);
164 }
165 
166 void CStreamingVwFeatures::init(CStreamingVwCacheFile* file, bool is_labelled, int32_t size)
167 {
168  init();
169  has_labels = is_labelled;
170  working_file = file;
171  parser.init(file, is_labelled, size);
172  parser.set_free_vector_after_release(false);
173  seekable=true;
174 
175  // Get environment from the StreamingVwFile
176  env = ((CStreamingVwCacheFile*) file)->get_env();
177  SG_REF(env);
178 }
179 
// Prepare a freshly parsed example for learning: stamp it with pass/weight/t
// bookkeeping from the environment, drop ignored namespaces, append the
// constant feature, apply the weight stride and compute the feature count
// and sum-of-squares totals (including quadratic pairs).
//
// ae: example to set up; it is modified in place, and env->t is advanced by
//     the example's weight.
//
// NOTE(review): the listing's numbering jumps from 184 to 186 below, so one
// line of the original function is not visible here - confirm against the
// real source file before changing this code.
180 void CStreamingVwFeatures::setup_example(VwExample* ae)
181 {
182  ae->pass = env->passes_complete;
183  ae->num_features = 0;
// The running total starts at 1 rather than 0.
184  ae->total_sum_feat_sq = 1;
186  ae->global_weight = ae->ld->weight;
// Advance the environment's t by this example's weight and record the
// resulting value on the example.
187  env->t += ae->global_weight;
188  ae->example_t = env->t;
189 
190  // If some namespaces should be ignored, remove them
191  if (env->ignore_some)
192  {
193  for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
194  if (env->ignore[*i])
195  {
// Clear the namespace's features, shift the remaining indices down by one
// slot and shrink the index array.
196  ae->atomics[*i].erase();
197  memmove(i,i+1,(ae->indices.end - (i+1))*sizeof(vw_size_t));
198  ae->indices.end--;
// Step back so the loop's i++ re-examines the element that was just
// shifted into this slot.
199  i--;
200  }
201  }
202 
203  // Add constant feature
204  vw_size_t constant_namespace = 128;
// The constant feature's second initializer is constant_hash reduced by
// env->mask. NOTE(review): presumably the first field is the feature value
// and the second the weight index - confirm against VwFeature.
205  VwFeature temp = {1,constant_hash & env->mask};
206  ae->indices.push(constant_namespace);
207  ae->atomics[constant_namespace].push(temp);
208  ae->sum_feat_sq[constant_namespace] = 0;
209 
210  if(env->stride != 1)
211  {
212  // Make room for per-feature information.
213  vw_size_t stride = env->stride;
214  for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
215  for(VwFeature* j = ae->atomics[*i].begin; j != ae->atomics[*i].end; j++)
216  j->weight_index = j->weight_index*stride;
217  }
218 
// Linear part: count the features of every remaining namespace and add up
// their per-namespace sums of squares.
219  for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
220  {
221  ae->num_features += ae->atomics[*i].end - ae->atomics[*i].begin;
222  ae->total_sum_feat_sq += ae->sum_feat_sq[*i];
223  }
224 
225  // For quadratic features
226  for (int32_t k = 0; k < env->pairs.get_num_elements(); k++)
227  {
// i[0] and i[1] are the two namespace characters of the pair.
// NOTE(review): they are cast from char straight to int32_t; if plain char
// is signed and a namespace byte is above 127 the index would be negative -
// confirm whether such namespaces can occur.
228  char* i = env->pairs.get_element(k);
229 
// A pair contributes (features in first namespace) * (features in second).
230  ae->num_features
231  += (ae->atomics[(int32_t)(i[0])].end - ae->atomics[(int32_t)(i[0])].begin)
232  *(ae->atomics[(int32_t)(i[1])].end - ae->atomics[(int32_t)(i[1])].begin);
233 
234  ae->total_sum_feat_sq += ae->sum_feat_sq[(int32_t)(i[0])]*ae->sum_feat_sq[(int32_t)(i[1])];
235  }
236 }
237 
239 {
240  if (!parser.is_running())
241  parser.start_parser();
242 }
243 
245 {
246  parser.end_parser();
247 }
248 
250 {
251  bool ret_value;
252  ret_value = (bool) parser.get_next_example(current_example,
254  current_label);
255  if (current_length < 1)
256  return false;
257 
258  if (ret_value)
259  setup_example(current_example);
260  else
261  return false;
262 
265 
266  return ret_value;
267 }
268 
270 {
271  return current_example;
272 }
273 
275 {
277 
278  return current_label;
279 }
280 
282 {
283  env->example_number++;
285 
286  if (current_example->ld->label == FLT_MAX)
287  env->weighted_labels += 0;
288  else
290 
293 
295  parser.finalize_example();
296 }
297 
299 {
300  return current_length;
301 }
302 
304 {
306  return CMath::INFTY;
307 }
308 
310 {
311  float32_t ret = 0.;
312  for (vw_size_t* i = ex->indices.begin; i!= ex->indices.end; i++)
313  {
314  for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
315  ret += vec2[f->weight_index & env->thread_mask] * f->x;
316  }
317  return ret;
318 }
319 
321 {
322  return dense_dot(current_example, vec2);
323 }
324 
326 {
327  float32_t ret = 0.;
328  for (int32_t i = 0; i < vec1->num_feat_entries; i++)
329  ret += vec1->features[i].entry * vec2[vec1->features[i].feat_index & env->mask];
330 
331  return ret;
332 }
333 
335 {
336  float32_t ret = 0.;
337  for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
338  {
339  for (VwFeature* f = ex->atomics[*i].begin; f!= ex->atomics[*i].end; f++)
340  {
341  float32_t w = vec2[f->weight_index & env->thread_mask];
342  float32_t wprime = real_weight(w,gravity);
343  ret += wprime*f->x;
344  }
345  }
346 
347  return ret;
348 }
349 
350 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, VwExample* &ex, float32_t* vec2, int32_t vec2_len, bool abs_val)
351 {
352  if (abs_val)
353  {
354  for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
355  {
356  for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
357  vec2[f->weight_index & env->thread_mask] += alpha * abs(f->x);
358  }
359  }
360  else
361  {
362  for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
363  {
364  for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
365  vec2[f->weight_index & env->thread_mask] += alpha * f->x;
366  }
367  }
368 }
369 
370 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
371 {
372  add_to_dense_vec(alpha, current_example, vec2, vec2_len, abs_val);
373 }
374 
376 {
377  return current_length;
378 }
379 
381 {
382  return C_STREAMING_VW;
383 }
virtual void reset_stream()
Definition: StreamingFile.h:69
uint32_t vw_size_t
vw_size_t typedef to work across platforms
Definition: vw_constants.h:26
virtual void set_env(CVwEnvironment *vw_env)
T get_element(int32_t index) const
Definition: DynArray.h:142
float64_t weighted_examples
Weighted examples.
T * end
Pointer to last set element in the array.
Definition: v_array.h:160
virtual EFeatureClass get_feature_class() const
virtual float32_t dot(CStreamingDotFeatures *df)
T * begin
Pointer to first element of the array.
Definition: v_array.h:157
static const float64_t INFTY
infinity
Definition: Math.h:2048
CVwEnvironment * env
Environment for VW.
Class CVwEnvironment is the environment used by VW.
Definition: VwEnvironment.h:41
virtual void add_to_dense_vec(float32_t alpha, VwExample *&ex, float32_t *vec2, int32_t vec2_len, bool abs_val=false)
VwExample * current_example
Example currently being processed.
virtual bool is_seekable()
Definition: StreamingFile.h:64
virtual int32_t get_num_vectors() const
bool has_labels
Whether examples are labelled or not.
vw_size_t num_features
Number of features.
Definition: vw_example.h:89
int64_t example_number
Example number.
float32_t total_sum_feat_sq
Total sum of square of features.
Definition: vw_example.h:106
virtual void get_vector(bool *&vector, int32_t &len)
#define SG_ERROR(...)
Definition: SGIO.h:129
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:139
vw_size_t num_bits
log_2 of the number of features
virtual int32_t get_dim_feature_space() const
int32_t get_num_elements() const
Definition: DynArray.h:130
virtual EFeatureType get_feature_type() const
float64_t sum_feat_sq[256]
Sum of square of features.
Definition: vw_example.h:104
#define SG_REF(x)
Definition: SGObject.h:54
void push(const T &new_elem)
Definition: v_array.h:168
CStreamingFile * working_file
The StreamingFile object to read from.
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
float32_t loss
Loss.
Definition: vw_example.h:95
float32_t label
Label value.
Definition: vw_label.h:92
vw_size_t pass
Pass.
Definition: vw_example.h:91
v_array< vw_size_t > indices
Array of namespaces.
Definition: vw_example.h:84
virtual float32_t dense_dot_truncated(const float32_t *vec2, VwExample *&ex, float32_t gravity)
float64_t current_label
The current example's label.
virtual CVwEnvironment * get_env()
float32_t weight
Weight of example.
Definition: vw_label.h:94
#define ASSERT(x)
Definition: SGIO.h:201
float64_t weighted_labels
Weighted labels.
bool ignore_some
Whether some namespaces are ignored.
double float64_t
Definition: common.h:50
DynArray< char * > pairs
Pairs of features to cross for quadratic updates.
const int32_t constant_hash
Constant used to access the constant feature.
Definition: vw_constants.h:32
vw_size_t stride
Number of elements in weight vector per feature.
virtual int32_t get_nnz_features_for_vector()
vw_size_t example_counter
Example counter.
Definition: vw_example.h:109
float32_t t
Value of t.
virtual float32_t real_weight(float32_t w, float32_t gravity)
Example class for VW.
Definition: vw_example.h:58
Streaming features that support dot products among other operations.
float32_t example_t
t value for this example
Definition: vw_example.h:101
SGSparseVectorEntry< T > * features
vw_size_t mask
Mask used for hashing.
Class StreamingVwCacheFile to read vector-by-vector from VW cache files.
vw_size_t total_features
Total number of features.
float float32_t
Definition: common.h:49
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
Class StreamingVwFile to read vector-by-vector from Vowpal Wabbit data files. It reads the example an...
float32_t global_weight
Global weight.
Definition: vw_example.h:99
virtual void get_vector_and_label(bool *&vector, int32_t &len, float64_t &label)
One feature in VW.
Definition: vw_example.h:34
#define SG_UNREF(x)
Definition: SGObject.h:55
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
int32_t current_length
Number of features in current example.
static T sign(T a)
Definition: Math.h:426
bool seekable
Whether the stream is seekable.
The class Features is the base class of all feature objects.
Definition: Features.h:68
VwLabel * ld
Label object.
Definition: vw_example.h:79
template class SGSparseVector The assumption is that the stored SGSparseVectorEntry* vector is orde...
vw_size_t thread_mask
Mask used by regressor for learning.
virtual float32_t dense_dot(VwExample *&ex, const float32_t *vec2)
vw_size_t passes_complete
Number of passes complete.
bool ignore[256]
Which namespaces to ignore.
vw_size_t example_count
Number of examples processed at a point of time.
CInputParser< VwExample > parser
The parser object, which reads from input and returns parsed example objects.
virtual void expand_if_required(float32_t *&vec, int32_t &len)
float64_t sum_loss
Sum of losses.
v_array< VwFeature > atomics[256]
Array of features.
Definition: vw_example.h:86

SHOGUN Machine Learning Toolbox - Documentation