SHOGUN  4.1.0
 全部  命名空间 文件 函数 变量 类型定义 枚举 枚举值 友元 宏定义  
StreamingVwFeatures.cpp
浏览该文件的文档.
1 /*
2  * Copyright (c) 2009 Yahoo! Inc. All rights reserved. The copyrights
3  * embodied in the content of this file are licensed under the BSD
4  * (revised) open source license.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Written (W) 2011 Shashwat Lal Das
12  * Adaptation of Vowpal Wabbit v5.1.
13  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society.
14  */
15 
17 
18 using namespace shogun;
19 
21 {
22  init();
24 }
25 
27  bool is_labelled, int32_t size)
29 {
30  init(file, is_labelled, size);
32 }
33 
35  bool is_labelled, int32_t size)
37 {
38  init(file, is_labelled, size);
40 }
41 
43 {
44  if (parser.is_running())
45  parser.end_parser();
46  SG_UNREF(env);
47 }
48 
50 {
51  return new CStreamingVwFeatures(*this);
52 }
53 
55 {
56  parser.set_read_vector(&CStreamingFile::get_vector);
57 }
58 
60 {
61  parser.set_read_vector_and_label(&CStreamingFile::get_vector_and_label);
62 }
63 
65 {
67  {
69  parser.exit_parser();
70  parser.init(working_file, has_labels, parser.get_ring_size());
71  parser.set_free_vector_after_release(false);
72  parser.start_parser();
73  }
74  else
75  SG_ERROR("The input cannot be reset! Please use 1 pass.\n")
76 }
77 
79 {
80  SG_REF(env);
81  return env;
82 }
83 
85 {
86  env = vw_env;
87  SG_REF(env);
88 }
89 
91 {
92  int32_t dim = 1 << env->num_bits;
93  if (dim > len)
94  {
95  vec = SG_REALLOC(float32_t, vec, len, dim);
96  memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
97  len = dim;
98  }
99 }
100 
102 {
103  int32_t dim = 1 << env->num_bits;
104  if (dim > len)
105  {
106  vec = SG_REALLOC(float64_t, vec, len, dim);
107  memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
108  len = dim;
109  }
110 }
111 
113 {
114  float32_t wprime = 0;
115  if (gravity < fabsf(w))
116  wprime = CMath::sign(w)*(fabsf(w) - gravity);
117  return wprime;
118 }
119 
121 {
122  return current_length;
123 }
124 
126 {
127  if (current_example)
128  return 1;
129  else
130  return 0;
131 }
132 
134 {
135  return F_DREAL;
136 }
137 
138 void CStreamingVwFeatures::init()
139 {
140  working_file=NULL;
141  seekable=false;
142  current_length=-1;
143  current_example=NULL;
144  env=NULL;
145 
146  example_count = 0;
147 }
148 
149 void CStreamingVwFeatures::init(CStreamingVwFile* file, bool is_labelled, int32_t size)
150 {
151  init();
152  has_labels = is_labelled;
153  working_file = file;
154  parser.init(file, is_labelled, size);
155  parser.set_free_vector_after_release(false);
156  seekable=false;
157 
158  // Get environment from the StreamingVwFile
159  env = ((CStreamingVwFile*) file)->get_env();
160  SG_REF(env);
161 }
162 
163 void CStreamingVwFeatures::init(CStreamingVwCacheFile* file, bool is_labelled, int32_t size)
164 {
165  init();
166  has_labels = is_labelled;
167  working_file = file;
168  parser.init(file, is_labelled, size);
169  parser.set_free_vector_after_release(false);
170  seekable=true;
171 
172  // Get environment from the StreamingVwFile
173  env = ((CStreamingVwCacheFile*) file)->get_env();
174  SG_REF(env);
175 }
176 
177 void CStreamingVwFeatures::setup_example(VwExample* ae)
178 {
179  ae->pass = env->passes_complete;
180  ae->num_features = 0;
181  ae->total_sum_feat_sq = 1;
183  ae->global_weight = ae->ld->weight;
184  env->t += ae->global_weight;
185  ae->example_t = env->t;
186 
187  // If some namespaces should be ignored, remove them
188  if (env->ignore_some)
189  {
190  for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
191  if (env->ignore[*i])
192  {
193  ae->atomics[*i].erase();
194  memmove(i,i+1,(ae->indices.end - (i+1))*sizeof(vw_size_t));
195  ae->indices.end--;
196  i--;
197  }
198  }
199 
200  // Add constant feature
201  vw_size_t constant_namespace = 128;
202  VwFeature temp = {1,constant_hash & env->mask};
203  ae->indices.push(constant_namespace);
204  ae->atomics[constant_namespace].push(temp);
205  ae->sum_feat_sq[constant_namespace] = 0;
206 
207  if(env->stride != 1)
208  {
209  // Make room for per-feature information.
210  vw_size_t stride = env->stride;
211  for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
212  for(VwFeature* j = ae->atomics[*i].begin; j != ae->atomics[*i].end; j++)
213  j->weight_index = j->weight_index*stride;
214  }
215 
216  for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
217  {
218  ae->num_features += ae->atomics[*i].end - ae->atomics[*i].begin;
219  ae->total_sum_feat_sq += ae->sum_feat_sq[*i];
220  }
221 
222  // For quadratic features
223  for (int32_t k = 0; k < env->pairs.get_num_elements(); k++)
224  {
225  char* i = env->pairs.get_element(k);
226 
227  ae->num_features
228  += (ae->atomics[(int32_t)(i[0])].end - ae->atomics[(int32_t)(i[0])].begin)
229  *(ae->atomics[(int32_t)(i[1])].end - ae->atomics[(int32_t)(i[1])].begin);
230 
231  ae->total_sum_feat_sq += ae->sum_feat_sq[(int32_t)(i[0])]*ae->sum_feat_sq[(int32_t)(i[1])];
232  }
233 }
234 
236 {
237  if (!parser.is_running())
238  parser.start_parser();
239 }
240 
242 {
243  parser.end_parser();
244 }
245 
247 {
248  bool ret_value;
249  ret_value = (bool) parser.get_next_example(current_example,
251  current_label);
252  if (current_length < 1)
253  return false;
254 
255  if (ret_value)
256  setup_example(current_example);
257  else
258  return false;
259 
262 
263  return ret_value;
264 }
265 
267 {
268  return current_example;
269 }
270 
272 {
274 
275  return current_label;
276 }
277 
279 {
280  env->example_number++;
282 
283  if (current_example->ld->label == FLT_MAX)
284  env->weighted_labels += 0;
285  else
287 
290 
292  parser.finalize_example();
293 }
294 
296 {
297  return current_length;
298 }
299 
301 {
303  return CMath::INFTY;
304 }
305 
307 {
308  float32_t ret = 0.;
309  for (vw_size_t* i = ex->indices.begin; i!= ex->indices.end; i++)
310  {
311  for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
312  ret += vec2[f->weight_index & env->thread_mask] * f->x;
313  }
314  return ret;
315 }
316 
318 {
319  return dense_dot(current_example, vec2);
320 }
321 
323 {
324  float32_t ret = 0.;
325  for (int32_t i = 0; i < vec1->num_feat_entries; i++)
326  ret += vec1->features[i].entry * vec2[vec1->features[i].feat_index & env->mask];
327 
328  return ret;
329 }
330 
332 {
333  float32_t ret = 0.;
334  for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
335  {
336  for (VwFeature* f = ex->atomics[*i].begin; f!= ex->atomics[*i].end; f++)
337  {
338  float32_t w = vec2[f->weight_index & env->thread_mask];
339  float32_t wprime = real_weight(w,gravity);
340  ret += wprime*f->x;
341  }
342  }
343 
344  return ret;
345 }
346 
347 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, VwExample* &ex, float32_t* vec2, int32_t vec2_len, bool abs_val)
348 {
349  if (abs_val)
350  {
351  for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
352  {
353  for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
354  vec2[f->weight_index & env->thread_mask] += alpha * abs(f->x);
355  }
356  }
357  else
358  {
359  for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
360  {
361  for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
362  vec2[f->weight_index & env->thread_mask] += alpha * f->x;
363  }
364  }
365 }
366 
367 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
368 {
369  add_to_dense_vec(alpha, current_example, vec2, vec2_len, abs_val);
370 }
371 
373 {
374  return current_length;
375 }
376 
378 {
379  return C_STREAMING_VW;
380 }
virtual void reset_stream()
Definition: StreamingFile.h:69
uint32_t vw_size_t
vw_size_t typedef to work across platforms
Definition: vw_constants.h:26
virtual void set_env(CVwEnvironment *vw_env)
T get_element(int32_t index) const
Definition: DynArray.h:142
float64_t weighted_examples
Weighted examples.
T * end
Pointer to last set element in the array.
Definition: v_array.h:160
virtual EFeatureClass get_feature_class() const
virtual float32_t dot(CStreamingDotFeatures *df)
T * begin
Pointer to first element of the array.
Definition: v_array.h:157
static const float64_t INFTY
infinity
Definition: Math.h:2048
CVwEnvironment * env
Environment for VW.
Class CVwEnvironment is the environment used by VW.
Definition: VwEnvironment.h:41
virtual void add_to_dense_vec(float32_t alpha, VwExample *&ex, float32_t *vec2, int32_t vec2_len, bool abs_val=false)
VwExample * current_example
Example currently being processed.
virtual bool is_seekable()
Definition: StreamingFile.h:64
virtual int32_t get_num_vectors() const
bool has_labels
Whether examples are labelled or not.
vw_size_t num_features
Number of features.
Definition: vw_example.h:89
int64_t example_number
Example number.
float32_t total_sum_feat_sq
Total sum of square of features.
Definition: vw_example.h:106
virtual void get_vector(bool *&vector, int32_t &len)
#define SG_ERROR(...)
Definition: SGIO.h:129
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:139
vw_size_t num_bits
log_2 of the number of features
virtual int32_t get_dim_feature_space() const
int32_t get_num_elements() const
Definition: DynArray.h:130
virtual EFeatureType get_feature_type() const
float64_t sum_feat_sq[256]
Sum of square of features.
Definition: vw_example.h:104
#define SG_REF(x)
Definition: SGObject.h:51
void push(const T &new_elem)
Definition: v_array.h:168
CStreamingFile * working_file
The StreamingFile object to read from.
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
float32_t loss
Loss.
Definition: vw_example.h:95
float32_t label
Label value.
Definition: vw_label.h:92
vw_size_t pass
Pass.
Definition: vw_example.h:91
v_array< vw_size_t > indices
Array of namespaces.
Definition: vw_example.h:84
virtual float32_t dense_dot_truncated(const float32_t *vec2, VwExample *&ex, float32_t gravity)
float64_t current_label
The current example's label.
virtual CVwEnvironment * get_env()
float32_t weight
Weight of example.
Definition: vw_label.h:94
#define ASSERT(x)
Definition: SGIO.h:201
float64_t weighted_labels
Weighted labels.
bool ignore_some
Whether some namespaces are ignored.
double float64_t
Definition: common.h:50
DynArray< char * > pairs
Pairs of features to cross for quadratic updates.
const int32_t constant_hash
Constant used to access the constant feature.
Definition: vw_constants.h:32
vw_size_t stride
Number of elements in weight vector per feature.
virtual int32_t get_nnz_features_for_vector()
vw_size_t example_counter
Example counter.
Definition: vw_example.h:109
float32_t t
Value of t.
virtual float32_t real_weight(float32_t w, float32_t gravity)
Example class for VW.
Definition: vw_example.h:58
Streaming features that support dot products among other operations.
float32_t example_t
t value for this example
Definition: vw_example.h:101
SGSparseVectorEntry< T > * features
vw_size_t mask
Mask used for hashing.
Class StreamingVwCacheFile to read vector-by-vector from VW cache files.
vw_size_t total_features
Total number of features.
float float32_t
Definition: common.h:49
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
Class StreamingVwFile to read vector-by-vector from Vowpal Wabbit data files. It reads the example an...
float32_t global_weight
Global weight.
Definition: vw_example.h:99
virtual void get_vector_and_label(bool *&vector, int32_t &len, float64_t &label)
One feature in VW.
Definition: vw_example.h:34
#define SG_UNREF(x)
Definition: SGObject.h:52
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
int32_t current_length
Number of features in current example.
static T sign(T a)
Definition: Math.h:426
bool seekable
Whether the stream is seekable.
The class Features is the base class of all feature objects.
Definition: Features.h:68
VwLabel * ld
Label object.
Definition: vw_example.h:79
template class SGSparseVector The assumption is that the stored SGSparseVectorEntry* vector is orde...
vw_size_t thread_mask
Mask used by regressor for learning.
virtual float32_t dense_dot(VwExample *&ex, const float32_t *vec2)
vw_size_t passes_complete
Number of passes complete.
bool ignore[256]
Which namespaces to ignore.
vw_size_t example_count
Number of examples processed at a point of time.
CInputParser< VwExample > parser
The parser object, which reads from input and returns parsed example objects.
virtual void expand_if_required(float32_t *&vec, int32_t &len)
float64_t sum_loss
Sum of losses.
v_array< VwFeature > atomics[256]
Array of features.
Definition: vw_example.h:86

SHOGUN 机器学习工具包 - 项目文档