SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StreamingSparseFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Shashwat Lal Das
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
11 namespace shogun
12 {
13 
// Fragment: the visible body only calls the no-argument init().
// Original lines 15 and 17 (including the signature) are missing from this
// listing; presumably this is the default constructor -- TODO confirm.
14 template <class T>
16 {
18  init();
19 }
20 
// Fragment: forwards (file, is_labelled, size) to the three-argument init().
// Original lines 22, 25 and 27 are missing from this listing, so the start of
// the signature is not visible; presumably the file-based constructor --
// TODO confirm.
21 template <class T>
23  bool is_labelled,
24  int32_t size)
26 {
28  init(file, is_labelled, size);
29 }
30 
// Fragment: calls parser.end_parser().
// The signature (original line 32) is missing from this listing; presumably
// the destructor -- TODO confirm.
31 template <class T>
33 {
34  parser.end_parser();
35 }
36 
// Fragment (signature, original line 38, missing from this listing).
// Looks up the value stored for feature `index` in the current example.
// Asserts 0 <= index < current_num_features, then adds up the `entry` of
// every element of current_vector whose feat_index equals `index`.
// Returns 0 when there is no current vector or no element matches.
37 template <class T>
39 {
40  ASSERT(index>=0 && index<current_num_features);
41 
42  T ret=0;
43 
44  if (current_vector)
45  {
// Linear scan; matching entries are accumulated rather than the scan
// stopping at the first hit.
46  for (int32_t i=0; i<current_length; i++)
47  if (current_vector[i].feat_index==index)
48  ret += current_vector[i].entry;
49  }
50 
51  return ret;
52 }
53 
// Fragment with an empty body; the signature (original line 55) is missing
// from this listing, so which member this is cannot be confirmed here.
54 template <class T>
56 {
57 }
58 
// Fragment (signature, original line 60, missing from this listing).
// Replaces current_num_features with `num` and returns the value it had
// before the call. Asserts that the previous value is not larger than `num`,
// i.e. the count is never lowered through this path.
59 template <class T>
61 {
62  int32_t n=current_num_features;
63  ASSERT(n<=num);
64  current_num_features=num;
65  return n;
66 }
67 
// Fragment (signature, original line 69, missing from this listing).
// Grows a float32_t buffer `vec` of `len` elements when
// get_dim_feature_space() exceeds `len`: reallocates to the new dimension,
// zero-fills only the newly added tail, and updates `len`.
// Nothing happens when the buffer is already large enough.
// NOTE(review): vec and len are assigned here, so they are presumably passed
// by reference -- confirm against the declaration.
68 template <class T>
70 {
71  int32_t dim = get_dim_feature_space();
72  if (dim > len)
73  {
74  vec = SG_REALLOC(float32_t, vec, dim);
75  memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
76  len = dim;
77  }
78 }
79 
// Fragment (signature, original line 81, missing from this listing).
// float64_t counterpart of the buffer-growing helper: when
// get_dim_feature_space() exceeds `len`, reallocates `vec` to that dimension,
// zero-fills the newly added tail, and updates `len`.
// NOTE(review): vec and len are assigned here, so they are presumably passed
// by reference -- confirm against the declaration.
80 template <class T>
82 {
83  int32_t dim = get_dim_feature_space();
84  if (dim > len)
85  {
86  vec = SG_REALLOC(float64_t, vec, dim);
87  memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
88  len = dim;
89  }
90 }
91 
// Fragment (signature, original line 93, missing from this listing).
// Computes alpha times the dot product of two sparse entry arrays
// (avec with alen entries, bvec with blen entries), matching entries by
// feat_index. Returns 0 when either array pointer is null.
//
// The outer loop walks the array with fewer entries while a cursor `j`
// advances through the other one. The cursor only ever moves forward, so the
// result appears to rely on both arrays being ordered by increasing
// feat_index -- TODO confirm that callers guarantee this.
92 template <class T>
94 {
95  T result=0;
96 
97  //result remains zero when one of the vectors is non existent
98  if (avec && bvec)
99  {
100  if (alen<=blen)
101  {
102  int32_t j=0;
103  for (int32_t i=0; i<alen; i++)
104  {
105  int32_t a_feat_idx=avec[i].feat_index;
106 
// Skip entries of bvec whose index is below the one we are looking for.
107  while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
108  j++;
109 
110  if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
111  {
112  result+= avec[i].entry * bvec[j].entry;
113  j++;
114  }
115  }
116  }
117  else
118  {
// Same merge with the roles of avec and bvec swapped.
119  int32_t j=0;
120  for (int32_t i=0; i<blen; i++)
121  {
122  int32_t b_feat_idx=bvec[i].feat_index;
123 
124  while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
125  j++;
126 
127  if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
128  {
129  result+= bvec[i].entry * avec[j].entry;
130  j++;
131  }
132  }
133  }
134 
// Scaling is applied once, after the products have been summed.
135  result*=alpha;
136  }
137 
138  return result;
139 }
140 
141 template <class T>
142 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
143 {
144  ASSERT(vec);
145  ASSERT(dim>=current_num_features);
146  T result=b;
147 
148  int32_t num_feat=current_length;
149  SGSparseVectorEntry<T>* sv=current_vector;
150 
151  if (sv)
152  {
153  for (int32_t i=0; i<num_feat; i++)
154  result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
155  }
156 
157  return result;
158 }
159 
// Fragment (signature, original line 161, missing from this listing).
// Dot product of the current example with the dense array vec2, accumulated
// and returned as float64_t: sum of vec2[feat_index] * entry over all
// entries of current_vector. Returns 0 when there is no current vector.
// Asserts vec2 is non-null and reports via SG_ERROR when vec2_len is smaller
// than current_num_features.
160 template <class T>
162 {
163  ASSERT(vec2);
164  if (vec2_len < current_num_features)
165  {
166  SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
167  vec2_len, current_num_features);
168  }
169 
170  float64_t result=0;
171  if (current_vector)
172  {
173  for (int32_t i=0; i<current_length; i++)
174  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
175  }
176 
177  return result;
178 }
179 
// Fragment (signature, original line 181, missing from this listing).
// Same computation as the float64_t fragment before it, but the sum is
// accumulated and returned as float32_t: sum of vec2[feat_index] * entry over
// the entries of current_vector, 0 when there is no current vector.
// Asserts vec2 is non-null and reports via SG_ERROR when vec2_len is smaller
// than current_num_features.
180 template <class T>
182 {
183  ASSERT(vec2);
184  if (vec2_len < current_num_features)
185  {
186  SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
187  vec2_len, current_num_features);
188  }
189 
190  float32_t result=0;
191  if (current_vector)
192  {
193  for (int32_t i=0; i<current_length; i++)
194  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
195  }
196 
197  return result;
198 }
199 
200 template <class T>
201 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val)
202 {
203  ASSERT(vec2);
204  if (vec2_len < current_num_features)
205  {
206  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
207  vec2_len, current_num_features);
208  }
209 
210  SGSparseVectorEntry<T>* sv=current_vector;
211  int32_t num_feat=current_length;
212 
213  if (sv)
214  {
215  if (abs_val)
216  {
217  for (int32_t i=0; i<num_feat; i++)
218  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
219  }
220  else
221  {
222  for (int32_t i=0; i<num_feat; i++)
223  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
224  }
225  }
226 }
227 
228 template <class T>
229 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
230 {
231  ASSERT(vec2);
232  if (vec2_len < current_num_features)
233  {
234  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
235  vec2_len, current_num_features);
236  }
237 
238  SGSparseVectorEntry<T>* sv=current_vector;
239  int32_t num_feat=current_length;
240 
241  if (sv)
242  {
243  if (abs_val)
244  {
245  for (int32_t i=0; i<num_feat; i++)
246  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
247  }
248  else
249  {
250  for (int32_t i=0; i<num_feat; i++)
251  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
252  }
253  }
254 }
255 
// Fragment (signature, original line 257, missing from this listing).
// Returns current_length, the number of entries in the current example.
256 template <class T>
258 {
259  return current_length;
260 }
261 
// Fragment (signature, original line 263, missing from this listing).
// Returns the sum of squared entry values of the current example.
// Requires a current vector (asserted). The sum is accumulated in a
// float32_t regardless of the element type T.
262 template <class T>
264 {
265  ASSERT(current_vector);
266 
267  float32_t sq=0;
268 
269  for (int32_t i=0; i<current_length; i++)
270  sq += current_vector[i].entry * current_vector[i].entry;
271 
272  return sq;
273 }
274 
// Fragment (signature, original line 276, missing from this listing).
// Reorders the entries of the current example in place so that feat_index is
// strictly increasing. Requires a current vector (asserted).
//
// Approach: copy the feature indices into feat_idx alongside an identity
// permutation orig_idx, sort both together with CMath::qsort_index, gather
// the entries into a temporary array in the permuted order, then copy them
// back over the original storage.
//
// NOTE(review): the declaration/allocation of sf_new (original line 294) is
// absent from this listing; it is used and freed below, so it presumably
// holds `len` entries -- confirm against the real source.
275 template <class T>
277 {
278  ASSERT(current_vector);
279 
280  SGSparseVectorEntry<T>* sf_orig=current_vector;
281  int32_t len=current_length;
282 
283  int32_t* feat_idx=SG_MALLOC(int32_t, len);
284  int32_t* orig_idx=SG_MALLOC(int32_t, len);
285 
286  for (int32_t i=0; i<len; i++)
287  {
288  feat_idx[i]=sf_orig[i].feat_index;
289  orig_idx[i]=i;
290  }
291 
292  CMath::qsort_index(feat_idx, orig_idx, len);
293 
295 
296  for (int32_t i=0; i<len; i++)
297  sf_new[i]=sf_orig[orig_idx[i]];
298 
299  // sanity check
// The comparison is strict, so two entries sharing a feat_index trip the
// assertion.
300  for (int32_t i=0; i<len-1; i++)
301  ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index);
302 
303  // Copy new vector back to original
304  for (int32_t i=0; i<len; i++)
305  sf_orig[i]=sf_new[i];
306 
307  SG_FREE(orig_idx);
308  SG_FREE(feat_idx);
309  SG_FREE(sf_new);
310 }
311 
// Fragment (signature, original line 313, missing from this listing).
// Returns a newly allocated CStreamingSparseFeatures<T> copy-constructed
// from *this.
312 template <class T>
314 {
315  return new CStreamingSparseFeatures<T>(*this);
316 }
317 
// Fragment (signature, original line 319, missing from this listing).
// Returns 1 when a current vector is set and 0 otherwise.
318 template <class T>
320 {
321  if (current_vector)
322  return 1;
323  return 0;
324 }
325 
// Fragment (signature, original line 327, missing from this listing).
// Returns sizeof(T), the size in bytes of one element of the template type.
326 template <class T>
328 {
329  return sizeof(T);
330 }
331 
// Fragment (signature, original line 332, missing from this listing).
// Registers CStreamingFile::get_sparse_vector with the parser as the
// function used to read a vector.
333 {
334  parser.set_read_vector(&CStreamingFile::get_sparse_vector);
335 }
336 
// Fragment (signature, original line 337, missing from this listing).
// Begins a call to parser.set_read_vector_and_label; the argument list
// (original line 340) is absent from this listing, so the statement shown
// here is incomplete.
338 {
339  parser.set_read_vector_and_label
341 }
342 
// Emits the explicit specialisation of get_feature_type() for one element
// type.
//   f_type  - EFeatureType enumerator the specialisation returns
//   sg_type - template argument the specialisation applies to
#define GET_FEATURE_TYPE(f_type, sg_type)	\
template<>	\
EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() const	\
{	\
	return f_type;	\
}
348 
// One get_feature_type() specialisation per element type, generated with
// GET_FEATURE_TYPE; the macro is undefined again afterwards.
// Both uint8_t and int8_t map to F_BYTE.
// Original lines 349-350 and 359-361 are missing from this listing, so this
// list of element types is not complete as shown.
351 GET_FEATURE_TYPE(F_BYTE, uint8_t)
352 GET_FEATURE_TYPE(F_BYTE, int8_t)
353 GET_FEATURE_TYPE(F_SHORT, int16_t)
354 GET_FEATURE_TYPE(F_WORD, uint16_t)
355 GET_FEATURE_TYPE(F_INT, int32_t)
356 GET_FEATURE_TYPE(F_UINT, uint32_t)
357 GET_FEATURE_TYPE(F_LONG, int64_t)
358 GET_FEATURE_TYPE(F_ULONG, uint64_t)
362 #undef GET_FEATURE_TYPE
363 
364 
365 template <class T>
366 void CStreamingSparseFeatures<T>::init()
367 {
368  working_file=NULL;
369  current_vector=NULL;
370  current_length=-1;
371  current_vec_index=0;
372  current_num_features=-1;
373 }
374 
375 template <class T>
376 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
377  bool is_labelled,
378  int32_t size)
379 {
380  init();
381  has_labels = is_labelled;
382  working_file = file;
383  parser.init(file, is_labelled, size);
384 }
385 
// Fragment (signature, original line 387, missing from this listing).
// Starts the parser unless parser.is_running() already reports true.
386 template <class T>
388 {
389  if (!parser.is_running())
390  parser.start_parser();
391 }
392 
// Fragment (signature, original line 394, missing from this listing).
// Forwards to parser.end_parser().
393 template <class T>
395 {
396  parser.end_parser();
397 }
398 
// Fragment (signature, original line 400, missing from this listing).
// Asks the parser for the next example, which fills current_vector,
// current_length and current_label. Returns false if the parser returned a
// false value; otherwise grows current_num_features from the feature indices
// seen, increments current_vec_index and returns true.
399 template <class T>
401 {
402  bool ret_value;
403  ret_value = (bool) parser.get_next_example(current_vector,
404  current_length,
405  current_label);
406 
407  if (!ret_value)
408  return false;
409 
410  // Update number of features based on highest index
// NOTE(review): the test is strictly-greater, so an entry whose feat_index
// equals current_num_features leaves the count unchanged even though that
// index would need a count of feat_index+1 (e.g. indices {0,1} starting from
// -1 end with a count of 1). Looks like `>=` was intended -- confirm.
411  for (int32_t i=0; i<current_length; i++)
412  {
413  if (current_vector[i].feat_index > current_num_features)
414  current_num_features = current_vector[i].feat_index+1;
415  }
416  current_vec_index++;
417 
418  return true;
419 }
420 
// Fragment (signature, original line 422, missing from this listing).
// Points current_sgvector at the current example (entry pointer and entry
// count) and returns current_sgvector. The entry array itself is not copied
// here.
421 template <class T>
423 {
424  current_sgvector.features=current_vector;
425  current_sgvector.num_feat_entries=current_length;
426 
427  return current_sgvector;
428 }
429 
// Fragment (signature, original line 431, missing from this listing).
// Returns current_label; asserts that has_labels is set.
430 template <class T>
432 {
433  ASSERT(has_labels);
434 
435  return current_label;
436 }
437 
// Fragment (signature, original line 439, missing from this listing).
// Forwards to parser.finalize_example().
438 template <class T>
440 {
441  parser.finalize_example();
442 }
443 
// Fragment (signature, original line 445, missing from this listing).
// Returns current_num_features.
444 template <class T>
446 {
447  return current_num_features;
448 }
449 
// Fragment (signature, original line 451, missing from this listing).
// The only visible statement returns -1; original line 453 is also absent
// from this listing, so the body may contain more than is shown here.
450 template <class T>
452 {
454  return -1;
455 }
456 
// Fragment (signature, original line 458, missing from this listing).
// Returns current_num_features.
457 template <class T>
459 {
460  return current_num_features;
461 }
462 
// Fragment (signature, original line 464, missing from this listing).
// Returns current_length, the number of entries in the current example.
463 template <class T>
465 {
466  return current_length;
467 }
468 
// Fragment (signature, original line 470, missing from this listing).
// Returns the constant C_STREAMING_SPARSE.
469 template <class T>
471 {
472  return C_STREAMING_SPARSE;
473 }
474 
// Explicit instantiations of CStreamingSparseFeatures for the supported
// element types. Original lines 480, 482 and 484-487 are missing from this
// listing, so the list shown here is not complete.
475 template class CStreamingSparseFeatures<bool>;
476 template class CStreamingSparseFeatures<char>;
477 template class CStreamingSparseFeatures<int8_t>;
478 template class CStreamingSparseFeatures<uint8_t>;
479 template class CStreamingSparseFeatures<int16_t>;
481 template class CStreamingSparseFeatures<int32_t>;
483 template class CStreamingSparseFeatures<int64_t>;
488 }

SHOGUN Machine Learning Toolbox - Documentation