SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StreamingSparseFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Shashwat Lal Das
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
11 namespace shogun
12 {
13 
// Definition whose only visible statement delegates to init().
// NOTE(review): listing lines 15 and 17 (the signature and possibly one body
// statement) are missing from this extract; presumably this is the default
// constructor — confirm against the upstream file.
14 template <class T>
16 {
18  init();
19 }
20 
// Definition whose parameter list ends with (bool is_labelled, int32_t size)
// and whose visible body forwards file, is_labelled and size to init().
// NOTE(review): listing lines 22, 25 and 27 (start of the signature, and
// possibly an initializer list / body statement) are missing from this
// extract; presumably the file-based constructor — confirm upstream.
21 template <class T>
23  bool is_labelled,
24  int32_t size)
26 {
28  init(file, is_labelled, size);
29 }
30 
// Stops the parser if it is still running.
// NOTE(review): the signature line (listing line 32) is missing from this
// extract; presumably the destructor — confirm upstream.
31 template <class T>
33 {
34  if (parser.is_running())
35  parser.end_parser();
36 }
37 
// Returns current_sgvector.get_feature(index) after asserting that index lies
// in [0, current_num_features).
// NOTE(review): the signature line (listing line 39) is missing from this
// extract.
38 template <class T>
40 {
41  ASSERT(index>=0 && index<current_num_features)
42  return current_sgvector.get_feature(index);
43 }
44 
// Definition with an empty body (a no-op as far as this extract shows).
// NOTE(review): the signature line (listing line 46) is missing from this
// extract, so the function's name and return type cannot be confirmed here.
45 template <class T>
47 {
48 }
49 
// Overwrites current_num_features with num and returns the previous value.
// Asserts that the previous value does not exceed num, i.e. the count is
// never allowed to shrink.
// NOTE(review): the signature line (listing line 51) is missing from this
// extract.
50 template <class T>
52 {
53  int32_t n=current_num_features;
54  ASSERT(n<=num)
55  current_num_features=num;
56  return n;
57 }
58 
// float32_t variant: if get_dim_feature_space() exceeds len, reallocates vec
// to that many elements, zero-fills the newly added tail [len, dim) and sets
// len to the new size. Does nothing otherwise.
// NOTE(review): the signature line (listing line 60) is missing from this
// extract; vec and len are assigned here, so they are presumably passed by
// reference — confirm upstream.
59 template <class T>
61 {
62  int32_t dim = get_dim_feature_space();
63  if (dim > len)
64  {
65  vec = SG_REALLOC(float32_t, vec, len, dim);
66  memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
67  len = dim;
68  }
69 }
70 
// float64_t variant: if get_dim_feature_space() exceeds len, reallocates vec
// to that many elements, zero-fills the newly added tail [len, dim) and sets
// len to the new size. Does nothing otherwise.
// NOTE(review): the signature line (listing line 72) is missing from this
// extract; vec and len are assigned here, so they are presumably passed by
// reference — confirm upstream.
71 template <class T>
73 {
74  int32_t dim = get_dim_feature_space();
75  if (dim > len)
76  {
77  vec = SG_REALLOC(float64_t, vec, len, dim);
78  memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
79  len = dim;
80  }
81 }
82 
// Returns alpha times SGSparseVector<T>::sparse_dot of the two entry arrays
// (avec, alen) and (bvec, blen); returns 0 when either pointer is null.
// NOTE(review): the signature line (listing line 84) is missing from this
// extract.
83 template <class T>
85 {
86  T result=0;
87 
88  // result stays zero when either input vector is missing
89  if (avec && bvec)
90  {
     // Wrap the raw arrays; the third argument is false — presumably this
     // disables ref-counting so the wrappers do not take ownership (confirm
     // against the SGSparseVector constructor).
91  SGSparseVector<T> asv(avec, alen, false);
92  SGSparseVector<T> bsv(bvec, blen, false);
93 
94  result=alpha*SGSparseVector<T>::sparse_dot(asv, bsv);
95  }
96 
97  return result;
98 }
99 
100 template <class T>
101 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
102 {
103  ASSERT(vec)
104  ASSERT(dim>=current_num_features)
105 
106  return current_sgvector.dense_dot(alpha, vec, dim, b);
107 }
108 
// float64_t dense dot product: sums vec2[feat_index]*entry over all entries of
// the current sparse vector and returns the sum (0 if the vector has no
// entry array). vec2 must be non-null, and an error is raised when vec2_len is
// smaller than current_num_features.
// NOTE(review): the signature line (listing line 110) is missing from this
// extract.
109 template <class T>
111 {
112  ASSERT(vec2)
113  if (vec2_len < current_num_features)
114  {
115  SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
116  vec2_len, current_num_features);
117  }
118 
119  int32_t current_length = current_sgvector.num_feat_entries;
120  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
121 
122  float64_t result=0;
123  if (current_vector)
124  {
125  for (int32_t i=0; i<current_length; i++)
126  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
127  }
128 
129  return result;
130 }
131 
// float32_t dense dot product: sums vec2[feat_index]*entry over all entries of
// the current sparse vector and returns the sum (0 if the vector has no
// entry array). vec2 must be non-null, and an error is raised when vec2_len is
// smaller than current_num_features. The accumulator is float32_t.
// NOTE(review): the signature line (listing line 133) is missing from this
// extract.
132 template <class T>
134 {
135  ASSERT(vec2)
136  if (vec2_len < current_num_features)
137  {
138  SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
139  vec2_len, current_num_features);
140  }
141 
142  int32_t current_length = current_sgvector.num_feat_entries;
143  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
144 
145  float32_t result=0;
146  if (current_vector)
147  {
148  for (int32_t i=0; i<current_length; i++)
149  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
150  }
151 
152  return result;
153 }
154 
155 template <class T>
156 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val)
157 {
158  ASSERT(vec2)
159  if (vec2_len < current_num_features)
160  {
161  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
162  vec2_len, current_num_features);
163  }
164 
165  SGSparseVectorEntry<T>* sv=current_sgvector.features;
166  int32_t num_feat=current_sgvector.num_feat_entries;
167 
168  if (sv)
169  {
170  if (abs_val)
171  {
172  for (int32_t i=0; i<num_feat; i++)
173  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
174  }
175  else
176  {
177  for (int32_t i=0; i<num_feat; i++)
178  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
179  }
180  }
181 }
182 
183 template <class T>
184 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
185 {
186  ASSERT(vec2)
187  if (vec2_len < current_num_features)
188  {
189  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
190  vec2_len, current_num_features);
191  }
192 
193  SGSparseVectorEntry<T>* sv=current_sgvector.features;
194  int32_t num_feat=current_sgvector.num_feat_entries;
195 
196  if (sv)
197  {
198  if (abs_val)
199  {
200  for (int32_t i=0; i<num_feat; i++)
201  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
202  }
203  else
204  {
205  for (int32_t i=0; i<num_feat; i++)
206  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
207  }
208  }
209 }
210 
// Returns the number of stored entries (num_feat_entries) of the current
// sparse vector.
// NOTE(review): the signature line (listing line 212) is missing from this
// extract.
211 template <class T>
213 {
214  return current_sgvector.num_feat_entries;
215 }
216 
// Returns the sum of squared entry values of the current sparse vector,
// accumulated in a float32_t. Asserts that the entry array is non-null.
// NOTE(review): the signature line (listing line 218) is missing from this
// extract.
217 template <class T>
219 {
220  int32_t current_length = current_sgvector.num_feat_entries;
221  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
222 
223  ASSERT(current_vector)
224 
225  float32_t sq=0;
226 
227  for (int32_t i=0; i<current_length; i++)
228  sq += current_vector[i].entry * current_vector[i].entry;
229 
230  return sq;
231 }
232 
// Sorts the entries of the current sparse vector via
// get_vector().sort_features(true) and asserts that the features pointer is
// the same before and after the call.
// NOTE(review): the signature line (listing line 234) is missing from this
// extract.
233 template <class T>
235 {
236  SGSparseVectorEntry<T>* old_ptr = current_sgvector.features;
237 
238  // passing true (the earlier comment said "false", contradicting the call)
239  // to disallow reallocation and keep get_vector().features stable —
     // NOTE(review): confirm the flag's meaning in SGSparseVector::sort_features
240  get_vector().sort_features(true);
241 
     // Guard: fail loudly if the entry array was moved after all.
242  ASSERT(old_ptr == current_sgvector.features);
243 }
244 
// Returns a new heap-allocated CStreamingSparseFeatures<T> copy-constructed
// from *this.
// NOTE(review): the signature line (listing line 246) is missing from this
// extract, so the declared return type cannot be confirmed here.
245 template <class T>
247 {
248  return new CStreamingSparseFeatures<T>(*this);
249 }
250 
// Returns 1 when the current sparse vector has a non-null entry array and 0
// otherwise.
// NOTE(review): the signature line (listing line 252) is missing from this
// extract.
251 template <class T>
253 {
254  if (current_sgvector.features)
255  return 1;
256  return 0;
257 }
258 
// Registers CStreamingFile::get_sparse_vector as the parser's read function
// for vectors.
// NOTE(review): the signature line (listing line 259) is missing from this
// extract.
260 {
261  parser.set_read_vector(&CStreamingFile::get_sparse_vector);
262 }
263 
// Calls parser.set_read_vector_and_label.
// NOTE(review): the signature line (listing line 264) and the call's argument
// line (listing line 267) are missing from this extract, so the statement
// below is truncated — restore both from the upstream file.
265 {
266  parser.set_read_vector_and_label
268 }
269 
// Expands to the explicit specialization of
// CStreamingSparseFeatures<sg_type>::get_feature_type() that returns f_type.
#define GET_FEATURE_TYPE(f_type, sg_type)					\
template<>									\
EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() const	\
{										\
	return f_type;								\
}
275 
// One get_feature_type() specialization per element type, generated with
// GET_FEATURE_TYPE; the helper macro is undefined afterwards. Both uint8_t and
// int8_t map to F_BYTE here.
// NOTE(review): listing lines 276-277 and 286-288 are missing from this
// extract, so further specializations presumably exist upstream.
278 GET_FEATURE_TYPE(F_BYTE, uint8_t)
279 GET_FEATURE_TYPE(F_BYTE, int8_t)
280 GET_FEATURE_TYPE(F_SHORT, int16_t)
281 GET_FEATURE_TYPE(F_WORD, uint16_t)
282 GET_FEATURE_TYPE(F_INT, int32_t)
283 GET_FEATURE_TYPE(F_UINT, uint32_t)
284 GET_FEATURE_TYPE(F_LONG, int64_t)
285 GET_FEATURE_TYPE(F_ULONG, uint64_t)
289 #undef GET_FEATURE_TYPE
290 
291 
292 template <class T>
293 void CStreamingSparseFeatures<T>::init()
294 {
295  working_file=NULL;
296  current_vec_index=0;
297  current_num_features=-1;
298 
299  set_generic<T>();
300 }
301 
302 template <class T>
303 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
304  bool is_labelled,
305  int32_t size)
306 {
307  init();
308  has_labels = is_labelled;
309  working_file = file;
310  SG_REF(working_file);
311  parser.init(file, is_labelled, size);
312  parser.set_free_vector_after_release(false);
313 }
314 
// Starts the parser unless it is already running.
// NOTE(review): the signature line (listing line 316) is missing from this
// extract.
315 template <class T>
317 {
318  if (!parser.is_running())
319  parser.start_parser();
320 }
321 
// Unconditionally calls parser.end_parser() (no is_running() check here).
// NOTE(review): the signature line (listing line 323) is missing from this
// extract.
322 template <class T>
324 {
325  parser.end_parser();
326 }
327 
// Fetches the next example from the parser. On success it wraps the returned
// entry array in current_sgvector, raises current_num_features to the largest
// dimension seen so far, increments current_vec_index and returns true.
// Returns false, leaving the current state untouched, when the parser yields
// no example.
// NOTE(review): the signature line (listing line 329) is missing from this
// extract.
328 template <class T>
330 {
331  int32_t current_length = 0;
332  SGSparseVectorEntry<T>* current_vector = NULL;
333 
334  bool ret_value;
     // The call also writes current_label, even for unlabelled streams as far
     // as this code shows.
335  ret_value = (bool) parser.get_next_example(current_vector,
336  current_length,
337  current_label);
338 
339  if (!ret_value)
340  return false;
341 
342  // ref-counting disabled because the parser still owns the memory
343  current_sgvector = SGSparseVector<T>(current_vector, current_length, false);
344 
345  // Update number of features based on highest index
346  int32_t current_dimension = get_vector().get_num_dimensions();
347  current_num_features = CMath::max(current_num_features, current_dimension);
348 
349  current_vec_index++;
350  return true;
351 }
352 
// Returns current_sgvector, the sparse vector of the current example.
// NOTE(review): the signature line (listing line 354) is missing from this
// extract, so whether this returns a copy or a reference cannot be confirmed
// here.
353 template <class T>
355 {
356  return current_sgvector;
357 }
358 
// Returns current_label; asserts that the stream was set up with labels
// (has_labels).
// NOTE(review): the signature line (listing line 360) is missing from this
// extract.
359 template <class T>
361 {
362  ASSERT(has_labels)
363 
364  return current_label;
365 }
366 
// Tells the parser that the current example is finished, via
// parser.finalize_example().
// NOTE(review): the signature line (listing line 368) is missing from this
// extract.
367 template <class T>
369 {
370  parser.finalize_example();
371 }
372 
// Returns current_num_features.
// NOTE(review): the signature line (listing line 374) is missing from this
// extract.
373 template <class T>
375 {
376  return current_num_features;
377 }
378 
// Always returns -1 as far as this extract shows.
// NOTE(review): the signature line (listing line 380) and one body line
// (listing line 382) are missing from this extract; the missing statement may
// report that the operation is unimplemented — confirm upstream.
379 template <class T>
381 {
383  return -1;
384 }
385 
// Returns current_num_features.
// NOTE(review): the signature line (listing line 387) is missing from this
// extract.
386 template <class T>
388 {
389  return current_num_features;
390 }
391 
// Returns the number of stored entries (num_feat_entries) of the current
// sparse vector.
// NOTE(review): the signature line (listing line 393) is missing from this
// extract.
392 template <class T>
394 {
395  return current_sgvector.num_feat_entries;
396 }
397 
// Returns the constant C_STREAMING_SPARSE.
// NOTE(review): the signature line (listing line 399) is missing from this
// extract.
398 template <class T>
400 {
401  return C_STREAMING_SPARSE;
402 }
403 
// Explicit instantiations of CStreamingSparseFeatures for the supported
// element types.
// NOTE(review): listing lines 409, 411 and 413-416 are missing from this
// extract, so further instantiations presumably exist upstream.
404 template class CStreamingSparseFeatures<bool>;
405 template class CStreamingSparseFeatures<char>;
406 template class CStreamingSparseFeatures<int8_t>;
407 template class CStreamingSparseFeatures<uint8_t>;
408 template class CStreamingSparseFeatures<int16_t>;
410 template class CStreamingSparseFeatures<int32_t>;
412 template class CStreamingSparseFeatures<int64_t>;
417 }

SHOGUN Machine Learning Toolbox - Documentation