SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
SparseFeatures.cpp
Go to the documentation of this file.
1 #include <shogun/lib/memory.h>
5 #include <shogun/lib/DataType.h>
7 #include <shogun/io/SGIO.h>
8 
9 #include <string.h>
10 #include <stdlib.h>
11 
12 namespace shogun
13 {
14 
/** Constructor: create empty sparse features, forwarding the cache size
 * (in bytes) to CDotFeatures. No feature cache is allocated here. */
template<class ST> CSparseFeatures<ST>::CSparseFeatures(int32_t size)
: CDotFeatures(size), feature_cache(NULL)
{
	init();
}
20 
22 : CDotFeatures(0), feature_cache(NULL)
23 {
24  init();
25 
27 }
28 
30 : CDotFeatures(0), feature_cache(NULL)
31 {
32  init();
33 
35 }
36 
/** Copy constructor: copies the base, the sparse matrix handle, and the
 * feature cache pointer of orig.
 * NOTE(review): feature_cache is copied as a raw pointer and some body lines
 * are missing from this listing — confirm its refcount is bumped (SG_REF),
 * since the destructor calls SG_UNREF on it. */
template<class ST> CSparseFeatures<ST>::CSparseFeatures(const CSparseFeatures & orig)
: CDotFeatures(orig), sparse_feature_matrix(orig.sparse_feature_matrix),
  feature_cache(orig.feature_cache)
{
	init();

}
/** Constructor: create sparse features and immediately load them from the
 * given file via load(). */
template<class ST> CSparseFeatures<ST>::CSparseFeatures(CFile* loader)
: CDotFeatures(), feature_cache(NULL)
{
	init();

	load(loader);
}
53 
/* Destructor (signature line not visible in this listing): releases our
 * reference on the feature cache, if any. */
{
	SG_UNREF(feature_cache);
}
58 
/** Create a deep copy of this object via the copy constructor.
 * @return newly allocated CSparseFeatures clone (caller owns it) */
template<class ST> CFeatures* CSparseFeatures<ST>::duplicate() const
{
	return new CSparseFeatures<ST>(*this);
}
63 
64 template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index)
65 {
66  REQUIRE(index>=0 && index<get_num_features(),
67  "get_feature(num=%d,index=%d): index exceeds [0;%d]\n",
68  num, index, get_num_features()-1);
69 
70  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
71  ST ret = sv.get_feature(index);
72 
73  free_sparse_feature_vector(num);
74  return ret;
75 }
76 
/* Densify sparse vector num into an SGVector of length get_num_features()
 * (signature line not visible in this listing). */
{
	SGSparseVector<ST> sv=get_sparse_feature_vector(num);
	SGVector<ST> dense = sv.get_dense(get_num_features());
	free_sparse_feature_vector(num);
	return dense;
}
84 
85 template<class ST> int32_t CSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num)
86 {
87  SGSparseVector<ST> sv = get_sparse_feature_vector(num);
88  int32_t len=sv.num_feat_entries;
89  free_sparse_feature_vector(num);
90  return len;
91 }
92 
/* Fetch sparse vector num (signature line not visible in this listing).
 * If an in-memory matrix exists, return the row directly (subset-converted
 * index). Otherwise compute the vector on the fly, optionally caching it and
 * running registered preprocessors over it. */
{
	REQUIRE(num>=0 && num<get_num_vectors(),
		"get_sparse_feature_vector(num=%d): num exceeds [0;%d]\n",
		num, get_num_vectors()-1);
	index_t real_num=m_subset_stack->subset_idx_conversion(num);

	if (sparse_feature_matrix.sparse_matrix)
	{
		// in-memory matrix: row access uses the subset-converted index
		return sparse_feature_matrix[real_num];
	}
	else
	{
		SGSparseVector<ST> result;
		if (feature_cache)
		{
			// NOTE(review): the cache is locked/set with the raw 'num' here,
			// but free_sparse_feature_vector() unlocks with the
			// subset-converted index — confirm these keys agree.
			result.features=feature_cache->lock_entry(num);

			if (result.features)
				return result;
			else
			{
				result.features=feature_cache->set_entry(num);
			}
		}

		//if (!result.features)
		//	result.do_free=true;

		// compute the vector on the fly (subclass hook)
		result.features=compute_sparse_feature_vector(num,
			result.num_feat_entries, result.features);


		if (get_num_preprocessors())
		{
			int32_t tmp_len=result.num_feat_entries;
			SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
			// NOTE(review): the apply call below is commented out, so
			// tmp_feat_after stays NULL and the copy-back never runs —
			// preprocessors are effectively disabled on this path.
			SGSparseVectorEntry<ST>* tmp_feat_after = NULL;

			for (int32_t i=0; i<get_num_preprocessors(); i++)
			{
				//tmp_feat_after=((CSparsePreprocessor<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);

				if (i!=0) // delete feature vector, except for the first one, i.e., feat
					SG_FREE(tmp_feat_before);
				tmp_feat_before=tmp_feat_after;
			}

			if (tmp_feat_after)
			{
				memcpy(result.features, tmp_feat_after,
					sizeof(SGSparseVectorEntry<ST>)*tmp_len);

				SG_FREE(tmp_feat_after);
				result.num_feat_entries=tmp_len;
			}
			SG_DEBUG("len: %d len2: %d\n", result.num_feat_entries, get_num_features())
		}
		return result ;
	}
}
154 
155 template<class ST> ST CSparseFeatures<ST>::dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
156 {
157  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
158  ST result = sv.dense_dot(alpha,vec,dim,b);
159  free_sparse_feature_vector(num);
160  return result;
161 }
162 
163 template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val)
164 {
165  REQUIRE(vec, "add_to_dense_vec(num=%d,dim=%d): vec must not be NULL\n",
166  num, dim);
167  REQUIRE(dim>=get_num_features(),
168  "add_to_dense_vec(num=%d,dim=%d): dim should contain number of features %d\n",
169  num, dim, get_num_features());
170 
171  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
172 
173  if (sv.features)
174  {
175  if (abs_val)
176  {
177  for (int32_t i=0; i<sv.num_feat_entries; i++)
178  {
179  vec[sv.features[i].feat_index]+=alpha
180  *CMath::abs(sv.features[i].entry);
181  }
182  }
183  else
184  {
185  for (int32_t i=0; i<sv.num_feat_entries; i++)
186  {
187  vec[sv.features[i].feat_index]+=alpha
188  *sv.features[i].entry;
189  }
190  }
191  }
192 
193  free_sparse_feature_vector(num);
194 }
195 
/* Specialization stub (the return-type/name line and body line are missing
 * from this listing) — presumably the complex128_t overload that only raises
 * an error; TODO confirm against the full source. */
template<>
	int32_t num, float64_t* vec, int32_t dim, bool abs_val)
{
}
202 
/** Release sparse vector num: unlock the cache entry if a feature cache is
 * in use (no-op otherwise).
 * NOTE(review): unlocks with the subset-converted index while
 * get_sparse_feature_vector() locked with the raw index — confirm the keys
 * agree when a subset is active. */
template<class ST> void CSparseFeatures<ST>::free_sparse_feature_vector(int32_t num)
{
	if (feature_cache)
		feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));

	//vec.free_vector();
}
210 
/* Return the underlying sparse matrix (signature line not visible in this
 * listing); forbidden while a subset is active since the matrix ignores it. */
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("Not allowed with subset\n");

	return sparse_feature_matrix;
}
218 
/* Build a new CSparseFeatures holding the transposed matrix (signature line
 * not visible in this listing); forbidden while a subset is active. */
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("Not allowed with subset\n");

	return new CSparseFeatures<ST>(sparse_feature_matrix.get_transposed());
}
226 
/* Replace the stored sparse matrix (signature line not visible in this
 * listing); forbidden with an active subset. Afterwards each vector is
 * validated against the matrix dimensionality. */
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("Not allowed with subset\n");

	sparse_feature_matrix=sm;

	// TODO: check should be implemented in sparse matrix class
	for (int32_t j=0; j<get_num_vectors(); j++) {
		SGSparseVector<ST> sv=get_sparse_feature_vector(j);
		REQUIRE(get_num_features() >= sv.get_num_dimensions(),
			"sparse_matrix[%d] check failed (matrix features %d >= vector dimension %d)\n",
			j, get_num_features(), sv.get_num_dimensions());
	}
}
242 
/* Densify into a full num_features x num_vectors SGMatrix (signature line not
 * visible in this listing). Respects an active subset via idx conversion. */
{
	SGMatrix<ST> full(get_num_features(), get_num_vectors());
	full.zero();

	SG_INFO("converting sparse features to full feature matrix of %d x %d"
		" entries\n", sparse_feature_matrix.num_vectors, get_num_features())

	for (int32_t v=0; v<full.num_cols; v++)
	{
		int32_t idx=m_subset_stack->subset_idx_conversion(v);
		SGSparseVector<ST> current=sparse_feature_matrix[idx];

		for (int32_t f=0; f<current.num_feat_entries; f++)
		{
			// NOTE(review): v*get_num_features() multiplies two int32_t
			// values before widening to int64_t — can overflow for very
			// large matrices; cast one operand to int64_t first.
			int64_t offs=(v*get_num_features())
				+current.features[f].feat_index;

			full.matrix[offs]=current.features[f].entry;
		}
	}

	return full;
}
267 
/* Free all feature data and drop the cache reference (signature line not
 * visible in this listing). */
{
	free_sparse_feature_matrix();
	SG_UNREF(feature_cache);
}
273 
/* Reset the stored matrix to an empty SGSparseMatrix (signature line not
 * visible in this listing); the old matrix is released via refcounting. */
{
	sparse_feature_matrix=SGSparseMatrix<ST>();
}
278 
/* Rebuild the sparse matrix from a dense SGMatrix (signature line not visible
 * in this listing); any active subsets are removed first. */
{
	remove_all_subsets();
	free_sparse_feature_matrix();
	sparse_feature_matrix.from_dense(full);
}
285 
286 template<class ST> bool CSparseFeatures<ST>::apply_preprocessor(bool force_preprocessing)
287 {
288  SG_INFO("force: %d\n", force_preprocessing)
289 
290  if ( sparse_feature_matrix.sparse_matrix && get_num_preprocessors() )
291  {
292  for (int32_t i=0; i<get_num_preprocessors(); i++)
293  {
294  if ( (!is_preprocessed(i) || force_preprocessing) )
295  {
296  set_preprocessed(i);
297  SG_INFO("preprocessing using preproc %s\n", get_preprocessor(i)->get_name())
298  if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL)
299  return false;
300  }
301  return true;
302  }
303  return true;
304  }
305  else
306  {
307  SG_WARNING("no sparse feature matrix available or features already preprocessed - skipping.\n")
308  return false;
309  }
310 }
311 
/* Constructor from a dense matrix (signature line not visible in this
 * listing): validates the dense input and converts it to sparse storage. */
{
	ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0)
	set_full_feature_matrix(fm);
}
318 
320 {
322 }
323 
324 template<class ST> int32_t CSparseFeatures<ST>::get_num_vectors() const
325 {
326  return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : sparse_feature_matrix.num_vectors;
327 }
328 
/** Dimensionality of the feature space (independent of any subset). */
template<class ST> int32_t CSparseFeatures<ST>::get_num_features() const
{
	return sparse_feature_matrix.num_features;
}
333 
334 template<class ST> int32_t CSparseFeatures<ST>::set_num_features(int32_t num)
335 {
336  int32_t n=get_num_features();
337  ASSERT(n<=num)
338  sparse_feature_matrix.num_features=num;
339  return sparse_feature_matrix.num_features;
340 }
341 
/* Feature class identifier (signature line not visible in this listing):
 * always C_SPARSE for this container. */
{
	return C_SPARSE;
}
346 
/** Unlock the cache entry for vector num, if a feature cache is in use.
 * NOTE(review): byte-for-byte duplicate of free_sparse_feature_vector() —
 * consider forwarding to it. */
template<class ST> void CSparseFeatures<ST>::free_feature_vector(int32_t num)
{
	if (feature_cache)
		feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));

	//vec.free_vector();
}
354 
/* Total count of stored entries over all (subset-visible) vectors
 * (signature line not visible in this listing). */
{
	int64_t num=0;
	index_t num_vec=get_num_vectors();
	for (int32_t i=0; i<num_vec; i++)
		num+=sparse_feature_matrix[m_subset_stack->subset_idx_conversion(i)].num_feat_entries;

	return num;
}
364 
/* Fill sq[i] with the squared L2 norm of each vector i (signature line not
 * visible in this listing); returns the same buffer. */
{
	ASSERT(sq)

	index_t num_vec=get_num_vectors();
	for (int32_t i=0; i<num_vec; i++)
	{
		sq[i]=0;
		SGSparseVector<ST> vec=get_sparse_feature_vector(i);

		// sum of squared stored entries (zeros contribute nothing)
		for (int32_t j=0; j<vec.num_feat_entries; j++)
			sq[i]+=vec.features[j].entry*vec.features[j].entry;

		free_feature_vector(i);
	}

	return sq;
}
383 
385 {
387  return sq;
388 }
389 
/* Squared Euclidean distance |a|^2 + |b|^2 - 2 a.b between vector idx_a of
 * lhs and idx_b of rhs, using precomputed squared norms sq_lhs/sq_rhs.
 * (The leading signature line and the avec/bvec declarations are missing from
 * this listing.) Iterates the shorter vector and merge-scans the longer one,
 * relying on feat_index being sorted ascending. */
	CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a,
	CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
{
	int32_t i,j;
	ASSERT(lhs)
	ASSERT(rhs)

	ASSERT(avec.features)
	ASSERT(bvec.features)

	// start from the sum of squared norms, subtract the cross terms
	float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];

	if (avec.num_feat_entries<=bvec.num_feat_entries)
	{
		j=0;
		for (i=0; i<avec.num_feat_entries; i++)
		{
			int32_t a_feat_idx=avec.features[i].feat_index;

			// advance b's cursor to the first index >= a_feat_idx
			while ((j<bvec.num_feat_entries)
				&&(bvec.features[j].feat_index<a_feat_idx))
				j++;

			if ((j<bvec.num_feat_entries)
				&&(bvec.features[j].feat_index==a_feat_idx))
			{
				result-=2*(avec.features[i].entry*bvec.features[j].entry);
				j++;
			}
		}
	}
	else
	{
		// symmetric case: iterate b, scan a
		j=0;
		for (i=0; i<bvec.num_feat_entries; i++)
		{
			int32_t b_feat_idx=bvec.features[i].feat_index;

			while ((j<avec.num_feat_entries)
				&&(avec.features[j].feat_index<b_feat_idx))
				j++;

			if ((j<avec.num_feat_entries)
				&&(avec.features[j].feat_index==b_feat_idx))
			{
				result-=2*(bvec.features[i].entry*avec.features[j].entry);
				j++;
			}
		}
	}

	((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(idx_a);
	((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(idx_b);

	// clamp tiny negative values caused by floating point cancellation
	return CMath::abs(result);
}
449 
/** Dimensionality of the dot-product feature space; same as get_num_features(). */
template<class ST> int32_t CSparseFeatures<ST>::get_dim_feature_space() const
{
	return get_num_features();
}
454 
/** Sparse-sparse dot product between vector vec_idx1 of this and vec_idx2 of
 * df, which must have the same feature type and class.
 * NOTE(review): the declaration of 'sf' (the cast of df to
 * CSparseFeatures<ST>*) is missing from this listing — confirm in the full
 * source. */
template<class ST> float64_t CSparseFeatures<ST>::dot(int32_t vec_idx1,
	CDotFeatures* df, int32_t vec_idx2)
{
	ASSERT(df)
	ASSERT(df->get_feature_type() == get_feature_type())
	ASSERT(df->get_feature_class() == get_feature_class())

	SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1);
	SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2);

	float64_t result = SGSparseVector<ST>::sparse_dot(avec, bvec);
	free_sparse_feature_vector(vec_idx1);
	sf->free_sparse_feature_vector(vec_idx2);

	return result;
}
472 
/** complex128_t specialization: dot product is not supported for complex
 * features (an error-raising line appears to be missing from this listing);
 * returns 0.0. */
template<> float64_t CSparseFeatures<complex128_t>::dot(int32_t vec_idx1,
	CDotFeatures* df, int32_t vec_idx2)
{
	return 0.0;
}
479 
480 template<class ST> float64_t CSparseFeatures<ST>::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
481 {
482  REQUIRE(vec2, "dense_dot(vec_idx1=%d,vec2_len=%d): vec2 must not be NULL\n",
483  vec_idx1, vec2_len);
484  REQUIRE(vec2_len>=get_num_features(),
485  "dense_dot(vec_idx1=%d,vec2_len=%d): vec2_len should contain number of features %d %d\n",
486  vec_idx1, vec2_len, get_num_features());
487 
488  float64_t result=0;
489  SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1);
490 
491  if (sv.features)
492  {
493  REQUIRE(get_num_features() >= sv.get_num_dimensions(),
494  "sparse_matrix[%d] check failed (matrix features %d >= vector dimension %d)\n",
495  vec_idx1, get_num_features(), sv.get_num_dimensions());
496 
497  REQUIRE(vec2_len >= sv.get_num_dimensions(),
498  "sparse_matrix[%d] check failed (dense vector dimension %d >= vector dimension %d)\n",
499  vec_idx1, vec2_len, sv.get_num_dimensions());
500 
501  for (int32_t i=0; i<sv.num_feat_entries; i++)
502  result+=vec2[sv.features[i].feat_index]*sv.features[i].entry;
503  }
504 
505  free_sparse_feature_vector(vec_idx1);
506 
507  return result;
508 }
509 
/* complex128_t dense_dot specialization (leading signature line and an
 * error-raising line are missing from this listing); returns 0.0. */
	const float64_t* vec2, int32_t vec2_len)
{
	return 0.0;
}
516 
517 template<class ST> void* CSparseFeatures<ST>::get_feature_iterator(int32_t vector_index)
518 {
519  if (vector_index>=get_num_vectors())
520  {
521  SG_ERROR("Index out of bounds (number of vectors %d, you "
522  "requested %d)\n", get_num_vectors(), vector_index);
523  }
524 
525  if (!sparse_feature_matrix.sparse_matrix)
526  SG_ERROR("Requires a in-memory feature matrix\n")
527 
528  sparse_feature_iterator* it=new sparse_feature_iterator();
529  it->sv=get_sparse_feature_vector(vector_index);
530  it->index=0;
531  it->vector_index=vector_index;
532 
533  return it;
534 }
535 
536 template<class ST> bool CSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, void* iterator)
537 {
538  sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
539  if (!it || it->index>=it->sv.num_feat_entries)
540  return false;
541 
542  int32_t i=it->index++;
543 
544  index=it->sv.features[i].feat_index;
545  value=(float64_t) it->sv.features[i].entry;
546 
547  return true;
548 }
549 
/** complex128_t specialization: iteration is not supported for complex
 * features (an error-raising line appears to be missing from this listing);
 * always reports exhaustion. */
template<> bool CSparseFeatures<complex128_t>::get_next_feature(int32_t& index,
	float64_t& value, void* iterator)
{
	return false;
}
556 
557 template<class ST> void CSparseFeatures<ST>::free_feature_iterator(void* iterator)
558 {
559  if (!iterator)
560  return;
561 
562  delete ((sparse_feature_iterator*) iterator);
563 }
564 
/* Copy the vectors named by indices into a fresh CSparseFeatures (signature
 * line not visible in this listing). */
{
	SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(get_dim_feature_space(),
		indices.vlen);

	for (index_t i=0; i<indices.vlen; ++i)
	{
		/* index to copy */
		index_t index=indices.vector[i];
		index_t real_index=m_subset_stack->subset_idx_conversion(index);

		/* copy sparse vector */
		// NOTE(review): get_sparse_feature_vector() converts its argument
		// through the subset stack itself, so passing the already-converted
		// real_index looks like a double conversion — verify against callers.
		SGSparseVector<ST> current=get_sparse_feature_vector(real_index);
		matrix_copy.sparse_matrix[i]=current;

		free_sparse_feature_vector(index);
	}

	CFeatures* result=new CSparseFeatures<ST>(matrix_copy);
	return result;
}
586 
/* Default on-the-fly computation hook (leading signature line missing from
 * this listing): base class computes nothing — reports length 0 and returns
 * NULL; subclasses override to generate vectors lazily. */
	int32_t& len, SGSparseVectorEntry<ST>* target)
{

	len=0;
	return NULL;
}
595 
/** Sort the entries of every vector by feature index (delegates to the matrix). */
template<class ST> void CSparseFeatures<ST>::sort_features()
{
	sparse_feature_matrix.sort_features();
}
600 
/** Common constructor tail: set the generic type tag and register the sparse
 * matrix fields with the parameter framework for serialization. */
template<class ST> void CSparseFeatures<ST>::init()
{
	set_generic<ST>();

	m_parameters->add_vector(&sparse_feature_matrix.sparse_matrix, &sparse_feature_matrix.num_vectors,
		"sparse_feature_matrix",
		"Array of sparse vectors.");
	m_parameters->add(&sparse_feature_matrix.num_features, "sparse_feature_matrix.num_features",
		"Total number of features.");
}
611 
/* Generate get_feature_type() specializations mapping each element type to
 * its EFeatureType tag. (Expansions for bool/char and the floating-point
 * types appear to be elided from this listing.) */
#define GET_FEATURE_TYPE(sg_type, f_type) \
template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() const \
{ \
	return f_type; \
}
GET_FEATURE_TYPE(uint8_t, F_BYTE)
GET_FEATURE_TYPE(int8_t, F_BYTE)
GET_FEATURE_TYPE(int16_t, F_SHORT)
GET_FEATURE_TYPE(uint16_t, F_WORD)
GET_FEATURE_TYPE(int32_t, F_INT)
GET_FEATURE_TYPE(uint32_t, F_UINT)
GET_FEATURE_TYPE(int64_t, F_LONG)
GET_FEATURE_TYPE(uint64_t, F_ULONG)
#undef GET_FEATURE_TYPE
632 
/** Load the sparse matrix from file, dropping any active subsets and the
 * current matrix first. */
template<class ST> void CSparseFeatures<ST>::load(CFile* loader)
{
	remove_all_subsets();
	ASSERT(loader)
	free_sparse_feature_matrix();
	sparse_feature_matrix.load(loader);
}
640 
/* Load matrix plus labels from file (signature line not visible in this
 * listing); clears subsets and the current matrix, returns the labels. */
{
	remove_all_subsets();
	ASSERT(loader)
	free_sparse_feature_matrix();
	return sparse_feature_matrix.load_with_labels(loader);
}
648 
/** Write the sparse matrix to file; forbidden while a subset is active. */
template<class ST> void CSparseFeatures<ST>::save(CFile* writer)
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("Not allowed with subset\n");
	ASSERT(writer)
	sparse_feature_matrix.save(writer);
}
656 
/* Write matrix plus labels to file (signature line not visible in this
 * listing); forbidden while a subset is active. */
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("Not allowed with subset\n");
	ASSERT(writer)
	sparse_feature_matrix.save_with_labels(writer, labels);
}
664 
// Explicit template instantiations for every supported element type.
template class CSparseFeatures<bool>;
template class CSparseFeatures<char>;
template class CSparseFeatures<int8_t>;
template class CSparseFeatures<uint8_t>;
template class CSparseFeatures<int16_t>;
template class CSparseFeatures<uint16_t>;
template class CSparseFeatures<int32_t>;
template class CSparseFeatures<uint32_t>;
template class CSparseFeatures<int64_t>;
template class CSparseFeatures<uint64_t>;
template class CSparseFeatures<float32_t>;
template class CSparseFeatures<float64_t>;
template class CSparseFeatures<floatmax_t>;
template class CSparseFeatures<complex128_t>;
679 }

SHOGUN Machine Learning Toolbox - Documentation