SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
SparseFeatures.cpp
Go to the documentation of this file.
4 #include <shogun/lib/DataType.h>
6 #include <shogun/io/SGIO.h>
7 
8 #include <string.h>
9 #include <stdlib.h>
10 
11 namespace shogun
12 {
13 
14 template<class ST> CSparseFeatures<ST>::CSparseFeatures(int32_t size)
15 : CDotFeatures(size), num_vectors(0), num_features(0),
16  sparse_feature_matrix(NULL), feature_cache(NULL)
17 {
18  init();
19 }
20 
22  int32_t num_feat, int32_t num_vec, bool copy)
23 : CDotFeatures(0), num_vectors(num_vec), num_features(num_feat),
24  sparse_feature_matrix(NULL), feature_cache(NULL)
25 {
26  init();
27 
29  //SG_MALLOC(SGSparseVector<ST>, num_vec);
30  //for (int32_t i=0; i< num_vec; i++)
31  //{
32  // new (&sparse_feature_matrix[i]) SGSparseVector<ST>();
33  // sparse_feature_matrix[i] = src[i];
34  //}
35 }
36 
38 : CDotFeatures(0), num_vectors(0), num_features(0),
39  sparse_feature_matrix(NULL), feature_cache(NULL)
40 {
41  init();
42 
44 }
45 
47 : CDotFeatures(0), num_vectors(0), num_features(0),
48  sparse_feature_matrix(NULL), feature_cache(NULL)
49 {
50  init();
51 
53 }
54 
// Copy constructor.
//
// The initializer list copies the vector/feature counts and takes over the
// same sparse_feature_matrix and feature_cache pointers as `orig`.
//
// NOTE(review): this listing omits several statements of the body (its own
// numbering jumps 64->68, 69->72 and 75->78). Whatever happens inside the
// per-vector loop and after it is therefore not visible here; presumably the
// matrix is re-allocated and copied element by element — confirm against the
// real source before relying on this.
55 template<class ST> CSparseFeatures<ST>::CSparseFeatures(const CSparseFeatures & orig)
56 : CDotFeatures(orig), num_vectors(orig.num_vectors),
57  num_features(orig.num_features),
58  sparse_feature_matrix(orig.sparse_feature_matrix),
59  feature_cache(orig.feature_cache)
60 {
// Register the members with m_parameters.
61  init();
62 
// Only do per-vector work when the source actually has an in-memory matrix.
63  if (orig.sparse_feature_matrix)
64  {
68  for (int32_t i=0; i< num_vectors; i++)
69  {
72 
73  }
74  }
75 
78 }
79 template<class ST> CSparseFeatures<ST>::CSparseFeatures(CFile* loader)
80 : CDotFeatures(loader), num_vectors(0), num_features(0),
81  sparse_feature_matrix(NULL), feature_cache(NULL)
82 {
83  init();
84 
85  load(loader);
86 }
87 
89 {
90  free_sparse_features();
91 }
93 {
94  for (int32_t i=0; i<num_vectors; i++)
95  (&sparse_feature_matrix[i])->~SGSparseVector();
96 
97  SG_FREE(sparse_feature_matrix);
98  num_vectors=0;
99  num_features=0;
100  remove_all_subsets();
101 }
103 {
104  free_sparse_feature_matrix();
105  delete feature_cache;
106  feature_cache = NULL;
107 }
108 template<class ST> CFeatures* CSparseFeatures<ST>::duplicate() const
109 {
110  return new CSparseFeatures<ST>(*this);
111 }
112 
113 template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index)
114 {
115  ASSERT(index>=0 && index<num_features);
116  ASSERT(num>=0 && num<get_num_vectors());
117 
118  int32_t i;
119  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
120  ST ret = 0 ;
121 
122  if (sv.features)
123  {
124  for (i=0; i<sv.num_feat_entries; i++)
125  if (sv.features[i].feat_index==index)
126  ret+=sv.features[i].entry ;
127  }
128 
129  free_sparse_feature_vector(num);
130 
131  return ret ;
132 }
133 
134 template<class ST> ST* CSparseFeatures<ST>::get_full_feature_vector(int32_t num, int32_t& len)
135 {
136  int32_t i;
137  len=0;
138  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
139  ST* fv=NULL;
140 
141  if (sv.features)
142  {
143  len=num_features;
144  fv=SG_MALLOC(ST, num_features);
145 
146  for (i=0; i<num_features; i++)
147  fv[i]=0;
148 
149  for (i=0; i<sv.num_feat_entries; i++)
150  fv[sv.features[i].feat_index]= sv.features[i].entry;
151  }
152 
153  free_sparse_feature_vector(num);
154 
155  return fv;
156 }
157 
159 {
160  if (num>=get_num_vectors())
161  {
162  SG_ERROR("Index out of bounds (number of vectors %d, you "
163  "requested %d)\n", get_num_vectors(), num);
164  }
165 
166  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
167 
168  SGVector<ST> dense;
169 
170  if (sv.features)
171  {
172  dense=SGVector<ST>(num_features);
173  dense.zero();
174 
175  for (int32_t i=0; i<sv.num_feat_entries; i++)
176  dense.vector[sv.features[i].feat_index]= sv.features[i].entry;
177  }
178 
179  free_sparse_feature_vector(num);
180 
181  return dense;
182 }
183 
184 template<class ST> int32_t CSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num)
185 {
186  SGSparseVector<ST> sv = get_sparse_feature_vector(num);
187  int32_t len=sv.num_feat_entries;
188  free_sparse_feature_vector(num);
189  return len;
190 }
191 
193 {
194  ASSERT(num<get_num_vectors());
195 
196  index_t real_num=m_subset_stack->subset_idx_conversion(num);
197 
198  SGSparseVector<ST> result;
199 
200  if (sparse_feature_matrix)
201  {
202  return sparse_feature_matrix[real_num];
203  }
204  else
205  {
206  if (feature_cache)
207  {
208  result.features=feature_cache->lock_entry(num);
209 
210  if (result.features)
211  return result;
212  else
213  {
214  result.features=feature_cache->set_entry(num);
215  }
216  }
217 
218  //if (!result.features)
219  // result.do_free=true;
220 
221  result.features=compute_sparse_feature_vector(num,
222  result.num_feat_entries, result.features);
223 
224 
225  if (get_num_preprocessors())
226  {
227  int32_t tmp_len=result.num_feat_entries;
228  SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
229  SGSparseVectorEntry<ST>* tmp_feat_after = NULL;
230 
231  for (int32_t i=0; i<get_num_preprocessors(); i++)
232  {
233  //tmp_feat_after=((CSparsePreprocessor<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);
234 
235  if (i!=0) // delete feature vector, except for the the first one, i.e., feat
236  SG_FREE(tmp_feat_before);
237  tmp_feat_before=tmp_feat_after;
238  }
239 
240  memcpy(result.features, tmp_feat_after,
241  sizeof(SGSparseVectorEntry<ST>)*tmp_len);
242 
243  SG_FREE(tmp_feat_after);
244  result.num_feat_entries=tmp_len ;
245  SG_DEBUG( "len: %d len2: %d\n", result.num_feat_entries, num_features);
246  }
247  return result ;
248  }
249 }
250 
251 template<class ST> ST CSparseFeatures<ST>::sparse_dot(ST alpha, SGSparseVectorEntry<ST>* avec, int32_t alen, SGSparseVectorEntry<ST>* bvec, int32_t blen)
252 {
253  ST result=0;
254 
255  //result remains zero when one of the vectors is non existent
256  if (avec && bvec)
257  {
258  if (alen<=blen)
259  {
260  int32_t j=0;
261  for (int32_t i=0; i<alen; i++)
262  {
263  int32_t a_feat_idx=avec[i].feat_index;
264 
265  while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
266  j++;
267 
268  if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
269  {
270  result+= avec[i].entry * bvec[j].entry;
271  j++;
272  }
273  }
274  }
275  else
276  {
277  int32_t j=0;
278  for (int32_t i=0; i<blen; i++)
279  {
280  int32_t b_feat_idx=bvec[i].feat_index;
281 
282  while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
283  j++;
284 
285  if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
286  {
287  result+= bvec[i].entry * avec[j].entry;
288  j++;
289  }
290  }
291  }
292 
293  result*=alpha;
294  }
295 
296  return result;
297 }
298 
299 template<class ST> ST CSparseFeatures<ST>::dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
300 {
301  ASSERT(vec);
302  ASSERT(dim==num_features);
303  ST result=b;
304 
305  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
306 
307  if (sv.features)
308  {
309  for (int32_t i=0; i<sv.num_feat_entries; i++)
310  {
311  result+=alpha*vec[sv.features[i].feat_index]
312  *sv.features[i].entry;
313  }
314  }
315 
316  free_sparse_feature_vector(num);
317  return result;
318 }
319 
320 template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val)
321 {
322  ASSERT(vec);
323  if (dim!=num_features)
324  {
325  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
326  dim, num_features);
327  }
328 
329  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
330 
331  if (sv.features)
332  {
333  if (abs_val)
334  {
335  for (int32_t i=0; i<sv.num_feat_entries; i++)
336  {
337  vec[sv.features[i].feat_index]+=alpha
338  *CMath::abs(sv.features[i].entry);
339  }
340  }
341  else
342  {
343  for (int32_t i=0; i<sv.num_feat_entries; i++)
344  {
345  vec[sv.features[i].feat_index]+=alpha
346  *sv.features[i].entry;
347  }
348  }
349  }
350 
351  free_sparse_feature_vector(num);
352 }
353 
354 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_vector(int32_t num)
355 {
356  if (feature_cache)
357  feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
358 
359  //vec.free_vector();
360 }
361 
362 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
363 {
364  if (m_subset_stack->has_subsets())
365  SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
366 
367  num_feat=num_features;
368  num_vec=num_vectors;
369 
370  return sparse_feature_matrix;
371 }
372 
374 {
375  if (m_subset_stack->has_subsets())
376  SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
377 
378  SGSparseMatrix<ST> sm=SGSparseMatrix<ST>(NULL, 0, 0, false);
379  sm.sparse_matrix=get_sparse_feature_matrix(sm.num_features, sm.num_vectors);
380  return sm;
381 }
382 
384 {
385  int32_t num_feat;
386  int32_t num_vec;
387  SGSparseVector<ST>* s=get_transposed(num_feat, num_vec);
388  //SG_PRINT("num_feat = %d , num_vec = %d \n", num_feat, num_vec);
389  return new CSparseFeatures<ST>(s, num_feat, num_vec);
390 }
391 
// Build the transpose of the feature matrix as a new array of sparse vectors.
//
// num_feat receives the current number of vectors and num_vec the current
// number of features, i.e. the dimensions of the transposed matrix.
// Returns `sfm`, an array of num_vec sparse vectors.
//
// NOTE(review): the statement that declares and allocates `sfm` is missing
// from this listing (its numbering jumps 410->412), so how the array is
// allocated cannot be confirmed from here.
// NOTE(review): vectors are fetched with get_sparse_feature_vector() in both
// passes but no matching free_sparse_feature_vector() call is visible in this
// block — confirm whether that matters when a feature cache is in use.
392 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
393 {
394  num_feat=get_num_vectors();
395  num_vec=num_features;
396  //SG_PRINT("get transposed num_feat = %d , num_vec = %d \n", num_feat, num_vec);
397 
// hist[f] counts how many vectors store an entry for feature f.
398  int32_t* hist=SG_MALLOC(int32_t, num_features);
399  memset(hist, 0, sizeof(int32_t)*num_features);
400 
401  // count the lengths of the future feature vectors
402  for (int32_t v=0; v<num_feat; v++)
403  {
404  SGSparseVector<ST> sv=get_sparse_feature_vector(v);
405 
406  for (int32_t i=0; i<sv.num_feat_entries; i++)
407  hist[sv.features[i].feat_index]++;
408  }
409 
410  // allocate room for future feature vectors
// Each transposed vector v is constructed in place with hist[v] entries.
412  for (int32_t v=0; v<num_vec; v++)
413  new (&sfm[v]) SGSparseVector<ST>(hist[v]);
414 
415  // fill future feature vectors with content
// hist is reset and reused as the per-vector write position.
416  memset(hist,0,sizeof(int32_t)*num_features);
417  for (int32_t v=0; v<num_feat; v++)
418  {
419  SGSparseVector<ST> sv=get_sparse_feature_vector(v);
420 
421  for (int32_t i=0; i<sv.num_feat_entries; i++)
422  {
// Entry (vector v, feature vidx) becomes entry (vector vidx, feature v).
423  int32_t vidx=sv.features[i].feat_index;
424  int32_t fidx=v;
425  sfm[vidx].features[hist[vidx]].feat_index=fidx;
426  sfm[vidx].features[hist[vidx]].entry=sv.features[i].entry;
427  hist[vidx]++;
428  }
429 
430  }
431 
432  SG_FREE(hist);
433  return sfm;
434 }
435 
437 {
438  if (m_subset_stack->has_subsets())
439  SG_ERROR("set_sparse_feature_matrix() not allowed with subset\n");
440 
442  for (int32_t i=0; i<sm.num_vectors; i++)
443  {
444  new (&sparse_matrix[i]) SGSparseVector<ST>();
445  sparse_matrix[i] = sm[i];
446  }
447 
448  sparse_feature_matrix=sparse_matrix;
449  num_features=sm.num_features;
450  num_vectors=sm.num_vectors;
451 }
452 
454 {
455  SGMatrix<ST> full(num_features, get_num_vectors());
456  full.zero();
457 
458  SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
459 
460  for (int32_t v=0; v<full.num_cols; v++)
461  {
462  int32_t idx=m_subset_stack->subset_idx_conversion(v);
463  SGSparseVector<ST> current=sparse_feature_matrix[idx];
464 
465  for (int32_t f=0; f<current.num_feat_entries; f++)
466  {
467  int64_t offs=(idx*num_features)
468  +current.features[f].feat_index;
469 
470  full.matrix[offs]=current.features[f].entry;
471  }
472  }
473 
474  return full;
475 }
476 
478 {
479  remove_all_subsets();
480 
481  ST* src=full.matrix;
482  int32_t num_feat=full.num_rows;
483  int32_t num_vec=full.num_cols;
484 
485  free_sparse_feature_matrix();
486  bool result=true;
487  num_features=num_feat;
488  num_vectors=num_vec;
489 
490  SG_INFO("converting dense feature matrix to sparse one\n");
491  int32_t* num_feat_entries=SG_MALLOC(int, num_vectors);
492 
493  if (num_feat_entries)
494  {
495  int64_t num_total_entries=0;
496 
497  // count nr of non sparse features
498  for (int32_t i=0; i< num_vec; i++)
499  {
500  num_feat_entries[i]=0;
501  for (int32_t j=0; j< num_feat; j++)
502  {
503  if (src[i*((int64_t) num_feat) + j] != 0)
504  num_feat_entries[i]++;
505  }
506  }
507 
508  if (num_vec>0)
509  {
510  sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vec);
511 
512  if (sparse_feature_matrix)
513  {
514  for (int32_t i=0; i< num_vec; i++)
515  {
516  new(&sparse_feature_matrix[i]) SGSparseVector<ST>();
517  sparse_feature_matrix[i] = SGSparseVector<ST>(num_feat_entries[i]);
518  int32_t sparse_feat_idx=0;
519 
520  for (int32_t j=0; j< num_feat; j++)
521  {
522  int64_t pos= i*num_feat + j;
523 
524  if (src[pos] != 0)
525  {
526  sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
527  sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
528  sparse_feat_idx++;
529  num_total_entries++;
530  }
531  }
532  }
533  }
534  else
535  {
536  SG_ERROR( "allocation of sparse feature matrix failed\n");
537  result=false;
538  }
539 
540  SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
541  num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
542  }
543  else
544  {
545  SG_ERROR( "huh ? zero size matrix given ?\n");
546  result=false;
547  }
548  }
549  SG_FREE(num_feat_entries);
550  return result;
551 }
552 
553 template<class ST> bool CSparseFeatures<ST>::apply_preprocessor(bool force_preprocessing)
554 {
555  SG_INFO( "force: %d\n", force_preprocessing);
556 
557  if ( sparse_feature_matrix && get_num_preprocessors() )
558  {
559  for (int32_t i=0; i<get_num_preprocessors(); i++)
560  {
561  if ( (!is_preprocessed(i) || force_preprocessing) )
562  {
563  set_preprocessed(i);
564  SG_INFO( "preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
565  if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL)
566  return false;
567  }
568  return true;
569  }
570  return true;
571  }
572  else
573  {
574  SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
575  return false;
576  }
577 }
578 
579 template<class ST> int32_t CSparseFeatures<ST>::get_size() const
580 {
581  return sizeof(ST);
582 }
583 
585 {
587  ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0);
588 
589  return set_full_feature_matrix(fm);
590 }
591 
592 template<class ST> int32_t CSparseFeatures<ST>::get_num_vectors() const
593 {
594  return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
595 }
596 
597 template<class ST> int32_t CSparseFeatures<ST>::get_num_features()
598 {
599  return num_features;
600 }
601 
602 template<class ST> int32_t CSparseFeatures<ST>::set_num_features(int32_t num)
603 {
604  int32_t n=num_features;
605  ASSERT(n<=num);
606  num_features=num;
607  return num_features;
608 }
609 
611 {
612  return C_SPARSE;
613 }
614 
615 template<class ST> void CSparseFeatures<ST>::free_feature_vector(int32_t num)
616 {
617  if (feature_cache)
618  feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
619 
620  //vec.free_vector();
621 }
622 
624 {
625  int64_t num=0;
626  index_t num_vec=get_num_vectors();
627  for (int32_t i=0; i<num_vec; i++)
628  num+=sparse_feature_matrix[m_subset_stack->subset_idx_conversion(i)].num_feat_entries;
629 
630  return num;
631 }
632 
634 {
635  ASSERT(sq);
636 
637  index_t num_vec=get_num_vectors();
638  for (int32_t i=0; i<num_vec; i++)
639  {
640  sq[i]=0;
641  SGSparseVector<ST> vec=get_sparse_feature_vector(i);
642 
643  for (int32_t j=0; j<vec.num_feat_entries; j++)
644  sq[i]+=vec.features[j].entry*vec.features[j].entry;
645 
646  free_feature_vector(i);
647  }
648 
649  return sq;
650 }
651 
653  CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a,
654  CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
655 {
656  int32_t i,j;
657  ASSERT(lhs);
658  ASSERT(rhs);
659 
662  ASSERT(avec.features);
663  ASSERT(bvec.features);
664 
665  float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
666 
667  if (avec.num_feat_entries<=bvec.num_feat_entries)
668  {
669  j=0;
670  for (i=0; i<avec.num_feat_entries; i++)
671  {
672  int32_t a_feat_idx=avec.features[i].feat_index;
673 
674  while ((j<bvec.num_feat_entries)
675  &&(bvec.features[j].feat_index<a_feat_idx))
676  j++;
677 
678  if ((j<bvec.num_feat_entries)
679  &&(bvec.features[j].feat_index==a_feat_idx))
680  {
681  result-=2*(avec.features[i].entry*bvec.features[j].entry);
682  j++;
683  }
684  }
685  }
686  else
687  {
688  j=0;
689  for (i=0; i<bvec.num_feat_entries; i++)
690  {
691  int32_t b_feat_idx=bvec.features[i].feat_index;
692 
693  while ((j<avec.num_feat_entries)
694  &&(avec.features[j].feat_index<b_feat_idx))
695  j++;
696 
697  if ((j<avec.num_feat_entries)
698  &&(avec.features[j].feat_index==b_feat_idx))
699  {
700  result-=2*(bvec.features[i].entry*avec.features[j].entry);
701  j++;
702  }
703  }
704  }
705 
706  ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(idx_a);
707  ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(idx_b);
708 
709  return CMath::abs(result);
710 }
711 
713  bool do_sort_features)
714 {
715  remove_all_subsets();
716 
717  CRegressionLabels* lab=NULL;
718 
719  size_t blocksize=1024*1024;
720  size_t required_blocksize=blocksize;
721  uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
722  FILE* f=fopen(fname, "ro");
723 
724  if (f)
725  {
726  free_sparse_feature_matrix();
727  num_vectors=0;
728  num_features=0;
729 
730  SG_INFO("counting line numbers in file %s\n", fname);
731  size_t sz=blocksize;
732  size_t block_offs=0;
733  size_t old_block_offs=0;
734  fseek(f, 0, SEEK_END);
735  size_t fsize=ftell(f);
736  rewind(f);
737 
738  while (sz == blocksize)
739  {
740  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
741  for (size_t i=0; i<sz; i++)
742  {
743  block_offs++;
744  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
745  {
746  num_vectors++;
747  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
748  old_block_offs=block_offs;
749  }
750  }
751  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
752  }
753 
754  SG_INFO("found %d feature vectors\n", num_vectors);
755  SG_FREE(dummy);
756  blocksize=required_blocksize;
757  dummy = SG_MALLOC(uint8_t, blocksize+1); //allow setting of '\0' at EOL
758 
759  lab=new CRegressionLabels(num_vectors);
760  sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
761  for (int32_t i=0; i<num_vectors; i++)
762  new (&sparse_feature_matrix[i]) SGSparseVector<ST>();
763  rewind(f);
764  sz=blocksize;
765  int32_t lines=0;
766  while (sz == blocksize)
767  {
768  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
769 
770  size_t old_sz=0;
771  for (size_t i=0; i<sz; i++)
772  {
773  if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
774  {
775  size_t len=i-old_sz+1;
776  uint8_t* data=&dummy[old_sz];
777 
778  for (size_t j=0; j<len; j++)
779  dummy[j]=data[j];
780 
781  sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
782  i=0;
783  old_sz=0;
784  sz+=len;
785  }
786 
787  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
788  {
789 
790  size_t len=i-old_sz;
791  uint8_t* data=&dummy[old_sz];
792 
793  int32_t dims=0;
794  for (size_t j=0; j<len; j++)
795  {
796  if (data[j]==':')
797  dims++;
798  }
799 
800  if (dims<=0)
801  {
802  SG_ERROR("Error in line %d - number of"
803  " dimensions is %d line is %d characters"
804  " long\n line_content:'%.*s'\n", lines,
805  dims, len, len, (const char*) data);
806  }
807 
809  size_t j=0;
810  for (; j<len; j++)
811  {
812  if (data[j]==' ')
813  {
814  data[j]='\0';
815 
816  lab->set_label(lines, atof((const char*) data));
817  break;
818  }
819  }
820 
821  int32_t d=0;
822  j++;
823  uint8_t* start=&data[j];
824  for (; j<len; j++)
825  {
826  if (data[j]==':')
827  {
828  data[j]='\0';
829 
830  feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
831  num_features=CMath::max(num_features, feat[d].feat_index+1);
832 
833  j++;
834  start=&data[j];
835  for (; j<len; j++)
836  {
837  if (data[j]==' ' || data[j]=='\n')
838  {
839  data[j]='\0';
840  feat[d].entry=(ST) atof((const char*) start);
841  d++;
842  break;
843  }
844  }
845 
846  if (j==len)
847  {
848  data[j]='\0';
849  feat[dims-1].entry=(ST) atof((const char*) start);
850  }
851 
852  j++;
853  start=&data[j];
854  }
855  }
856 
857  sparse_feature_matrix[lines].num_feat_entries=dims;
858  sparse_feature_matrix[lines].features=feat;
859 
860  old_sz=i+1;
861  lines++;
862  SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
863  }
864  }
865  }
866  SG_INFO("file successfully read\n");
867  fclose(f);
868  }
869 
870  SG_FREE(dummy);
871 
872  if (do_sort_features)
873  sort_features();
874 
875  return lab;
876 }
877 
// Sort the entries of every in-memory sparse vector by feature index.
//
// Raises an error while a subset is active or when no in-memory matrix is
// available, and asserts that no preprocessors are attached. Each non-empty
// vector gets a new, sorted entry array; the old array is freed.
//
// NOTE(review): the statement that declares and allocates `sf_new` is
// missing from this listing (its numbering jumps 905->908).
878 template<class ST> void CSparseFeatures<ST>::sort_features()
879 {
880  if (m_subset_stack->has_subsets())
881  SG_ERROR("sort_features() not allowed with subset\n");
882 
883  ASSERT(get_num_preprocessors()==0);
884 
885  if (!sparse_feature_matrix)
886  SG_ERROR("Requires sparse feature matrix to be available in-memory\n");
887 
888  for (int32_t i=0; i<num_vectors; i++)
889  {
890  int32_t len=sparse_feature_matrix[i].num_feat_entries;
891 
// Empty vectors need no sorting.
892  if (!len)
893  continue;
894 
895  SGSparseVectorEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
// feat_idx holds the sort keys, orig_idx the original positions that are
// permuted alongside them.
896  int32_t* feat_idx=SG_MALLOC(int32_t, len);
897  int32_t* orig_idx=SG_MALLOC(int32_t, len);
898 
899  for (int j=0; j<len; j++)
900  {
901  feat_idx[j]=sf_orig[j].feat_index;
902  orig_idx[j]=j;
903  }
904 
905  CMath::qsort_index(feat_idx, orig_idx, len);
906 
// Gather the entries in sorted order.
908  for (int j=0; j<len; j++)
909  sf_new[j]=sf_orig[orig_idx[j]];
910 
911  sparse_feature_matrix[i].features=sf_new;
912 
913  // sanity check: indices must be strictly increasing, so a duplicated
// feature index within one vector trips this assertion
914  for (int j=0; j<len-1; j++)
915  ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
916 
917  SG_FREE(orig_idx);
918  SG_FREE(feat_idx);
919  SG_FREE(sf_orig);
920  }
921 }
922 
923 template<class ST> bool CSparseFeatures<ST>::write_svmlight_file(char* fname,
924  CRegressionLabels* label)
925 {
926  if (m_subset_stack->has_subsets())
927  SG_ERROR("write_svmlight_file() not allowed with subset\n");
928 
929  ASSERT(label);
930  int32_t num=label->get_num_labels();
931  ASSERT(num>0);
932  ASSERT(num==num_vectors);
933 
934  FILE* f=fopen(fname, "wb");
935 
936  if (f)
937  {
938  for (int32_t i=0; i<num; i++)
939  {
940  fprintf(f, "%d ", (int32_t) label->get_int_label(i));
941 
942  SGSparseVectorEntry<ST>* vec = sparse_feature_matrix[i].features;
943  int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
944 
945  for (int32_t j=0; j<num_feat; j++)
946  {
947  if (j<num_feat-1)
948  fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
949  else
950  fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
951  }
952  }
953 
954  fclose(f);
955  return true;
956  }
957  return false;
958 }
959 
960 template<class ST> int32_t CSparseFeatures<ST>::get_dim_feature_space() const
961 {
962  return num_features;
963 }
964 
// Dot product between vector vec_idx1 of this object and vector vec_idx2 of
// another feature object.
//
// `df` must be non-NULL and report the same feature type and feature class
// as this object (asserted). Both vectors are fetched, combined with
// sparse_dot() using a scaling factor of 1, and released again.
//
// NOTE(review): the statement that declares `sf` is missing from this
// listing (its numbering jumps 970->972); presumably it is `df` cast to
// CSparseFeatures<ST>* — confirm against the real source.
965 template<class ST> float64_t CSparseFeatures<ST>::dot(int32_t vec_idx1,
966  CDotFeatures* df, int32_t vec_idx2)
967 {
968  ASSERT(df);
969  ASSERT(df->get_feature_type() == get_feature_type());
970  ASSERT(df->get_feature_class() == get_feature_class());
972 
973  SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1);
974  SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2);
975 
976  float64_t result=sparse_dot(1, avec.features, avec.num_feat_entries,
977  bvec.features, bvec.num_feat_entries);
978 
// Release both vectors again, each on the object it was fetched from.
979  free_sparse_feature_vector(vec_idx1);
980  sf->free_sparse_feature_vector(vec_idx2);
981 
982  return result;
983 }
984 template<class ST> float64_t CSparseFeatures<ST>::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
985 {
986  ASSERT(vec2);
987  if (vec2_len!=num_features)
988  {
989  SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
990  vec2_len, num_features);
991  }
992  float64_t result=0;
993 
994  SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1);
995 
996  if (sv.features)
997  {
998  for (int32_t i=0; i<sv.num_feat_entries; i++)
999  result+=vec2[sv.features[i].feat_index]*sv.features[i].entry;
1000  }
1001 
1002  free_sparse_feature_vector(vec_idx1);
1003 
1004  return result;
1005 }
1006 
1007 template<class ST> void* CSparseFeatures<ST>::get_feature_iterator(int32_t vector_index)
1008 {
1009  if (vector_index>=get_num_vectors())
1010  {
1011  SG_ERROR("Index out of bounds (number of vectors %d, you "
1012  "requested %d)\n", get_num_vectors(), vector_index);
1013  }
1014 
1015  if (!sparse_feature_matrix)
1016  SG_ERROR("Requires a in-memory feature matrix\n");
1017 
1018  sparse_feature_iterator* it=SG_MALLOC(sparse_feature_iterator, 1);
1019  it->sv=get_sparse_feature_vector(vector_index);
1020  it->index=0;
1021  it->vector_index=vector_index;
1022 
1023  return it;
1024 }
1025 
1026 template<class ST> bool CSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, void* iterator)
1027 {
1028  sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
1029  if (!it || it->index>=it->sv.num_feat_entries)
1030  return false;
1031 
1032  int32_t i=it->index++;
1033 
1034  index=it->sv.features[i].feat_index;
1035  value=(float64_t) it->sv.features[i].entry;
1036 
1037  return true;
1038 }
1039 
1040 template<class ST> void CSparseFeatures<ST>::free_feature_iterator(void* iterator)
1041 {
1042  if (!iterator)
1043  return;
1044 
1045  sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
1046  free_sparse_feature_vector(it->vector_index);
1047  SG_FREE(it);
1048 }
1049 
1051 {
1052  SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(indices.vlen,
1053  get_dim_feature_space());
1054 
1055  for (index_t i=0; i<indices.vlen; ++i)
1056  {
1057  /* index to copy */
1058  index_t index=indices.vector[i];
1059  index_t real_index=m_subset_stack->subset_idx_conversion(index);
1060 
1061  /* copy sparse vector */
1062  SGSparseVector<ST> current=get_sparse_feature_vector(real_index);
1063  matrix_copy.sparse_matrix[i]=current;
1064 
1065  free_sparse_feature_vector(index);
1066  }
1067 
1068  CFeatures* result=new CSparseFeatures<ST>(matrix_copy);
1069  SG_REF(result);
1070  return result;
1071 }
1072 
1074  int32_t& len, SGSparseVectorEntry<ST>* target)
1075 {
1077 
1078  len=0;
1079  return NULL;
1080 }
1081 
1082 template<class ST> void CSparseFeatures<ST>::init()
1083 {
1084  set_generic<ST>();
1085 
1086  m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
1087  "sparse_feature_matrix",
1088  "Array of sparse vectors.");
1089  m_parameters->add(&num_features, "num_features",
1090  "Total number of features.");
1091 }
1092 
// Generates one explicit specialisation of get_feature_type() per element
// type: CSparseFeatures<sg_type>::get_feature_type() returns f_type.
// Note that uint8_t and int8_t both map to F_BYTE.
//
// NOTE(review): this listing's numbering jumps 1107->1111, so three lines
// between the uint64_t entry and the #undef are not visible here; presumably
// they cover the floating-point types that are instantiated at the end of
// the file — confirm against the real source.
1093 #define GET_FEATURE_TYPE(sg_type, f_type) \
1094 template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() const \
1095 { \
1096  return f_type; \
1097 }
1098 GET_FEATURE_TYPE(bool, F_BOOL)
1099 GET_FEATURE_TYPE(char, F_CHAR)
1100 GET_FEATURE_TYPE(uint8_t, F_BYTE)
1101 GET_FEATURE_TYPE(int8_t, F_BYTE)
1102 GET_FEATURE_TYPE(int16_t, F_SHORT)
1103 GET_FEATURE_TYPE(uint16_t, F_WORD)
1104 GET_FEATURE_TYPE(int32_t, F_INT)
1105 GET_FEATURE_TYPE(uint32_t, F_UINT)
1106 GET_FEATURE_TYPE(int64_t, F_LONG)
1107 GET_FEATURE_TYPE(uint64_t, F_ULONG)
1111 #undef GET_FEATURE_TYPE
1112 
// Generates one explicit specialisation of load(CFile*) per element type.
// `fname` is the name of the CFile member function used to read a sparse
// matrix of type sg_type.
//
// Each generated load():
//   1. removes all subsets,
//   2. invokes SG_SET_LOCALE_C (presumably switches to the "C" locale so
//      that number parsing is locale independent — confirm in the macro),
//   3. asserts that loader is non-NULL,
//   4. lets the loader fill matrix / num_feat / num_vec,
//   5. passes the result to set_sparse_feature_matrix(),
//   6. invokes SG_RESET_LOCALE.
//
// NOTE(review): the ASSERT on loader runs after SG_SET_LOCALE_C; if it
// fires, SG_RESET_LOCALE is not reached — confirm whether that is a concern.
1113 #define LOAD(fname, sg_type) \
1114 template<> void CSparseFeatures<sg_type>::load(CFile* loader) \
1115 { \
1116  remove_all_subsets(); \
1117  SG_SET_LOCALE_C; \
1118  ASSERT(loader); \
1119  SGSparseVector<sg_type>* matrix=NULL; \
1120  int32_t num_feat=0; \
1121  int32_t num_vec=0; \
1122  loader->fname(matrix, num_feat, num_vec); \
1123  set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec)); \
1124  SG_RESET_LOCALE; \
1125 }
1126 LOAD(get_sparse_matrix, bool)
1127 LOAD(get_sparse_matrix, char)
1128 LOAD(get_sparse_matrix, uint8_t)
1129 LOAD(get_int8_sparsematrix, int8_t)
1130 LOAD(get_sparse_matrix, int16_t)
1131 LOAD(get_sparse_matrix, uint16_t)
1132 LOAD(get_sparse_matrix, int32_t)
1133 LOAD(get_uint_sparsematrix, uint32_t)
1134 LOAD(get_long_sparsematrix, int64_t)
1135 LOAD(get_ulong_sparsematrix, uint64_t)
1136 LOAD(get_sparse_matrix, float32_t)
1137 LOAD(get_sparse_matrix, float64_t)
1138 LOAD(get_longreal_sparsematrix, floatmax_t)
1139 #undef LOAD
1140 
// Generates one explicit specialisation of save(CFile*) per element type.
// `fname` is the name of the CFile member function used to write a sparse
// matrix of type sg_type.
//
// Each generated save() raises an error while a subset is active, invokes
// SG_SET_LOCALE_C, asserts that writer is non-NULL, hands
// sparse_feature_matrix together with num_features and num_vectors to the
// writer, and finally invokes SG_RESET_LOCALE.
1141 #define WRITE(fname, sg_type) \
1142 template<> void CSparseFeatures<sg_type>::save(CFile* writer) \
1143 { \
1144  if (m_subset_stack->has_subsets()) \
1145  SG_ERROR("save() not allowed with subset\n"); \
1146  SG_SET_LOCALE_C; \
1147  ASSERT(writer); \
1148  writer->fname(sparse_feature_matrix, num_features, num_vectors); \
1149  SG_RESET_LOCALE; \
1150 }
1151 WRITE(set_sparse_matrix, bool)
1152 WRITE(set_sparse_matrix, char)
1153 WRITE(set_sparse_matrix, uint8_t)
1154 WRITE(set_int8_sparsematrix, int8_t)
1155 WRITE(set_sparse_matrix, int16_t)
1156 WRITE(set_sparse_matrix, uint16_t)
1157 WRITE(set_sparse_matrix, int32_t)
1158 WRITE(set_uint_sparsematrix, uint32_t)
1159 WRITE(set_long_sparsematrix, int64_t)
1160 WRITE(set_ulong_sparsematrix, uint64_t)
1161 WRITE(set_sparse_matrix, float32_t)
1162 WRITE(set_sparse_matrix, float64_t)
1163 WRITE(set_longreal_sparsematrix, floatmax_t)
1164 #undef WRITE
1165 
// Explicit instantiations of CSparseFeatures for every supported element
// type. The member templates are defined in this source file, so these
// lines are what makes their code available to other translation units.
1166 template class CSparseFeatures<bool>;
1167 template class CSparseFeatures<char>;
1168 template class CSparseFeatures<int8_t>;
1169 template class CSparseFeatures<uint8_t>;
1170 template class CSparseFeatures<int16_t>;
1171 template class CSparseFeatures<uint16_t>;
1172 template class CSparseFeatures<int32_t>;
1173 template class CSparseFeatures<uint32_t>;
1174 template class CSparseFeatures<int64_t>;
1175 template class CSparseFeatures<uint64_t>;
1176 template class CSparseFeatures<float32_t>;
1177 template class CSparseFeatures<float64_t>;
1178 template class CSparseFeatures<floatmax_t>;
1179 }

SHOGUN Machine Learning Toolbox - Documentation