00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _SPARSEFEATURES__H__
00014 #define _SPARSEFEATURES__H__
00015
00016 #include <string.h>
00017 #include <stdlib.h>
00018
00019 #include "lib/common.h"
00020 #include "lib/Mathematics.h"
00021 #include "lib/Cache.h"
00022 #include "lib/io.h"
00023 #include "lib/Cache.h"
00024 #include "lib/File.h"
00025 #include "lib/DataType.h"
00026
00027 #include "features/Labels.h"
00028 #include "features/Features.h"
00029 #include "features/DotFeatures.h"
00030 #include "features/SimpleFeatures.h"
00031 #include "preproc/SparsePreProc.h"
00032
00033 namespace shogun
00034 {
00035
00036 class CFile;
00037 class CLabels;
00038 class CFeatures;
00039 class CDotFeatures;
00040 template <class ST> class CSimpleFeatures;
00041 template <class ST> class CSparsePreProc;
00042
00055 template <class ST> class CSparseFeatures : public CDotFeatures
00056 {
/** Register this object's members with the parameter framework
 * (used by Shogun's serialization machinery). */
void init(void) {
	set_generic<ST>();

	// the sparse matrix is serialized together with its vector count
	m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
			"sparse_feature_matrix",
			"Array of sparse vectors.");
	m_parameters->add(&num_features, "num_features",
			"Total number of features.");
}
00066
00067 public:
/** default constructor
 * @param size cache size (number of feature vectors to cache)
 */
CSparseFeatures(int32_t size=0)
: CDotFeatures(size), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{ init(); }

/** construct from an existing array of sparse vectors
 * @param src array of sparse vectors
 * @param num_feat number of features per vector
 * @param num_vec number of vectors
 * @param copy if false, take ownership of src; if true, deep-copy it
 */
CSparseFeatures(TSparse<ST>* src, int32_t num_feat, int32_t num_vec, bool copy=false)
: CDotFeatures(0), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{
	init();

	if (!copy)
		set_sparse_feature_matrix(src, num_feat, num_vec);
	else
	{
		// deep copy: duplicate the vector array and every entry array
		sparse_feature_matrix = new TSparse<ST>[num_vec];
		memcpy(sparse_feature_matrix, src, sizeof(TSparse<ST>)*num_vec);
		for (int32_t i=0; i< num_vec; i++)
		{
			sparse_feature_matrix[i].features = new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
			memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);

		}
	}
	// NOTE(review): num_features/num_vectors are only set on the !copy
	// path (via set_sparse_feature_matrix) — on the copy path they stay
	// 0; TODO confirm whether callers rely on this
}

/** construct a sparse representation from a dense feature matrix
 * @param src dense matrix (column = one vector of num_feat entries)
 * @param num_feat number of features per vector
 * @param num_vec number of vectors
 */
CSparseFeatures(ST* src, int32_t num_feat, int32_t num_vec)
: CDotFeatures(0), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{
	init();

	set_full_feature_matrix(src, num_feat, num_vec);
}
00121
00123 CSparseFeatures(const CSparseFeatures & orig)
00124 : CDotFeatures(orig), num_vectors(orig.num_vectors),
00125 num_features(orig.num_features),
00126 sparse_feature_matrix(orig.sparse_feature_matrix),
00127 feature_cache(orig.feature_cache)
00128 {
00129 init();
00130
00131 if (orig.sparse_feature_matrix)
00132 {
00133 free_sparse_feature_matrix();
00134 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00135 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00136 for (int32_t i=0; i< num_vectors; i++)
00137 {
00138 sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00139 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00140
00141 }
00142 }
00143 }
00144
/** construct by loading features from a file
 * @param loader file to load features from
 */
CSparseFeatures(CFile* loader)
: CDotFeatures(loader), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{
	init();

	load(loader);
}

/** destructor — frees the matrix and the feature cache */
virtual ~CSparseFeatures()
{
	free_sparse_features();
}
00163
/** free the sparse feature matrix (all entry arrays and the vector
 * array) and reset counts to zero */
void free_sparse_feature_matrix()
{
	clean_tsparse(sparse_feature_matrix, num_vectors);
	sparse_feature_matrix = NULL;
	num_vectors=0;
	num_features=0;
}

/** free the sparse feature matrix and the feature cache */
void free_sparse_features()
{
	free_sparse_feature_matrix();
	delete feature_cache;
	feature_cache = NULL;
}

/** duplicate this feature object (deep copy via copy constructor)
 * @return newly allocated copy; caller takes ownership
 */
virtual CFeatures* duplicate() const
{
	return new CSparseFeatures<ST>(*this);
}
00193
/** get a single feature value
 * @param num index of the feature vector
 * @param index feature (dimension) index
 * @return sum of all entries of vector num whose feat_index equals
 *         index (duplicate indices add up)
 */
ST get_feature(int32_t num, int32_t index)
{
	ASSERT(index>=0 && index<num_features) ;
	ASSERT(num>=0 && num<num_vectors) ;

	bool vfree;
	int32_t num_feat;
	int32_t i;
	TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
	ST ret = 0 ;

	if (sv)
	{
		// accumulate over all entries matching the requested index
		for (i=0; i<num_feat; i++)
			if (sv[i].feat_index==index)
				ret += sv[i].entry ;
	}

	free_sparse_feature_vector(sv, num, vfree);

	return ret ;
}


/** densify one feature vector
 * @param num index of the feature vector
 * @param len returns length of the dense vector (num_features)
 * @return newly allocated dense vector (caller frees with delete[]),
 *         or NULL if the sparse vector was unavailable
 */
ST* get_full_feature_vector(int32_t num, int32_t& len)
{
	bool vfree;
	int32_t num_feat;
	int32_t i;
	len=0;
	TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
	ST* fv=NULL;

	if (sv)
	{
		len=num_features;
		fv=new ST[num_features];

		// zero-fill, then scatter the sparse entries
		for (i=0; i<num_features; i++)
			fv[i]=0;

		for (i=0; i<num_feat; i++)
			fv[sv[i].feat_index]= sv[i].entry;
	}

	free_sparse_feature_vector(sv, num, vfree);

	return fv;
}

/** densify one feature vector (pointer-out variant; buffer is
 * malloc'd, so the caller must free() it)
 * @param dst returns the dense vector
 * @param len returns its length
 * @param num index of the feature vector
 */
void get_full_feature_vector(ST** dst, int32_t* len, int32_t num)
{
	if (num>=num_vectors)
	{
		SG_ERROR("Index out of bounds (number of vectors %d, you "
				"requested %d)\n", num_vectors, num);
	}

	bool vfree;
	int32_t num_feat=0;
	*len=0;
	TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);

	if (sv)
	{
		*len=num_features;
		*dst= (ST*) malloc(sizeof(ST)*num_features);
		memset(*dst, 0, sizeof(ST)*num_features);

		for (int32_t i=0; i<num_feat; i++)
			(*dst)[sv[i].feat_index]= sv[i].entry;
	}

	free_sparse_feature_vector(sv, num, vfree);
}


/** get number of non-zero entries in one feature vector
 * @param num index of the feature vector
 * @return number of stored (non-zero) entries
 */
virtual inline int32_t get_nnz_features_for_vector(int32_t num)
{
	bool vfree;
	int32_t len;
	TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
	free_sparse_feature_vector(sv, num, vfree);
	return len;
}
00305
/** get the sparse feature vector for sample num.
 *
 * If an in-memory matrix exists, a pointer into it is returned and
 * vfree is false (do NOT delete it). Otherwise the vector is fetched
 * from the cache or computed on the fly; vfree tells the caller
 * (via free_sparse_feature_vector) whether it must be deleted.
 *
 * @param num index of the feature vector
 * @param len returns number of entries
 * @param vfree returns whether the caller owns the returned array
 * @return pointer to the entries of vector num
 */
TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree)
{
	ASSERT(num<num_vectors);

	if (sparse_feature_matrix)
	{
		len= sparse_feature_matrix[num].num_feat_entries;
		vfree=false ;
		return sparse_feature_matrix[num].features;
	}
	else
	{
		TSparseEntry<ST>* feat=NULL;
		vfree=false;

		if (feature_cache)
		{
			feat=feature_cache->lock_entry(num);

			if (feat)
				return feat;
			else
			{
				// cache miss: reserve a slot to compute into
				feat=feature_cache->set_entry(num);
			}
		}

		// no cache slot -> computed vector belongs to the caller
		if (!feat)
			vfree=true;

		feat=compute_sparse_feature_vector(num, len, feat);


		if (get_num_preproc())
		{
			int32_t tmp_len=len;
			TSparseEntry<ST>* tmp_feat_before = feat;
			TSparseEntry<ST>* tmp_feat_after = NULL;

			for (int32_t i=0; i<get_num_preproc(); i++)
			{
				// NOTE(review): the call that applies preprocessor i to
				// tmp_feat_before (assigning tmp_feat_after) is absent
				// here; as written tmp_feat_after stays NULL and the
				// memcpy below would dereference NULL if any
				// preprocessor is attached — TODO confirm against the
				// upstream version of this file
				if (i!=0)
					delete[] tmp_feat_before;
				tmp_feat_before=tmp_feat_after;
			}

			memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
			delete[] tmp_feat_after;
			len=tmp_len ;
			SG_DEBUG( "len: %d len2: %d\n", len, num_features);
		}
		return feat ;
	}
}
00372
00373
00384 static ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen)
00385 {
00386 ST result=0;
00387
00388
00389 if (avec && bvec)
00390 {
00391 if (alen<=blen)
00392 {
00393 int32_t j=0;
00394 for (int32_t i=0; i<alen; i++)
00395 {
00396 int32_t a_feat_idx=avec[i].feat_index;
00397
00398 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00399 j++;
00400
00401 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00402 {
00403 result+= avec[i].entry * bvec[j].entry;
00404 j++;
00405 }
00406 }
00407 }
00408 else
00409 {
00410 int32_t j=0;
00411 for (int32_t i=0; i<blen; i++)
00412 {
00413 int32_t b_feat_idx=bvec[i].feat_index;
00414
00415 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00416 j++;
00417
00418 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00419 {
00420 result+= bvec[i].entry * avec[j].entry;
00421 j++;
00422 }
00423 }
00424 }
00425
00426 result*=alpha;
00427 }
00428
00429 return result;
00430 }
00431
/** compute alpha * (vector[num] . vec) + b against a dense vector
 * @param alpha scaling factor
 * @param num index of the sparse feature vector
 * @param vec dense vector of length dim
 * @param dim length of vec (must equal num_features)
 * @param b bias added to the result
 * @return alpha * dot + b
 */
ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
{
	ASSERT(vec);
	ASSERT(dim==num_features);
	ST result=b;

	bool vfree;
	int32_t num_feat;
	TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);

	if (sv)
	{
		// only non-zero positions contribute
		for (int32_t i=0; i<num_feat; i++)
			result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
	}

	free_sparse_feature_vector(sv, num, vfree);
	return result;
}

/** add a sparse feature vector (scaled by alpha) onto a dense vector
 * @param alpha scaling factor
 * @param num index of the sparse feature vector
 * @param vec dense accumulator of length dim (modified in place)
 * @param dim length of vec (must equal num_features)
 * @param abs_val if true, add absolute values of the entries
 */
void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false)
{
	ASSERT(vec);
	if (dim!=num_features)
	{
		SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
				dim, num_features);
	}

	bool vfree;
	int32_t num_feat;
	TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);

	if (sv)
	{
		if (abs_val)
		{
			for (int32_t i=0; i<num_feat; i++)
				vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
		}
		else
		{
			for (int32_t i=0; i<num_feat; i++)
				vec[sv[i].feat_index]+= alpha*sv[i].entry;
		}
	}

	free_sparse_feature_vector(sv, num, vfree);
}

/** release a vector obtained from get_sparse_feature_vector
 * @param feat_vec the vector to release
 * @param num its index (used to unlock the cache entry)
 * @param free whether the caller owns the array (vfree out-param)
 */
void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
{
	if (feature_cache)
		feature_cache->unlock_entry(num);

	if (free)
		delete[] feat_vec ;
}
00515
/** get the sparse feature matrix (returns the internal pointer — no
 * copy, the object keeps ownership)
 * @param num_feat returns number of features
 * @param num_vec returns number of vectors
 * @return internal array of sparse vectors
 */
TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
{
	num_feat=num_features;
	num_vec=num_vectors;

	return sparse_feature_matrix;
}

/** get the sparse feature matrix (pointer-out variant)
 * @param dst returns the internal matrix pointer (not a copy)
 * @param num_feat returns number of features
 * @param num_vec returns number of vectors
 * @param nnz returns total number of non-zero entries
 */
void get_sparse_feature_matrix(TSparse<ST>** dst, int32_t* num_feat,
		int32_t* num_vec, int64_t* nnz)
{
	*nnz=get_num_nonzero_entries();
	*num_feat=num_features;
	*num_vec=num_vectors;
	*dst=sparse_feature_matrix;
}

/** free a TSparse array together with each vector's entry array
 * @param sfm matrix to free (may be NULL)
 * @param num_vec number of vectors in sfm
 */
void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec)
{
	if (sfm)
	{
		for (int32_t i=0; i<num_vec; i++)
			delete[] sfm[i].features;

		delete[] sfm;
	}
}

/** compute a transposed copy of the feature matrix
 * @return new CSparseFeatures object holding the transpose; the new
 *         object takes ownership of the freshly allocated matrix
 */
CSparseFeatures<ST>* get_transposed()
{
	int32_t num_feat;
	int32_t num_vec;
	TSparse<ST>* s=get_transposed(num_feat, num_vec);
	return new CSparseFeatures<ST>(s, num_feat, num_vec);
}
00575
/** compute the transposed sparse matrix in two passes:
 * pass 1 counts entries per (output) row, pass 2 scatters them.
 * @param num_feat returns number of features of the transpose
 *        (== num_vectors of this object)
 * @param num_vec returns number of vectors of the transpose
 *        (== num_features of this object)
 * @return newly allocated transposed matrix; caller takes ownership
 */
TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
{
	num_feat=num_vectors;
	num_vec=num_features;

	// histogram of entries per feature index
	int32_t* hist=new int32_t[num_features];
	memset(hist, 0, sizeof(int32_t)*num_features);


	for (int32_t v=0; v<num_vectors; v++)
	{
		int32_t vlen;
		bool vfree;
		TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

		for (int32_t i=0; i<vlen; i++)
			hist[sv[i].feat_index]++;

		free_sparse_feature_vector(sv, v, vfree);
	}


	// allocate output rows according to the histogram
	TSparse<ST>* sfm=new TSparse<ST>[num_vec];
	for (int32_t v=0; v<num_vec; v++)
	{
		sfm[v].features= new TSparseEntry<ST>[hist[v]];
		sfm[v].num_feat_entries=hist[v];
		sfm[v].vec_index=v;
	}


	// reuse hist as per-row write cursor while scattering entries
	memset(hist,0,sizeof(int32_t)*num_features);
	for (int32_t v=0; v<num_vectors; v++)
	{
		int32_t vlen;
		bool vfree;
		TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

		for (int32_t i=0; i<vlen; i++)
		{
			int32_t vidx=sv[i].feat_index;
			int32_t fidx=v;
			sfm[vidx].features[hist[vidx]].feat_index=fidx;
			sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
			hist[vidx]++;
		}

		free_sparse_feature_vector(sv, v, vfree);
	}

	delete[] hist;
	return sfm;
}
00638
/** set the sparse feature matrix; takes ownership of src (the previous
 * matrix is freed first)
 * @param src array of sparse vectors
 * @param num_feat number of features
 * @param num_vec number of vectors
 */
virtual void set_sparse_feature_matrix(TSparse<ST>* src, int32_t num_feat, int32_t num_vec)
{
	free_sparse_feature_matrix();

	sparse_feature_matrix=src;
	num_features=num_feat;
	num_vectors=num_vec;
}
00656
00664 ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00665 {
00666 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00667 num_feat=num_features;
00668 num_vec=num_vectors;
00669
00670 ST* fm=new ST[num_feat*num_vec];
00671
00672 if (fm)
00673 {
00674 for (int64_t i=0; i<num_feat*num_vec; i++)
00675 fm[i]=0;
00676
00677 for (int32_t v=0; v<num_vec; v++)
00678 {
00679 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00680 {
00681 int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00682 fm[offs]= sparse_feature_matrix[v].features[f].entry;
00683 }
00684 }
00685 }
00686 else
00687 SG_ERROR( "error allocating memory for dense feature matrix\n");
00688
00689 return fm;
00690 }
00691
00699 void get_full_feature_matrix(ST** dst, int32_t* num_feat, int32_t* num_vec)
00700 {
00701 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00702 *num_feat=num_features;
00703 *num_vec=num_vectors;
00704
00705 *dst= (ST*) malloc(sizeof(ST)*num_features*num_vectors);
00706
00707 if (*dst)
00708 {
00709 for (int64_t i=0; i<num_features*num_vectors; i++)
00710 (*dst)[i]=0;
00711
00712 for (int32_t v=0; v<num_vectors; v++)
00713 {
00714 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00715 {
00716 int64_t offs= (sparse_feature_matrix[v].vec_index * num_features) + sparse_feature_matrix[v].features[f].feat_index;
00717 (*dst)[offs]= sparse_feature_matrix[v].features[f].entry;
00718 }
00719 }
00720 }
00721 else
00722 SG_ERROR( "error allocating memory for dense feature matrix\n");
00723 }
00724
/** build the sparse representation from a dense matrix in two passes:
 * pass 1 counts non-zeros per vector, pass 2 allocates and fills.
 * @param src dense matrix (vector i occupies entries i*num_feat..)
 * @param num_feat number of features per vector
 * @param num_vec number of vectors
 * @return true on success
 */
virtual bool set_full_feature_matrix(ST* src, int32_t num_feat, int32_t num_vec)
{
	free_sparse_feature_matrix();
	bool result=true;
	num_features=num_feat;
	num_vectors=num_vec;

	SG_INFO("converting dense feature matrix to sparse one\n");
	int32_t* num_feat_entries=new int[num_vectors];

	if (num_feat_entries)
	{
		int64_t num_total_entries=0;

		// pass 1: count non-zero entries per vector
		for (int32_t i=0; i< num_vec; i++)
		{
			num_feat_entries[i]=0;
			for (int32_t j=0; j< num_feat; j++)
			{
				if (src[i*((int64_t) num_feat) + j] != 0)
					num_feat_entries[i]++;
			}
		}

		if (num_vec>0)
		{
			sparse_feature_matrix=new TSparse<ST>[num_vec];

			if (sparse_feature_matrix)
			{
				// pass 2: allocate and copy the non-zero entries
				for (int32_t i=0; i< num_vec; i++)
				{
					sparse_feature_matrix[i].vec_index=i;
					sparse_feature_matrix[i].num_feat_entries=0;
					sparse_feature_matrix[i].features= NULL;

					if (num_feat_entries[i]>0)
					{
						sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];

						if (!sparse_feature_matrix[i].features)
						{
							// NOTE(review): this early return leaks
							// num_feat_entries and the partially built
							// matrix; with standard operator new[] this
							// branch is unreachable anyway (new throws
							// instead of returning NULL) — TODO confirm
							SG_INFO( "allocation of features failed\n");
							return false;
						}

						sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
						int32_t sparse_feat_idx=0;

						for (int32_t j=0; j< num_feat; j++)
						{
							// NOTE(review): unlike pass 1, this index is
							// computed in int32 arithmetic (i*num_feat)
							// before widening — overflow for very large
							// matrices; TODO confirm
							int64_t pos= i*num_feat + j;

							if (src[pos] != 0)
							{
								sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
								sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
								sparse_feat_idx++;
								num_total_entries++;
							}
						}
					}
				}
			}
			else
			{
				SG_ERROR( "allocation of sparse feature matrix failed\n");
				result=false;
			}

			SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
					num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
		}
		else
		{
			SG_ERROR( "huh ? zero size matrix given ?\n");
			result=false;
		}
	}
	delete[] num_feat_entries;
	return result;
}
00817
00823 virtual bool apply_preproc(bool force_preprocessing=false)
00824 {
00825 SG_INFO( "force: %d\n", force_preprocessing);
00826
00827 if ( sparse_feature_matrix && get_num_preproc() )
00828 {
00829 for (int32_t i=0; i<get_num_preproc(); i++)
00830 {
00831 if ( (!is_preprocessed(i) || force_preprocessing) )
00832 {
00833 set_preprocessed(i);
00834 SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00835 if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00836 return false;
00837 }
00838 return true;
00839 }
00840 return true;
00841 }
00842 else
00843 {
00844 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00845 return false;
00846 }
00847 }
00848
/** @return size (in bytes) of one feature element */
virtual int32_t get_size() { return sizeof(ST); }

/** build the sparse representation from a dense feature object
 * @param sf dense features to convert
 * @return true on success
 */
bool obtain_from_simple(CSimpleFeatures<ST>* sf)
{
	int32_t num_feat=0;
	int32_t num_vec=0;
	ST* fm=sf->get_feature_matrix(num_feat, num_vec);
	ASSERT(fm && num_feat>0 && num_vec>0);

	return set_full_feature_matrix(fm, num_feat, num_vec);
}

/** @return number of feature vectors */
virtual inline int32_t get_num_vectors() { return num_vectors; }

/** @return dimensionality of the feature space */
inline int32_t get_num_features() { return num_features; }

/** grow the feature space dimensionality; only enlarging is allowed
 * (sparse vectors stay valid since missing indices are implicit zeros)
 * @param num new number of features (must be >= current)
 * @return the new number of features
 */
inline int32_t set_num_features(int32_t num)
{
	int32_t n=num_features;
	ASSERT(n<=num);
	num_features=num;
	return num_features;
}

/** @return feature class C_SPARSE */
inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }

/** @return feature type of ST (specialized per type below) */
inline virtual EFeatureType get_feature_type();
00912
/** release a feature vector obtained from get_sparse_feature_vector
 * (same behavior as free_sparse_feature_vector; kept for the
 * CDotFeatures-style naming)
 * @param feat_vec vector to release
 * @param num its index (used to unlock the cache entry)
 * @param free whether the caller owns the array
 */
void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
{
	if (feature_cache)
		feature_cache->unlock_entry(num);

	if (free)
		delete[] feat_vec ;
}

/** @return total number of stored (non-zero) entries over all vectors;
 * requires the in-memory sparse matrix */
int64_t get_num_nonzero_entries()
{
	int64_t num=0;
	for (int32_t i=0; i<num_vectors; i++)
		num+=sparse_feature_matrix[i].num_feat_entries;

	return num;
}
00940
/** fill sq with the squared Euclidean norm of every feature vector
 * @param sq caller-allocated array of length get_num_vectors()
 * @return sq (for convenience)
 */
float64_t* compute_squared(float64_t* sq)
{
	ASSERT(sq);

	int32_t len=0;
	bool do_free=false;

	for (int32_t i=0; i<this->get_num_vectors(); i++)
	{
		sq[i]=0;
		// NOTE(review): the cast to CSparseFeatures<float64_t> assumes
		// this method is only used with ST==float64_t — TODO confirm
		TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free);

		for (int32_t j=0; j<len; j++)
			sq[i] += vec[j].entry * vec[j].entry;

		((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free);
	}

	return sq;
}
00966
/** compute ||x_a - x_b||^2 between a vector of lhs and a vector of rhs
 * using precomputed squared norms: ||a||^2 + ||b||^2 - 2 a.b.
 * Both vectors must be sorted by feat_index.
 * @param lhs features of the first vector
 * @param sq_lhs precomputed squared norms of lhs (see compute_squared)
 * @param idx_a index into lhs
 * @param rhs features of the second vector
 * @param sq_rhs precomputed squared norms of rhs
 * @param idx_b index into rhs
 * @return squared distance (clamped non-negative via abs)
 */
float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
{
	int32_t i,j;
	int32_t alen, blen;
	bool afree, bfree;
	ASSERT(lhs);
	ASSERT(rhs);

	TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
	TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
	ASSERT(avec);
	ASSERT(bvec);

	float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];

	// merge-join over the shorter vector, subtracting 2*a_i*b_i for
	// every shared feature index
	if (alen<=blen)
	{
		j=0;
		for (i=0; i<alen; i++)
		{
			int32_t a_feat_idx=avec[i].feat_index;

			while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
				j++;

			if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
			{
				result-=2*(avec[i].entry*bvec[j].entry);
				j++;
			}
		}
	}
	else
	{
		j=0;
		for (i=0; i<blen; i++)
		{
			int32_t b_feat_idx=bvec[i].feat_index;

			while ((j<alen) && (avec[j].feat_index<b_feat_idx))
				j++;

			if ((j<alen) && (avec[j].feat_index == b_feat_idx))
			{
				result-=2*(bvec[i].entry*avec[j].entry);
				j++;
			}
		}
	}

	((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree);
	((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree);

	// guard against tiny negative values from floating point round-off
	return CMath::abs(result);
}
01034
/** load features from file (specialized per ST below) */
void load(CFile* loader);

/** save features to file (specialized per ST below) */
void save(CFile* writer);
01046
01054 CLabels* load_svmlight_file(char* fname, bool do_sort_features=true)
01055 {
01056 CLabels* lab=NULL;
01057
01058 size_t blocksize=1024*1024;
01059 size_t required_blocksize=blocksize;
01060 uint8_t* dummy=new uint8_t[blocksize];
01061 FILE* f=fopen(fname, "ro");
01062
01063 if (f)
01064 {
01065 free_sparse_feature_matrix();
01066 num_vectors=0;
01067 num_features=0;
01068
01069 SG_INFO("counting line numbers in file %s\n", fname);
01070 size_t sz=blocksize;
01071 size_t block_offs=0;
01072 size_t old_block_offs=0;
01073 fseek(f, 0, SEEK_END);
01074 size_t fsize=ftell(f);
01075 rewind(f);
01076
01077 while (sz == blocksize)
01078 {
01079 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01080 bool contains_cr=false;
01081 for (size_t i=0; i<sz; i++)
01082 {
01083 block_offs++;
01084 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01085 {
01086 num_vectors++;
01087 contains_cr=true;
01088 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
01089 old_block_offs=block_offs;
01090 }
01091 }
01092 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
01093 }
01094
01095 SG_INFO("found %d feature vectors\n", num_vectors);
01096 delete[] dummy;
01097 blocksize=required_blocksize;
01098 dummy = new uint8_t[blocksize+1];
01099
01100 lab=new CLabels(num_vectors);
01101 sparse_feature_matrix=new TSparse<ST>[num_vectors];
01102
01103 rewind(f);
01104 sz=blocksize;
01105 int32_t lines=0;
01106 while (sz == blocksize)
01107 {
01108 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01109
01110 size_t old_sz=0;
01111 for (size_t i=0; i<sz; i++)
01112 {
01113 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
01114 {
01115 size_t len=i-old_sz+1;
01116 uint8_t* data=&dummy[old_sz];
01117
01118 for (int32_t j=0; j<len; j++)
01119 dummy[j]=data[j];
01120
01121 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
01122 i=0;
01123 old_sz=0;
01124 sz+=len;
01125 }
01126
01127 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01128 {
01129
01130 size_t len=i-old_sz;
01131 uint8_t* data=&dummy[old_sz];
01132
01133 int32_t dims=0;
01134 for (int32_t j=0; j<len; j++)
01135 {
01136 if (data[j]==':')
01137 dims++;
01138 }
01139
01140 if (dims<=0)
01141 {
01142 SG_ERROR("Error in line %d - number of"
01143 " dimensions is %d line is %d characters"
01144 " long\n line_content:'%.*s'\n", lines,
01145 dims, len, len, (const char*) data);
01146 }
01147
01148 TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];
01149 int32_t j=0;
01150 for (; j<len; j++)
01151 {
01152 if (data[j]==' ')
01153 {
01154 data[j]='\0';
01155
01156 lab->set_label(lines, atof((const char*) data));
01157 break;
01158 }
01159 }
01160
01161 int32_t d=0;
01162 j++;
01163 uint8_t* start=&data[j];
01164 for (; j<len; j++)
01165 {
01166 if (data[j]==':')
01167 {
01168 data[j]='\0';
01169
01170 feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
01171 num_features=CMath::max(num_features, feat[d].feat_index+1);
01172
01173 j++;
01174 start=&data[j];
01175 for (; j<len; j++)
01176 {
01177 if (data[j]==' ' || data[j]=='\n')
01178 {
01179 data[j]='\0';
01180 feat[d].entry=(ST) atof((const char*) start);
01181 d++;
01182 break;
01183 }
01184 }
01185
01186 if (j==len)
01187 {
01188 data[j]='\0';
01189 feat[dims-1].entry=(ST) atof((const char*) start);
01190 }
01191
01192 j++;
01193 start=&data[j];
01194 }
01195 }
01196
01197 sparse_feature_matrix[lines].vec_index=lines;
01198 sparse_feature_matrix[lines].num_feat_entries=dims;
01199 sparse_feature_matrix[lines].features=feat;
01200
01201 old_sz=i+1;
01202 lines++;
01203 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01204 }
01205 }
01206 }
01207 SG_INFO("file successfully read\n");
01208 fclose(f);
01209 }
01210
01211 delete[] dummy;
01212
01213 if (do_sort_features)
01214 sort_features();
01215
01216 return lab;
01217 }
01218
/** sort the entries of every feature vector by ascending feat_index
 * (required by sparse_dot / compute_squared_norm). Requires the
 * in-memory matrix and no attached preprocessors. */
void sort_features()
{
	ASSERT(get_num_preproc()==0);

	if (!sparse_feature_matrix)
		SG_ERROR("Requires sparse feature matrix to be available in-memory\n");

	for (int32_t i=0; i<num_vectors; i++)
	{
		int32_t len=sparse_feature_matrix[i].num_feat_entries;

		if (!len)
			continue;

		TSparseEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
		// sort the indices while tracking the permutation in orig_idx
		int32_t* feat_idx=new int32_t[len];
		int32_t* orig_idx=new int32_t[len];

		for (int j=0; j<len; j++)
		{
			feat_idx[j]=sf_orig[j].feat_index;
			orig_idx[j]=j;
		}

		CMath::qsort_index(feat_idx, orig_idx, len);

		// rebuild the entry array in sorted order
		TSparseEntry<ST>* sf_new= new TSparseEntry<ST>[len];
		for (int j=0; j<len; j++)
			sf_new[j]=sf_orig[orig_idx[j]];

		sparse_feature_matrix[i].features=sf_new;


		// sanity check: indices must now be strictly increasing
		for (int j=0; j<len-1; j++)
			ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);

		delete[] orig_idx;
		delete[] feat_idx;
		delete[] sf_orig;
	}
}
01262
/** write features and labels to a file in SVMlight format
 * ("label idx:val idx:val ...\n", indices written 1-based)
 * @param fname output file name
 * @param label labels, one per feature vector
 * @return true on success, false if the file could not be opened
 */
bool write_svmlight_file(char* fname, CLabels* label)
{
	ASSERT(label);
	int32_t num=label->get_num_labels();
	ASSERT(num>0);
	ASSERT(num==num_vectors);

	FILE* f=fopen(fname, "wb");

	if (f)
	{
		for (int32_t i=0; i<num; i++)
		{
			fprintf(f, "%d ", (int32_t) label->get_int_label(i));

			TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
			int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;

			for (int32_t j=0; j<num_feat; j++)
			{
				// last entry ends the line instead of a space
				if (j<num_feat-1)
					fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
				else
					fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
			}
		}

		fclose(f);
		return true;
	}
	return false;
}
01301
/** @return dimensionality of the (dense) feature space */
virtual int32_t get_dim_feature_space()
{
	return num_features;
}

/** dot product between a vector of this object and a vector of
 * another sparse feature object of the same type
 * @param vec_idx1 index into this object
 * @param df the other feature object (must be sparse, same type)
 * @param vec_idx2 index into df
 * @return the dot product
 */
virtual float64_t dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
{
	ASSERT(df);
	ASSERT(df->get_feature_type() == get_feature_type());
	ASSERT(df->get_feature_class() == get_feature_class());
	CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df;

	bool afree, bfree;
	int32_t alen, blen;
	TSparseEntry<ST>* avec=get_sparse_feature_vector(vec_idx1, alen, afree);
	TSparseEntry<ST>* bvec=sf->get_sparse_feature_vector(vec_idx2, blen, bfree);

	float64_t result=sparse_dot(1, avec, alen, bvec, blen);

	free_sparse_feature_vector(avec, vec_idx1, afree);
	sf->free_sparse_feature_vector(bvec, vec_idx2, bfree);

	return result;
}
01340
01347 virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01348 {
01349 ASSERT(vec2);
01350 if (vec2_len!=num_features)
01351 {
01352 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
01353 vec2_len, num_features);
01354 }
01355 float64_t result=0;
01356
01357 bool vfree;
01358 int32_t vlen;
01359 TSparseEntry<ST>* sv=get_sparse_feature_vector(vec_idx1, vlen, vfree);
01360
01361 if (sv)
01362 {
01363 for (int32_t i=0; i<vlen; i++)
01364 result+=vec2[sv[i].feat_index]*sv[i].entry;
01365 }
01366
01367 free_sparse_feature_vector(sv, vec_idx1, vfree);
01368
01369 return result;
01370 }
01371
/** iterator state for get_feature_iterator/get_next_feature */
struct sparse_feature_iterator
{
	// entries of the vector being iterated
	TSparseEntry<ST>* sv;
	// index of that vector
	int32_t vidx;
	// number of entries in sv
	int32_t num_feat_entries;
	// whether sv must be freed when iteration ends
	bool vfree;

	// position of the next entry to return
	int32_t index;

	/** print the iterator state (debugging aid) */
	void print_info()
	{
		SG_SPRINT("sv=%p, vidx=%d, num_feat_entries=%d, index=%d\n",
				sv, vidx, num_feat_entries, index);
	}
};
01394
/** create an iterator over the non-zero entries of one feature vector;
 * release it with free_feature_iterator
 * @param vector_index index of the vector to iterate
 * @return opaque iterator handle
 */
virtual void* get_feature_iterator(int32_t vector_index)
{
	if (vector_index>=num_vectors)
	{
		SG_ERROR("Index out of bounds (number of vectors %d, you "
				"requested %d)\n", num_vectors, vector_index);
	}

	if (!sparse_feature_matrix)
		SG_ERROR("Requires a in-memory feature matrix\n");

	// allocated with new[] to match delete[] in free_feature_iterator
	sparse_feature_iterator* it=new sparse_feature_iterator[1];
	it->sv=get_sparse_feature_vector(vector_index, it->num_feat_entries, it->vfree);
	it->vidx=vector_index;
	it->index=0;

	return it;
}

/** fetch the next non-zero entry from an iterator
 * @param index returns the feature index of the entry
 * @param value returns its value
 * @param iterator handle from get_feature_iterator
 * @return false when the iterator is exhausted (or NULL)
 */
virtual bool get_next_feature(int32_t& index, float64_t& value, void* iterator)
{
	sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
	if (!it || it->index >= it->num_feat_entries)
		return false;

	int32_t i=it->index++;

	index = it->sv[i].feat_index;
	value = (float64_t) it->sv[i].entry;

	return true;
}

/** release an iterator created by get_feature_iterator
 * @param iterator handle to release (NULL is a no-op)
 */
virtual void free_feature_iterator(void* iterator)
{
	if (!iterator)
		return;

	sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
	free_sparse_feature_vector(it->sv, it->vidx, it->vfree);
	delete[] it;
}
01461
/** @return object name */
inline virtual const char* get_name() const { return "SparseFeatures"; }

protected:
/** compute one feature vector on the fly (for vectors not held
 * in-memory); base implementation returns an empty vector — derived
 * classes override this
 * @param num index of the vector to compute
 * @param len returns number of entries
 * @param target optional preallocated storage (e.g. a cache slot)
 * @return the computed entries, or NULL
 */
virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL)
{
	len=0;
	return NULL;
}
01481
protected:

/// number of feature vectors
int32_t num_vectors;

/// dimensionality of the feature space
int32_t num_features;

/// array of sparse vectors (may be NULL when computed on the fly)
TSparse<ST>* sparse_feature_matrix;

/// cache for on-the-fly computed feature vectors (may be NULL)
CCache< TSparseEntry<ST> >* feature_cache;
01495 };
01496
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/* specialize get_feature_type() for every supported element type */
#define GET_FEATURE_TYPE(sg_type, f_type) \
template<> inline EFeatureType CSparseFeatures<sg_type>::get_feature_type() \
{ \
	return f_type; \
}
GET_FEATURE_TYPE(bool, F_BOOL)
GET_FEATURE_TYPE(char, F_CHAR)
GET_FEATURE_TYPE(uint8_t, F_BYTE)
GET_FEATURE_TYPE(int8_t, F_BYTE)
GET_FEATURE_TYPE(int16_t, F_SHORT)
GET_FEATURE_TYPE(uint16_t, F_WORD)
GET_FEATURE_TYPE(int32_t, F_INT)
GET_FEATURE_TYPE(uint32_t, F_UINT)
GET_FEATURE_TYPE(int64_t, F_LONG)
GET_FEATURE_TYPE(uint64_t, F_ULONG)
GET_FEATURE_TYPE(float32_t, F_SHORTREAL)
GET_FEATURE_TYPE(float64_t, F_DREAL)
GET_FEATURE_TYPE(floatmax_t, F_LONGREAL)
#undef GET_FEATURE_TYPE
01517
/* specialize load() per element type: reads a sparse matrix from the
 * CFile using the matching typed reader and installs it */
#define LOAD(fname, sg_type) \
template<> inline void CSparseFeatures<sg_type>::load(CFile* loader) \
{ \
	SG_SET_LOCALE_C; \
	ASSERT(loader); \
	TSparse<sg_type>* matrix=NULL; \
	int32_t num_feat=0; \
	int32_t num_vec=0; \
	loader->fname(matrix, num_feat, num_vec); \
	set_sparse_feature_matrix(matrix, num_feat, num_vec); \
	SG_RESET_LOCALE; \
}
LOAD(get_bool_sparsematrix, bool)
LOAD(get_char_sparsematrix, char)
LOAD(get_byte_sparsematrix, uint8_t)
LOAD(get_int8_sparsematrix, int8_t)
LOAD(get_short_sparsematrix, int16_t)
LOAD(get_word_sparsematrix, uint16_t)
LOAD(get_int_sparsematrix, int32_t)
LOAD(get_uint_sparsematrix, uint32_t)
LOAD(get_long_sparsematrix, int64_t)
LOAD(get_ulong_sparsematrix, uint64_t)
LOAD(get_shortreal_sparsematrix, float32_t)
LOAD(get_real_sparsematrix, float64_t)
LOAD(get_longreal_sparsematrix, floatmax_t)
#undef LOAD
01544
/* specialize save() per element type: writes the in-memory sparse
 * matrix through the matching typed writer on the CFile */
#define WRITE(fname, sg_type) \
template<> inline void CSparseFeatures<sg_type>::save(CFile* writer) \
{ \
	SG_SET_LOCALE_C; \
	ASSERT(writer); \
	writer->fname(sparse_feature_matrix, num_features, num_vectors); \
	SG_RESET_LOCALE; \
}
WRITE(set_bool_sparsematrix, bool)
WRITE(set_char_sparsematrix, char)
WRITE(set_byte_sparsematrix, uint8_t)
WRITE(set_int8_sparsematrix, int8_t)
WRITE(set_short_sparsematrix, int16_t)
WRITE(set_word_sparsematrix, uint16_t)
WRITE(set_int_sparsematrix, int32_t)
WRITE(set_uint_sparsematrix, uint32_t)
WRITE(set_long_sparsematrix, int64_t)
WRITE(set_ulong_sparsematrix, uint64_t)
WRITE(set_shortreal_sparsematrix, float32_t)
WRITE(set_real_sparsematrix, float64_t)
WRITE(set_longreal_sparsematrix, floatmax_t)
#undef WRITE
01567 #endif // DOXYGEN_SHOULD_SKIP_THIS
01568 }
01569 #endif