00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #ifndef _SPARSEFEATURES__H__
00015 #define _SPARSEFEATURES__H__
00016
00017 #include <string.h>
00018 #include <stdlib.h>
00019
00020 #include <shogun/lib/common.h>
00021 #include <shogun/mathematics/Math.h>
00022 #include <shogun/lib/Cache.h>
00023 #include <shogun/io/SGIO.h>
00024 #include <shogun/lib/Cache.h>
00025 #include <shogun/io/File.h>
00026 #include <shogun/lib/DataType.h>
00027
00028 #include <shogun/features/Labels.h>
00029 #include <shogun/features/Features.h>
00030 #include <shogun/features/DotFeatures.h>
00031 #include <shogun/features/SimpleFeatures.h>
00032 #include <shogun/preprocessor/SparsePreprocessor.h>
00033
00034 namespace shogun
00035 {
00036
00037 class CFile;
00038 class CLabels;
00039 class CFeatures;
00040 class CDotFeatures;
00041 template <class ST> class CSimpleFeatures;
00042 template <class ST> class CSparsePreprocessor;
00043
00061 template <class ST> class CSparseFeatures : public CDotFeatures
00062 {
00063 public:
/** Default constructor.
 * @param size cache size in MB handed to CDotFeatures (0 = no cache)
 */
CSparseFeatures(int32_t size=0)
: CDotFeatures(size), num_vectors(0), num_features(0),
  sparse_feature_matrix(NULL), feature_cache(NULL)
{ init(); }
00072
/** Construct from an existing array of sparse vectors.
 *
 * @param src array of num_vec sparse vectors over num_feat dimensions
 * @param num_feat number of features (dimensions)
 * @param num_vec number of vectors
 * @param copy if false, wrap src in an SGSparseMatrix and take it over via
 *             set_sparse_feature_matrix(); if true, deep-copy the vector
 *             array and every per-vector entry list
 */
CSparseFeatures(SGSparseVector<ST>* src, int32_t num_feat, int32_t num_vec, bool copy=false)
: CDotFeatures(0), num_vectors(0), num_features(0),
  sparse_feature_matrix(NULL), feature_cache(NULL)
{
    init();

    if (!copy)
        set_sparse_feature_matrix(SGSparseMatrix<ST>(src, num_feat, num_vec));
    else
    {
        // NOTE(review): this branch never assigns num_features/num_vectors,
        // unlike the !copy path above -- confirm whether that is intended
        sparse_feature_matrix = SG_MALLOC(SGSparseVector<ST>, num_vec);
        memcpy(sparse_feature_matrix, src, sizeof(SGSparseVector<ST>)*num_vec);
        for (int32_t i=0; i< num_vec; i++)
        {
            // shallow memcpy above copied src's entry pointers; replace them
            // with freshly allocated copies so ownership is not shared
            sparse_feature_matrix[i].features = SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries);
            memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
        }
    }
}
00101
/** Construct from a sparse matrix container (takes ownership via
 * set_sparse_feature_matrix()).
 * @param sparse sparse feature matrix
 */
CSparseFeatures(SGSparseMatrix<ST> sparse)
: CDotFeatures(0), num_vectors(0), num_features(0),
  sparse_feature_matrix(NULL), feature_cache(NULL)
{
    init();

    set_sparse_feature_matrix(sparse);
}
00115
/** Construct by sparsifying a dense matrix (zero entries are dropped).
 * @param dense dense feature matrix, one column per vector
 */
CSparseFeatures(SGMatrix<ST> dense)
: CDotFeatures(0), num_vectors(0), num_features(0),
  sparse_feature_matrix(NULL), feature_cache(NULL)
{
    init();

    set_full_feature_matrix(dense);
}
00129
00131 CSparseFeatures(const CSparseFeatures & orig)
00132 : CDotFeatures(orig), num_vectors(orig.num_vectors),
00133 num_features(orig.num_features),
00134 sparse_feature_matrix(orig.sparse_feature_matrix),
00135 feature_cache(orig.feature_cache)
00136 {
00137 init();
00138
00139 if (orig.sparse_feature_matrix)
00140 {
00141 free_sparse_feature_matrix();
00142 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
00143 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(SGSparseVector<ST>)*num_vectors);
00144 for (int32_t i=0; i< num_vectors; i++)
00145 {
00146 sparse_feature_matrix[i].features=SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries);
00147 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00148
00149 }
00150 }
00151
00152 m_subset=orig.m_subset->duplicate();
00153 }
00154
/** Construct by loading features from a file.
 * @param loader file to load features from
 */
CSparseFeatures(CFile* loader)
: CDotFeatures(loader), num_vectors(0), num_features(0),
  sparse_feature_matrix(NULL), feature_cache(NULL)
{
    init();

    load(loader);
}
00167
/** Destructor: frees the sparse matrix and the feature cache. */
virtual ~CSparseFeatures()
{
    free_sparse_features();
}
00173
/** Free the sparse feature matrix and reset the dimension counters.
 * Also removes any active subset, since it indexes the freed data.
 */
void free_sparse_feature_matrix()
{
    clean_tsparse(sparse_feature_matrix, num_vectors);
    sparse_feature_matrix = NULL;
    num_vectors=0;
    num_features=0;
    remove_subset();
}
00186
/** Free the sparse feature matrix and delete the feature cache. */
void free_sparse_features()
{
    free_sparse_feature_matrix();
    delete feature_cache;
    feature_cache = NULL;
}
00197
/** Duplicate this feature object via the copy constructor.
 * @return newly allocated copy; caller takes ownership
 */
virtual CFeatures* duplicate() const
{
    return new CSparseFeatures<ST>(*this);
}
00206
00216 ST get_feature(int32_t num, int32_t index)
00217 {
00218 ASSERT(index>=0 && index<num_features) ;
00219 ASSERT(num>=0 && num<get_num_vectors()) ;
00220
00221 int32_t i;
00222 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00223 ST ret = 0 ;
00224
00225 if (sv.features)
00226 {
00227 for (i=0; i<sv.num_feat_entries; i++)
00228 if (sv.features[i].feat_index==index)
00229 ret+=sv.features[i].entry ;
00230 }
00231
00232 free_sparse_feature_vector(sv, num);
00233
00234 return ret ;
00235 }
00236
00237
00246 ST* get_full_feature_vector(int32_t num, int32_t& len)
00247 {
00248 int32_t i;
00249 len=0;
00250 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00251 ST* fv=NULL;
00252
00253 if (sv.features)
00254 {
00255 len=num_features;
00256 fv=SG_MALLOC(ST, num_features);
00257
00258 for (i=0; i<num_features; i++)
00259 fv[i]=0;
00260
00261 for (i=0; i<sv.num_feat_entries; i++)
00262 fv[sv.features[i].feat_index]= sv.features[i].entry;
00263 }
00264
00265 free_sparse_feature_vector(sv, num);
00266
00267 return fv;
00268 }
00269
/** Expand sparse vector 'num' into an owning SGVector.
 *
 * @param num vector index; errors out when >= num_vectors
 * @return dense vector (empty/default-constructed if the sparse vector
 *         has no entries)
 */
SGVector<ST> get_full_feature_vector(int32_t num)
{
    if (num>=num_vectors)
    {
        SG_ERROR("Index out of bounds (number of vectors %d, you "
                "requested %d)\n", num_vectors, num);
    }

    SGSparseVector<ST> sv=get_sparse_feature_vector(num);

    SGVector<ST> dense;

    if (sv.features)
    {
        dense.do_free=true;
        dense.vlen=num_features;
        dense.vector=SG_MALLOC(ST, num_features);
        memset(dense.vector, 0, sizeof(ST)*num_features);

        for (int32_t i=0; i<sv.num_feat_entries; i++)
        {
            dense.vector[sv.features[i].feat_index]= sv.features[i].entry;
        }
    }

    free_sparse_feature_vector(sv, num);

    return dense;
}
00304
00305
00311 virtual inline int32_t get_nnz_features_for_vector(int32_t num)
00312 {
00313 SGSparseVector<ST> sv = get_sparse_feature_vector(num);
00314 int32_t len=sv.num_feat_entries;
00315 free_sparse_feature_vector(sv, num);
00316 return len;
00317 }
00318
/** Get sparse feature vector 'num' (subset-aware).
 *
 * If an in-memory matrix exists, a non-owning view of the stored vector is
 * returned (do_free=false). Otherwise the vector is computed on the fly via
 * compute_sparse_feature_vector(), optionally cached and run through the
 * attached preprocessors. Pair every call with free_sparse_feature_vector().
 *
 * @param num vector index
 * @return sparse vector; caller must not outlive this object with it
 */
SGSparseVector<ST> get_sparse_feature_vector(int32_t num)
{
    ASSERT(num<get_num_vectors());

    index_t real_num=subset_idx_conversion(num);

    SGSparseVector<ST> result;

    if (sparse_feature_matrix)
    {
        // non-owning view of the stored vector
        result=sparse_feature_matrix[real_num];
        result.do_free=false;
        return result;
    }
    else
    {
        result.do_free=false;

        if (feature_cache)
        {
            result.features=feature_cache->lock_entry(num);

            if (result.features)
                return result;
            else
            {
                result.features=feature_cache->set_entry(num);
            }
        }

        // no cache slot: the computed buffer belongs to the caller
        if (!result.features)
            result.do_free=true;

        result.features=compute_sparse_feature_vector(num,
                result.num_feat_entries, result.features);


        if (get_num_preprocessors())
        {
            int32_t tmp_len=result.num_feat_entries;
            SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
            SGSparseVectorEntry<ST>* tmp_feat_after = NULL;

            for (int32_t i=0; i<get_num_preprocessors(); i++)
            {
                // NOTE(review): the preprocessor application that should set
                // tmp_feat_after appears to be missing here; as written,
                // tmp_feat_after stays NULL and the memcpy below copies from
                // a NULL pointer -- confirm against the original sources


                if (i!=0)
                    SG_FREE(tmp_feat_before);
                tmp_feat_before=tmp_feat_after;
            }

            memcpy(result.features, tmp_feat_after,
                    sizeof(SGSparseVectorEntry<ST>)*tmp_len);

            SG_FREE(tmp_feat_after);
            result.num_feat_entries=tmp_len ;
            SG_DEBUG( "len: %d len2: %d\n", result.num_feat_entries, num_features);
        }
        result.vec_index=num;
        return result ;
    }
}
00391
00392
00403 static ST sparse_dot(ST alpha, SGSparseVectorEntry<ST>* avec, int32_t alen, SGSparseVectorEntry<ST>* bvec, int32_t blen)
00404 {
00405 ST result=0;
00406
00407
00408 if (avec && bvec)
00409 {
00410 if (alen<=blen)
00411 {
00412 int32_t j=0;
00413 for (int32_t i=0; i<alen; i++)
00414 {
00415 int32_t a_feat_idx=avec[i].feat_index;
00416
00417 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00418 j++;
00419
00420 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00421 {
00422 result+= avec[i].entry * bvec[j].entry;
00423 j++;
00424 }
00425 }
00426 }
00427 else
00428 {
00429 int32_t j=0;
00430 for (int32_t i=0; i<blen; i++)
00431 {
00432 int32_t b_feat_idx=bvec[i].feat_index;
00433
00434 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00435 j++;
00436
00437 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00438 {
00439 result+= bvec[i].entry * avec[j].entry;
00440 j++;
00441 }
00442 }
00443 }
00444
00445 result*=alpha;
00446 }
00447
00448 return result;
00449 }
00450
00463 ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00464 {
00465 ASSERT(vec);
00466 ASSERT(dim==num_features);
00467 ST result=b;
00468
00469 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00470
00471 if (sv.features)
00472 {
00473 for (int32_t i=0; i<sv.num_feat_entries; i++)
00474 {
00475 result+=alpha*vec[sv.features[i].feat_index]
00476 *sv.features[i].entry;
00477 }
00478 }
00479
00480 free_sparse_feature_vector(sv, num);
00481 return result;
00482 }
00483
00495 void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false)
00496 {
00497 ASSERT(vec);
00498 if (dim!=num_features)
00499 {
00500 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00501 dim, num_features);
00502 }
00503
00504 SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00505
00506 if (sv.features)
00507 {
00508 if (abs_val)
00509 {
00510 for (int32_t i=0; i<sv.num_feat_entries; i++)
00511 {
00512 vec[sv.features[i].feat_index]+=alpha
00513 *CMath::abs(sv.features[i].entry);
00514 }
00515 }
00516 else
00517 {
00518 for (int32_t i=0; i<sv.num_feat_entries; i++)
00519 {
00520 vec[sv.features[i].feat_index]+=alpha
00521 *sv.features[i].entry;
00522 }
00523 }
00524 }
00525
00526 free_sparse_feature_vector(sv, num);
00527 }
00528
/** Release a vector obtained from get_sparse_feature_vector():
 * unlocks the cache entry (if any) and frees the buffer when vec owns it.
 *
 * @param vec vector to release
 * @param num the index it was fetched with
 */
void free_sparse_feature_vector(SGSparseVector<ST> vec, int32_t num)
{
    if (feature_cache)
        feature_cache->unlock_entry(subset_idx_conversion(num));

    vec.free_vector();
}
00543
/** Get a non-owning pointer to the internal sparse matrix.
 * Not allowed while a subset is active (the raw matrix ignores subsets).
 *
 * @param num_feat out: number of features
 * @param num_vec out: number of vectors
 * @return internal vector array (do not free)
 */
SGSparseVector<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
{
    if (m_subset)
        SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");

    num_feat=num_features;
    num_vec=num_vectors;

    return sparse_feature_matrix;
}
00563
/** Get the internal sparse matrix wrapped in a non-owning SGSparseMatrix.
 * Not allowed while a subset is active.
 * @return sparse matrix view
 */
SGSparseMatrix<ST> get_sparse_feature_matrix()
{
    if (m_subset)
        SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");

    SGSparseMatrix<ST> sm;
    sm.sparse_matrix=get_sparse_feature_matrix(sm.num_features, sm.num_vectors);
    return sm;
}
00580
00586 static void clean_tsparse(SGSparseVector<ST>* sfm, int32_t num_vec)
00587 {
00588 if (sfm)
00589 {
00590 for (int32_t i=0; i<num_vec; i++)
00591 SG_FREE(sfm[i].features);
00592
00593 SG_FREE(sfm);
00594 }
00595 }
00596
00603 CSparseFeatures<ST>* get_transposed()
00604 {
00605 int32_t num_feat;
00606 int32_t num_vec;
00607 SGSparseVector<ST>* s=get_transposed(num_feat, num_vec);
00608 return new CSparseFeatures<ST>(s, num_feat, num_vec);
00609 }
00610
/** Compute the transposed sparse matrix (subset-aware): output vector v
 * holds all (vector, value) pairs whose feat_index was v.
 *
 * Two passes: first a histogram of entries per feature, then a fill pass
 * reusing the histogram as per-row write cursors.
 *
 * @param num_feat out: features of the transpose (= this->get_num_vectors())
 * @param num_vec out: vectors of the transpose (= this->num_features)
 * @return newly allocated vector array; caller takes ownership
 */
SGSparseVector<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
{
    num_feat=get_num_vectors();
    num_vec=num_features;

    int32_t* hist=SG_MALLOC(int32_t, num_features);
    memset(hist, 0, sizeof(int32_t)*num_features);

    // pass 1: count entries per feature dimension
    for (int32_t v=0; v<num_feat; v++)
    {
        SGSparseVector<ST> sv=get_sparse_feature_vector(v);

        for (int32_t i=0; i<sv.num_feat_entries; i++)
            hist[sv.features[i].feat_index]++;

        free_sparse_feature_vector(sv, v);
    }

    // allocate the transposed rows with exact sizes
    SGSparseVector<ST>* sfm=SG_MALLOC(SGSparseVector<ST>, num_vec);
    for (int32_t v=0; v<num_vec; v++)
    {
        sfm[v].features= SG_MALLOC(SGSparseVectorEntry<ST>, hist[v]);
        sfm[v].num_feat_entries=hist[v];
        sfm[v].vec_index=v;
    }

    // pass 2: scatter entries; hist[] now serves as write cursors
    memset(hist,0,sizeof(int32_t)*num_features);
    for (int32_t v=0; v<num_feat; v++)
    {
        SGSparseVector<ST> sv=get_sparse_feature_vector(v);

        for (int32_t i=0; i<sv.num_feat_entries; i++)
        {
            int32_t vidx=sv.features[i].feat_index;
            int32_t fidx=v;
            sfm[vidx].features[hist[vidx]].feat_index=fidx;
            sfm[vidx].features[hist[vidx]].entry=sv.features[i].entry;
            hist[vidx]++;
        }

        free_sparse_feature_vector(sv, v);
    }

    SG_FREE(hist);
    return sfm;
}
00671
/** Replace the internal sparse matrix; this object takes ownership
 * (sm.own_matrix()). Not allowed while a subset is active.
 *
 * @param sm sparse matrix to adopt
 */
void set_sparse_feature_matrix(SGSparseMatrix<ST> sm)
{
    if (m_subset)
        SG_ERROR("set_sparse_feature_matrix() not allowed with subset\n");


    free_sparse_feature_matrix();
    sm.own_matrix();

    sparse_feature_matrix=sm.sparse_matrix;
    num_features=sm.num_features;
    num_vectors=sm.num_vectors;
}
00692
00699 SGMatrix<ST> get_full_feature_matrix()
00700 {
00701 SGMatrix<ST> full;
00702
00703 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00704 full.num_rows=num_features;
00705 full.num_cols=get_num_vectors();
00706 full.do_free=true;
00707 full.matrix=SG_MALLOC(ST, int64_t(num_features)*get_num_vectors());
00708
00709 memset(full.matrix, 0, size_t(num_features)*size_t(get_num_vectors())*sizeof(ST));
00710
00711 for (int32_t v=0; v<full.num_cols; v++)
00712 {
00713 SGSparseVector<ST> current=
00714 sparse_feature_matrix[subset_idx_conversion(v)];
00715
00716 for (int32_t f=0; f<current.num_feat_entries; f++)
00717 {
00718 int64_t offs=(current.vec_index*num_features)
00719 +current.features[f].feat_index;
00720
00721 full.matrix[offs]=current.features[f].entry;
00722 }
00723 }
00724
00725 return full;
00726 }
00727
00737 virtual bool set_full_feature_matrix(SGMatrix<ST> full)
00738 {
00739 remove_subset();
00740
00741 ST* src=full.matrix;
00742 int32_t num_feat=full.num_rows;
00743 int32_t num_vec=full.num_cols;
00744
00745 free_sparse_feature_matrix();
00746 bool result=true;
00747 num_features=num_feat;
00748 num_vectors=num_vec;
00749
00750 SG_INFO("converting dense feature matrix to sparse one\n");
00751 int32_t* num_feat_entries=SG_MALLOC(int, num_vectors);
00752
00753 if (num_feat_entries)
00754 {
00755 int64_t num_total_entries=0;
00756
00757
00758 for (int32_t i=0; i< num_vec; i++)
00759 {
00760 num_feat_entries[i]=0;
00761 for (int32_t j=0; j< num_feat; j++)
00762 {
00763 if (src[i*((int64_t) num_feat) + j] != 0)
00764 num_feat_entries[i]++;
00765 }
00766 }
00767
00768 if (num_vec>0)
00769 {
00770 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vec);
00771
00772 if (sparse_feature_matrix)
00773 {
00774 for (int32_t i=0; i< num_vec; i++)
00775 {
00776 sparse_feature_matrix[i].vec_index=i;
00777 sparse_feature_matrix[i].num_feat_entries=0;
00778 sparse_feature_matrix[i].features= NULL;
00779
00780 if (num_feat_entries[i]>0)
00781 {
00782 sparse_feature_matrix[i].features= SG_MALLOC(SGSparseVectorEntry<ST>, num_feat_entries[i]);
00783
00784 if (!sparse_feature_matrix[i].features)
00785 {
00786 SG_INFO( "allocation of features failed\n");
00787 return false;
00788 }
00789
00790 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00791 int32_t sparse_feat_idx=0;
00792
00793 for (int32_t j=0; j< num_feat; j++)
00794 {
00795 int64_t pos= i*num_feat + j;
00796
00797 if (src[pos] != 0)
00798 {
00799 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00800 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00801 sparse_feat_idx++;
00802 num_total_entries++;
00803 }
00804 }
00805 }
00806 }
00807 }
00808 else
00809 {
00810 SG_ERROR( "allocation of sparse feature matrix failed\n");
00811 result=false;
00812 }
00813
00814 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00815 num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
00816 }
00817 else
00818 {
00819 SG_ERROR( "huh ? zero size matrix given ?\n");
00820 result=false;
00821 }
00822 }
00823 SG_FREE(num_feat_entries);
00824 return result;
00825 }
00826
00834 virtual bool apply_preprocessor(bool force_preprocessing=false)
00835 {
00836 SG_INFO( "force: %d\n", force_preprocessing);
00837
00838 if ( sparse_feature_matrix && get_num_preprocessors() )
00839 {
00840 for (int32_t i=0; i<get_num_preprocessors(); i++)
00841 {
00842 if ( (!is_preprocessed(i) || force_preprocessing) )
00843 {
00844 set_preprocessed(i);
00845 SG_INFO( "preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
00846 if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL)
00847 return false;
00848 }
00849 return true;
00850 }
00851 return true;
00852 }
00853 else
00854 {
00855 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00856 return false;
00857 }
00858 }
00859
/** @return size of one feature element (sizeof(ST)) in bytes */
virtual int32_t get_size() { return sizeof(ST); }
00865
/** Build this object's sparse matrix from dense (simple) features.
 * @param sf dense features to sparsify; must hold a non-empty matrix
 * @return true on success
 */
bool obtain_from_simple(CSimpleFeatures<ST>* sf)
{
    SGMatrix<ST> fm=sf->get_feature_matrix();
    ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0);

    return set_full_feature_matrix(fm);
}
00880
/** @return number of vectors: subset size when a subset is active,
 * otherwise the full matrix size */
virtual inline int32_t get_num_vectors() const
{
    return m_subset ? m_subset->get_size() : num_vectors;
}
00889
/** @return number of feature dimensions */
inline int32_t get_num_features() { return num_features; }
00895
/** Grow the feature-space dimensionality (sparse vectors need no resize).
 * Shrinking is forbidden (asserted).
 * @param num new number of features, >= current
 * @return the new number of features
 */
inline int32_t set_num_features(int32_t num)
{
    int32_t n=num_features;
    ASSERT(n<=num);
    num_features=num;
    return num_features;
}
00914
/** @return feature class C_SPARSE */
inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }

/** @return feature type of ST; defined per-type via the GET_FEATURE_TYPE
 * specializations at the end of this header */
inline virtual EFeatureType get_feature_type();
00926
/** Release a fetched vector (same behavior as free_sparse_feature_vector):
 * unlock the cache entry and free the buffer when owned.
 *
 * @param vec vector to release
 * @param num the index it was fetched with
 */
void free_feature_vector(SGSparseVector<ST> vec, int32_t num)
{
    if (feature_cache)
        feature_cache->unlock_entry(subset_idx_conversion(num));

    vec.free_vector();
}
00941
00946 int64_t get_num_nonzero_entries()
00947 {
00948 int64_t num=0;
00949 index_t num_vec=get_num_vectors();
00950 for (int32_t i=0; i<num_vec; i++)
00951 num+=sparse_feature_matrix[subset_idx_conversion(i)].num_feat_entries;
00952
00953 return num;
00954 }
00955
00963 float64_t* compute_squared(float64_t* sq)
00964 {
00965 ASSERT(sq);
00966
00967 index_t num_vec=get_num_vectors();
00968 for (int32_t i=0; i<num_vec; i++)
00969 {
00970 sq[i]=0;
00971 SGSparseVector<ST> vec=get_sparse_feature_vector(i);
00972
00973 for (int32_t j=0; j<vec.num_feat_entries; j++)
00974 sq[i]+=vec.features[j].entry*vec.features[j].entry;
00975
00976 free_feature_vector(vec, i);
00977 }
00978
00979 return sq;
00980 }
00981
/** Compute ||a - b||^2 = ||a||^2 + ||b||^2 - 2*(a.b) between vector idx_a
 * of lhs and idx_b of rhs, using precomputed squared norms and a merge-style
 * walk over the shorter entry list (entries assumed sorted by feat_index).
 *
 * @param lhs left features
 * @param sq_lhs precomputed squared norms of lhs (see compute_squared())
 * @param idx_a vector index into lhs
 * @param rhs right features
 * @param sq_rhs precomputed squared norms of rhs
 * @param idx_b vector index into rhs
 * @return absolute value of the squared distance
 */
float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
{
    int32_t i,j;
    ASSERT(lhs);
    ASSERT(rhs);

    SGSparseVector<float64_t> avec=lhs->get_sparse_feature_vector(idx_a);
    SGSparseVector<float64_t> bvec=rhs->get_sparse_feature_vector(idx_b);
    ASSERT(avec.features);
    ASSERT(bvec.features);

    float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];

    if (avec.num_feat_entries<=bvec.num_feat_entries)
    {
        // walk avec, advance a cursor through bvec
        j=0;
        for (i=0; i<avec.num_feat_entries; i++)
        {
            int32_t a_feat_idx=avec.features[i].feat_index;

            while ((j<bvec.num_feat_entries)
                    &&(bvec.features[j].feat_index<a_feat_idx))
                j++;

            if ((j<bvec.num_feat_entries)
                    &&(bvec.features[j].feat_index==a_feat_idx))
            {
                result-=2*(avec.features[i].entry*bvec.features[j].entry);
                j++;
            }
        }
    }
    else
    {
        // symmetric case: walk bvec, scan avec
        j=0;
        for (i=0; i<bvec.num_feat_entries; i++)
        {
            int32_t b_feat_idx=bvec.features[i].feat_index;

            while ((j<avec.num_feat_entries)
                    &&(avec.features[j].feat_index<b_feat_idx))
                j++;

            if ((j<avec.num_feat_entries)
                    &&(avec.features[j].feat_index==b_feat_idx))
            {
                result-=2*(bvec.features[i].entry*avec.features[j].entry);
                j++;
            }
        }
    }

    ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a);
    ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b);

    return CMath::abs(result);
}
01053
01060 void load(CFile* loader);
01061
01068 void save(CFile* writer);
01069
01079 CLabels* load_svmlight_file(char* fname, bool do_sort_features=true)
01080 {
01081 remove_subset();
01082
01083 CLabels* lab=NULL;
01084
01085 size_t blocksize=1024*1024;
01086 size_t required_blocksize=blocksize;
01087 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
01088 FILE* f=fopen(fname, "ro");
01089
01090 if (f)
01091 {
01092 free_sparse_feature_matrix();
01093 num_vectors=0;
01094 num_features=0;
01095
01096 SG_INFO("counting line numbers in file %s\n", fname);
01097 size_t sz=blocksize;
01098 size_t block_offs=0;
01099 size_t old_block_offs=0;
01100 fseek(f, 0, SEEK_END);
01101 size_t fsize=ftell(f);
01102 rewind(f);
01103
01104 while (sz == blocksize)
01105 {
01106 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01107 bool contains_cr=false;
01108 for (size_t i=0; i<sz; i++)
01109 {
01110 block_offs++;
01111 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01112 {
01113 num_vectors++;
01114 contains_cr=true;
01115 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
01116 old_block_offs=block_offs;
01117 }
01118 }
01119 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
01120 }
01121
01122 SG_INFO("found %d feature vectors\n", num_vectors);
01123 SG_FREE(dummy);
01124 blocksize=required_blocksize;
01125 dummy = SG_MALLOC(uint8_t, blocksize+1);
01126
01127 lab=new CLabels(num_vectors);
01128 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
01129
01130 rewind(f);
01131 sz=blocksize;
01132 int32_t lines=0;
01133 while (sz == blocksize)
01134 {
01135 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01136
01137 size_t old_sz=0;
01138 for (size_t i=0; i<sz; i++)
01139 {
01140 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
01141 {
01142 size_t len=i-old_sz+1;
01143 uint8_t* data=&dummy[old_sz];
01144
01145 for (int32_t j=0; j<len; j++)
01146 dummy[j]=data[j];
01147
01148 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
01149 i=0;
01150 old_sz=0;
01151 sz+=len;
01152 }
01153
01154 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01155 {
01156
01157 size_t len=i-old_sz;
01158 uint8_t* data=&dummy[old_sz];
01159
01160 int32_t dims=0;
01161 for (int32_t j=0; j<len; j++)
01162 {
01163 if (data[j]==':')
01164 dims++;
01165 }
01166
01167 if (dims<=0)
01168 {
01169 SG_ERROR("Error in line %d - number of"
01170 " dimensions is %d line is %d characters"
01171 " long\n line_content:'%.*s'\n", lines,
01172 dims, len, len, (const char*) data);
01173 }
01174
01175 SGSparseVectorEntry<ST>* feat=SG_MALLOC(SGSparseVectorEntry<ST>, dims);
01176 int32_t j=0;
01177 for (; j<len; j++)
01178 {
01179 if (data[j]==' ')
01180 {
01181 data[j]='\0';
01182
01183 lab->set_label(lines, atof((const char*) data));
01184 break;
01185 }
01186 }
01187
01188 int32_t d=0;
01189 j++;
01190 uint8_t* start=&data[j];
01191 for (; j<len; j++)
01192 {
01193 if (data[j]==':')
01194 {
01195 data[j]='\0';
01196
01197 feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
01198 num_features=CMath::max(num_features, feat[d].feat_index+1);
01199
01200 j++;
01201 start=&data[j];
01202 for (; j<len; j++)
01203 {
01204 if (data[j]==' ' || data[j]=='\n')
01205 {
01206 data[j]='\0';
01207 feat[d].entry=(ST) atof((const char*) start);
01208 d++;
01209 break;
01210 }
01211 }
01212
01213 if (j==len)
01214 {
01215 data[j]='\0';
01216 feat[dims-1].entry=(ST) atof((const char*) start);
01217 }
01218
01219 j++;
01220 start=&data[j];
01221 }
01222 }
01223
01224 sparse_feature_matrix[lines].vec_index=lines;
01225 sparse_feature_matrix[lines].num_feat_entries=dims;
01226 sparse_feature_matrix[lines].features=feat;
01227
01228 old_sz=i+1;
01229 lines++;
01230 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01231 }
01232 }
01233 }
01234 SG_INFO("file successfully read\n");
01235 fclose(f);
01236 }
01237
01238 SG_FREE(dummy);
01239
01240 if (do_sort_features)
01241 sort_features();
01242
01243 return lab;
01244 }
01245
/** Sort each vector's entries in ascending feat_index order, in place.
 * Requires the in-memory matrix, no subset, and no preprocessors.
 * Duplicate indices within a vector would trip the post-sort ASSERT
 * (strictly increasing indices are verified).
 */
void sort_features()
{
    if (m_subset)
        SG_ERROR("sort_features() not allowed with subset\n");

    ASSERT(get_num_preprocessors()==0);

    if (!sparse_feature_matrix)
        SG_ERROR("Requires sparse feature matrix to be available in-memory\n");

    for (int32_t i=0; i<num_vectors; i++)
    {
        int32_t len=sparse_feature_matrix[i].num_feat_entries;

        if (!len)
            continue;

        SGSparseVectorEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
        int32_t* feat_idx=SG_MALLOC(int32_t, len);
        int32_t* orig_idx=SG_MALLOC(int32_t, len);

        // sort an index permutation rather than the entries themselves
        for (int j=0; j<len; j++)
        {
            feat_idx[j]=sf_orig[j].feat_index;
            orig_idx[j]=j;
        }

        CMath::qsort_index(feat_idx, orig_idx, len);

        // rebuild the entry list in sorted order
        SGSparseVectorEntry<ST>* sf_new= SG_MALLOC(SGSparseVectorEntry<ST>, len);
        for (int j=0; j<len; j++)
            sf_new[j]=sf_orig[orig_idx[j]];

        sparse_feature_matrix[i].features=sf_new;

        // verify strict ordering (also catches duplicate indices)
        for (int j=0; j<len-1; j++)
            ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);

        SG_FREE(orig_idx);
        SG_FREE(feat_idx);
        SG_FREE(sf_orig);
    }
}
01295
01304 bool write_svmlight_file(char* fname, CLabels* label)
01305 {
01306 if (m_subset)
01307 SG_ERROR("write_svmlight_file() not allowed with subset\n");
01308
01309 ASSERT(label);
01310 int32_t num=label->get_num_labels();
01311 ASSERT(num>0);
01312 ASSERT(num==num_vectors);
01313
01314 FILE* f=fopen(fname, "wb");
01315
01316 if (f)
01317 {
01318 for (int32_t i=0; i<num; i++)
01319 {
01320 fprintf(f, "%d ", (int32_t) label->get_int_label(i));
01321
01322 SGSparseVectorEntry<ST>* vec = sparse_feature_matrix[i].features;
01323 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
01324
01325 for (int32_t j=0; j<num_feat; j++)
01326 {
01327 if (j<num_feat-1)
01328 fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01329 else
01330 fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01331 }
01332 }
01333
01334 fclose(f);
01335 return true;
01336 }
01337 return false;
01338 }
01339
/** @return dimensionality of the feature space (num_features) */
virtual int32_t get_dim_feature_space() const
{
    return num_features;
}
01351
01361 virtual float64_t dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
01362 {
01363 ASSERT(df);
01364 ASSERT(df->get_feature_type() == get_feature_type());
01365 ASSERT(df->get_feature_class() == get_feature_class());
01366 CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df;
01367
01368 SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1);
01369 SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2);
01370
01371 float64_t result=sparse_dot(1, avec.features, avec.num_feat_entries,
01372 bvec.features, bvec.num_feat_entries);
01373
01374 free_sparse_feature_vector(avec, vec_idx1);
01375 sf->free_sparse_feature_vector(bvec, vec_idx2);
01376
01377 return result;
01378 }
01379
01388 virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01389 {
01390 ASSERT(vec2);
01391 if (vec2_len!=num_features)
01392 {
01393 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
01394 vec2_len, num_features);
01395 }
01396 float64_t result=0;
01397
01398 SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1);
01399
01400 if (sv.features)
01401 {
01402 for (int32_t i=0; i<sv.num_feat_entries; i++)
01403 result+=vec2[sv.features[i].feat_index]*sv.features[i].entry;
01404 }
01405
01406 free_sparse_feature_vector(sv, vec_idx1);
01407
01408 return result;
01409 }
01410
01411 #ifndef DOXYGEN_SHOULD_SKIP_THIS
01412
/** Iteration state used by get_feature_iterator()/get_next_feature():
 * a fetched sparse vector plus the position of the next entry to yield. */
struct sparse_feature_iterator
{
    /* sparse vector currently being iterated */
    SGSparseVector<ST> sv;

    /* index of the next entry to return */
    int32_t index;

    /* print current iterator state (debugging aid) */
    void print_info()
    {
        SG_SPRINT("sv=%p, vidx=%d, num_feat_entries=%d, index=%d\n",
                sv.features, sv.vec_index, sv.num_feat_entries, index);
    }
};
01428 #endif
01429
/** Create an iterator over the non-zero entries of one vector. Release it
 * with free_feature_iterator(). Requires the in-memory matrix.
 *
 * NOTE(review): the iterator struct is allocated with SG_MALLOC, i.e. no
 * constructor runs; it is fully assigned below, but confirm SG_MALLOC is
 * appropriate for this non-POD struct.
 *
 * @param vector_index vector to iterate over
 * @return opaque iterator handle
 */
virtual void* get_feature_iterator(int32_t vector_index)
{
    if (vector_index>=get_num_vectors())
    {
        SG_ERROR("Index out of bounds (number of vectors %d, you "
                "requested %d)\n", get_num_vectors(), vector_index);
    }

    if (!sparse_feature_matrix)
        SG_ERROR("Requires a in-memory feature matrix\n");

    sparse_feature_iterator* it=SG_MALLOC(sparse_feature_iterator, 1);
    it->sv=get_sparse_feature_vector(vector_index);
    it->index=0;

    return it;
}
01458
01469 virtual bool get_next_feature(int32_t& index, float64_t& value, void* iterator)
01470 {
01471 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01472 if (!it || it->index>=it->sv.num_feat_entries)
01473 return false;
01474
01475 int32_t i=it->index++;
01476
01477 index=it->sv.features[i].feat_index;
01478 value=(float64_t) it->sv.features[i].entry;
01479
01480 return true;
01481 }
01482
/** Release an iterator created by get_feature_iterator(): frees the fetched
 * vector and the iterator struct itself. NULL-safe.
 *
 * @param iterator iterator handle (may be NULL)
 */
virtual void free_feature_iterator(void* iterator)
{
    if (!iterator)
        return;

    sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
    free_sparse_feature_vector(it->sv, it->sv.vec_index);
    SG_FREE(it);
}
01497
/** Build a new CSparseFeatures containing deep copies of the vectors at the
 * given (subset-aware) indices.
 *
 * NOTE(review): the SGSparseVector(num_entries, index) constructor is
 * assumed to allocate the 'features' buffer that the memcpy below fills --
 * confirm against SGSparseVector's definition.
 *
 * @param indices indices of the vectors to copy
 * @return newly allocated feature object; caller takes ownership
 */
virtual CFeatures* copy_subset(SGVector<index_t> indices)
{
    SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(indices.vlen,
            get_dim_feature_space());

    for (index_t i=0; i<indices.vlen; ++i)
    {

        index_t index=indices.vector[i];


        SGSparseVector<ST> current=get_sparse_feature_vector(index);
        matrix_copy.sparse_matrix[i]=SGSparseVector<ST>(
                current.num_feat_entries, current.vec_index);


        memcpy(matrix_copy.sparse_matrix[i].features, current.features,
                sizeof(SGSparseVectorEntry<ST>)*current.num_feat_entries);

        free_sparse_feature_vector(current, index);
    }

    return new CSparseFeatures<ST>(matrix_copy);
}
01528
/** @return object name "SparseFeatures" */
inline virtual const char* get_name() const { return "SparseFeatures"; }
01531
01532 protected:
/** Compute feature vector 'num' on the fly; base implementation is a stub
 * (SG_NOTIMPLEMENTED) -- subclasses without an in-memory matrix override it.
 *
 * @param num vector index
 * @param len out: number of computed entries (0 here)
 * @param target optional preallocated buffer (e.g. a cache slot)
 * @return entry array, or NULL in this stub
 */
virtual SGSparseVectorEntry<ST>* compute_sparse_feature_vector(int32_t num,
        int32_t& len, SGSparseVectorEntry<ST>* target=NULL)
{
    SG_NOTIMPLEMENTED;

    len=0;
    return NULL;
}
01551
01552 private:
/** Shared constructor body: set the generic type tag for ST and register
 * the serializable parameters (matrix and feature count). */
void init(void)
{
    set_generic<ST>();

    m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
            "sparse_feature_matrix",
            "Array of sparse vectors.");
    m_parameters->add(&num_features, "num_features",
            "Total number of features.");
}
01563
01564
01565 protected:
01566
01568 int32_t num_vectors;
01569
01571 int32_t num_features;
01572
01574 SGSparseVector<ST>* sparse_feature_matrix;
01575
01577 CCache< SGSparseVectorEntry<ST> >* feature_cache;
01578 };
01579
01580 #ifndef DOXYGEN_SHOULD_SKIP_THIS
/* Explicit specializations of get_feature_type(): map each supported
 * element type ST to its EFeatureType enum value. */
#define GET_FEATURE_TYPE(sg_type, f_type) \
template<> inline EFeatureType CSparseFeatures<sg_type>::get_feature_type() \
{ \
	return f_type; \
}
GET_FEATURE_TYPE(bool, F_BOOL)
GET_FEATURE_TYPE(char, F_CHAR)
GET_FEATURE_TYPE(uint8_t, F_BYTE)
GET_FEATURE_TYPE(int8_t, F_BYTE)
GET_FEATURE_TYPE(int16_t, F_SHORT)
GET_FEATURE_TYPE(uint16_t, F_WORD)
GET_FEATURE_TYPE(int32_t, F_INT)
GET_FEATURE_TYPE(uint32_t, F_UINT)
GET_FEATURE_TYPE(int64_t, F_LONG)
GET_FEATURE_TYPE(uint64_t, F_ULONG)
GET_FEATURE_TYPE(float32_t, F_SHORTREAL)
GET_FEATURE_TYPE(float64_t, F_DREAL)
GET_FEATURE_TYPE(floatmax_t, F_LONGREAL)
#undef GET_FEATURE_TYPE
01600
/* Explicit specializations of load(): read a sparse matrix via the
 * type-specific CFile getter and adopt it. Removes any active subset and
 * forces the C locale for numeric parsing. */
#define LOAD(fname, sg_type) \
template<> inline void CSparseFeatures<sg_type>::load(CFile* loader) \
{ \
	remove_subset(); \
	SG_SET_LOCALE_C; \
	ASSERT(loader); \
	SGSparseVector<sg_type>* matrix=NULL; \
	int32_t num_feat=0; \
	int32_t num_vec=0; \
	loader->fname(matrix, num_feat, num_vec); \
	set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec)); \
	SG_RESET_LOCALE; \
}
LOAD(get_sparse_matrix, bool)
LOAD(get_sparse_matrix, char)
LOAD(get_sparse_matrix, uint8_t)
LOAD(get_int8_sparsematrix, int8_t)
LOAD(get_sparse_matrix, int16_t)
LOAD(get_sparse_matrix, uint16_t)
LOAD(get_sparse_matrix, int32_t)
LOAD(get_uint_sparsematrix, uint32_t)
LOAD(get_long_sparsematrix, int64_t)
LOAD(get_ulong_sparsematrix, uint64_t)
LOAD(get_sparse_matrix, float32_t)
LOAD(get_sparse_matrix, float64_t)
LOAD(get_longreal_sparsematrix, floatmax_t)
#undef LOAD
01628
/* Explicit specializations of save(): write the sparse matrix via the
 * type-specific CFile setter. Not allowed with a subset; forces the C
 * locale for numeric formatting. */
#define WRITE(fname, sg_type) \
template<> inline void CSparseFeatures<sg_type>::save(CFile* writer) \
{ \
	if (m_subset) \
		SG_ERROR("save() not allowed with subset\n"); \
	SG_SET_LOCALE_C; \
	ASSERT(writer); \
	writer->fname(sparse_feature_matrix, num_features, num_vectors); \
	SG_RESET_LOCALE; \
}
WRITE(set_sparse_matrix, bool)
WRITE(set_sparse_matrix, char)
WRITE(set_sparse_matrix, uint8_t)
WRITE(set_int8_sparsematrix, int8_t)
WRITE(set_sparse_matrix, int16_t)
WRITE(set_sparse_matrix, uint16_t)
WRITE(set_sparse_matrix, int32_t)
WRITE(set_uint_sparsematrix, uint32_t)
WRITE(set_long_sparsematrix, int64_t)
WRITE(set_ulong_sparsematrix, uint64_t)
WRITE(set_sparse_matrix, float32_t)
WRITE(set_sparse_matrix, float64_t)
WRITE(set_longreal_sparsematrix, floatmax_t)
#undef WRITE
01653 #endif // DOXYGEN_SHOULD_SKIP_THIS
01654 }
01655 #endif