SparseFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2010 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Subset support written (W) 2011 Heiko Strathmann
00010  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00011  * Copyright (C) 2010 Berlin Institute of Technology
00012  */
00013 
00014 #ifndef _SPARSEFEATURES__H__
00015 #define _SPARSEFEATURES__H__
00016 
00017 #include <string.h>
00018 #include <stdlib.h>
00019 
00020 #include <shogun/lib/common.h>
00021 #include <shogun/mathematics/Math.h>
00022 #include <shogun/lib/Cache.h>
00023 #include <shogun/io/SGIO.h>
00024 #include <shogun/lib/Cache.h>
00025 #include <shogun/io/File.h>
00026 #include <shogun/lib/DataType.h>
00027 
00028 #include <shogun/features/Labels.h>
00029 #include <shogun/features/Features.h>
00030 #include <shogun/features/DotFeatures.h>
00031 #include <shogun/features/SimpleFeatures.h>
00032 #include <shogun/preprocessor/SparsePreprocessor.h>
00033 
00034 namespace shogun
00035 {
00036 
00037 class CFile;
00038 class CLabels;
00039 class CFeatures;
00040 class CDotFeatures;
00041 template <class ST> class CSimpleFeatures;
00042 template <class ST> class CSparsePreprocessor;
00043 
00061 template <class ST> class CSparseFeatures : public CDotFeatures
00062 {
00063     public:
00068         CSparseFeatures(int32_t size=0)
00069         : CDotFeatures(size), num_vectors(0), num_features(0),
00070             sparse_feature_matrix(NULL), feature_cache(NULL)
00071         { init(); }
00072 
00081         CSparseFeatures(SGSparseVector<ST>* src, int32_t num_feat, int32_t num_vec, bool copy=false)
00082         : CDotFeatures(0), num_vectors(0), num_features(0),
00083             sparse_feature_matrix(NULL), feature_cache(NULL)
00084         {
00085             init();
00086 
00087             if (!copy)
00088                 set_sparse_feature_matrix(SGSparseMatrix<ST>(src, num_feat, num_vec));
00089             else
00090             {
00091                 sparse_feature_matrix = SG_MALLOC(SGSparseVector<ST>, num_vec);
00092                 memcpy(sparse_feature_matrix, src, sizeof(SGSparseVector<ST>)*num_vec);
00093                 for (int32_t i=0; i< num_vec; i++)
00094                 {
00095                     sparse_feature_matrix[i].features = SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries);
00096                     memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00097 
00098                 }
00099             }
00100         }
00101 
00107         CSparseFeatures(SGSparseMatrix<ST> sparse)
00108         : CDotFeatures(0), num_vectors(0), num_features(0),
00109             sparse_feature_matrix(NULL), feature_cache(NULL)
00110         {
00111             init();
00112 
00113             set_sparse_feature_matrix(sparse);
00114         }
00115 
00121         CSparseFeatures(SGMatrix<ST> dense)
00122         : CDotFeatures(0), num_vectors(0), num_features(0),
00123             sparse_feature_matrix(NULL), feature_cache(NULL)
00124         {
00125             init();
00126 
00127             set_full_feature_matrix(dense);
00128         }
00129 
00131         CSparseFeatures(const CSparseFeatures & orig)
00132         : CDotFeatures(orig), num_vectors(orig.num_vectors),
00133             num_features(orig.num_features),
00134             sparse_feature_matrix(orig.sparse_feature_matrix),
00135             feature_cache(orig.feature_cache)
00136         {
00137             init();
00138 
00139             if (orig.sparse_feature_matrix)
00140             {
00141                 free_sparse_feature_matrix();
00142                 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
00143                 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(SGSparseVector<ST>)*num_vectors);
00144                 for (int32_t i=0; i< num_vectors; i++)
00145                 {
00146                     sparse_feature_matrix[i].features=SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries);
00147                     memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00148 
00149                 }
00150             }
00151 
00152             m_subset=orig.m_subset->duplicate();
00153         }
00154 
00159         CSparseFeatures(CFile* loader)
00160         : CDotFeatures(loader), num_vectors(0), num_features(0),
00161             sparse_feature_matrix(NULL), feature_cache(NULL)
00162         {
00163             init();
00164 
00165             load(loader);
00166         }
00167 
00169         virtual ~CSparseFeatures()
00170         {
00171             free_sparse_features();
00172         }
00173 
        /** free the sparse feature matrix
         *
         * Frees every vector's entry array and the matrix itself via
         * clean_tsparse(), resets num_vectors/num_features to zero.
         *
         * NOTE(review): this also removes any active subset as a side
         * effect — callers that rely on the subset must re-apply it.
         */
        void free_sparse_feature_matrix()
        {
            clean_tsparse(sparse_feature_matrix, num_vectors);
            sparse_feature_matrix = NULL;
            num_vectors=0;
            num_features=0;
            remove_subset();
        }
00186 
00191         void free_sparse_features()
00192         {
00193             free_sparse_feature_matrix();
00194             delete feature_cache;
00195             feature_cache = NULL;
00196         }
00197 
00202         virtual CFeatures* duplicate() const
00203         {
00204             return new CSparseFeatures<ST>(*this);
00205         }
00206 
00216         ST get_feature(int32_t num, int32_t index)
00217         {
00218             ASSERT(index>=0 && index<num_features) ;
00219             ASSERT(num>=0 && num<get_num_vectors()) ;
00220 
00221             int32_t i;
00222             SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00223             ST ret = 0 ;
00224             
00225             if (sv.features)
00226             {
00227                 for (i=0; i<sv.num_feat_entries; i++)
00228                     if (sv.features[i].feat_index==index)
00229                         ret+=sv.features[i].entry ;
00230             }
00231             
00232             free_sparse_feature_vector(sv, num);
00233             
00234             return ret ;
00235         }
00236         
00237 
00246         ST* get_full_feature_vector(int32_t num, int32_t& len)
00247         {
00248             int32_t i;
00249             len=0;
00250             SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00251             ST* fv=NULL;
00252 
00253             if (sv.features)
00254             {
00255                 len=num_features;
00256                 fv=SG_MALLOC(ST, num_features);
00257 
00258                 for (i=0; i<num_features; i++)
00259                     fv[i]=0;
00260 
00261                 for (i=0; i<sv.num_feat_entries; i++)
00262                     fv[sv.features[i].feat_index]= sv.features[i].entry;
00263             }
00264 
00265             free_sparse_feature_vector(sv, num);
00266 
00267             return fv;
00268         }
00269 
00275         SGVector<ST> get_full_feature_vector(int32_t num)
00276         {
00277             if (num>=num_vectors)
00278             {
00279                 SG_ERROR("Index out of bounds (number of vectors %d, you "
00280                         "requested %d)\n", num_vectors, num);
00281             }
00282 
00283             SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00284 
00285             SGVector<ST> dense;
00286 
00287             if (sv.features)
00288             {
00289                 dense.do_free=true;
00290                 dense.vlen=num_features;
00291                 dense.vector=SG_MALLOC(ST, num_features);
00292                 memset(dense.vector, 0, sizeof(ST)*num_features);
00293 
00294                 for (int32_t i=0; i<sv.num_feat_entries; i++)
00295                 {
00296                     dense.vector[sv.features[i].feat_index]= sv.features[i].entry;
00297                 }
00298             }
00299 
00300             free_sparse_feature_vector(sv, num);
00301 
00302             return dense;
00303         }
00304 
00305 
00311         virtual inline int32_t get_nnz_features_for_vector(int32_t num)
00312         {
00313             SGSparseVector<ST> sv = get_sparse_feature_vector(num);
00314             int32_t len=sv.num_feat_entries;
00315             free_sparse_feature_vector(sv, num);
00316             return len;
00317         }
00318 
        /** get one sparse feature vector
         *
         * If a sparse matrix is present, returns a non-owning view of the
         * stored vector (do_free=false). Otherwise the vector is computed on
         * the fly via compute_sparse_feature_vector(), optionally backed by
         * the feature cache; pair every call with
         * free_sparse_feature_vector() to release cache locks/memory.
         *
         * @param num vector index (subset-aware; asserted in range)
         * @return sparse vector for index num
         */
        SGSparseVector<ST> get_sparse_feature_vector(int32_t num)
        {
            ASSERT(num<get_num_vectors());

            // map subset index to the underlying matrix index
            index_t real_num=subset_idx_conversion(num);

            SGSparseVector<ST> result;

            if (sparse_feature_matrix)
            {
                // shallow view into the stored matrix; caller must not free
                result=sparse_feature_matrix[real_num];
                result.do_free=false;
                return result;
            } 
            else
            {
                result.do_free=false;

                if (feature_cache)
                {
                    // cache hit: entry stays locked until
                    // free_sparse_feature_vector() unlocks it
                    result.features=feature_cache->lock_entry(num);

                    if (result.features)
                        return result;
                    else
                    {
                        // cache miss: reserve a slot to compute into
                        result.features=feature_cache->set_entry(num);
                    }
                }

                // no cache slot available: the computed buffer belongs to
                // the caller and must be freed
                if (!result.features)
                    result.do_free=true;

                result.features=compute_sparse_feature_vector(num,
                    result.num_feat_entries, result.features);


                if (get_num_preprocessors())
                {
                    int32_t tmp_len=result.num_feat_entries;
                    SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
                    SGSparseVectorEntry<ST>* tmp_feat_after = NULL;

                    // NOTE(review): the actual preprocessor application below
                    // is commented out, so tmp_feat_after stays NULL and the
                    // memcpy after the loop would read from NULL if this
                    // branch is ever reached with preprocessors set — confirm
                    // before enabling preprocessors on cached features.
                    for (int32_t i=0; i<get_num_preprocessors(); i++)
                    {
                        //tmp_feat_after=((CSparsePreprocessor<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);

                        if (i!=0)   // delete intermediate vectors, but keep the first one (the original features)
                            SG_FREE(tmp_feat_before);
                        tmp_feat_before=tmp_feat_after;
                    }

                    memcpy(result.features, tmp_feat_after,
                            sizeof(SGSparseVectorEntry<ST>)*tmp_len);

                    SG_FREE(tmp_feat_after);
                    result.num_feat_entries=tmp_len ;
                    SG_DEBUG( "len: %d len2: %d\n", result.num_feat_entries, num_features);
                }
                result.vec_index=num;
                return result ;
            }
        }
00391 
00392 
00403         static ST sparse_dot(ST alpha, SGSparseVectorEntry<ST>* avec, int32_t alen, SGSparseVectorEntry<ST>* bvec, int32_t blen)
00404         {
00405             ST result=0;
00406 
00407             //result remains zero when one of the vectors is non existent
00408             if (avec && bvec)
00409             {
00410                 if (alen<=blen)
00411                 {
00412                     int32_t j=0;
00413                     for (int32_t i=0; i<alen; i++)
00414                     {
00415                         int32_t a_feat_idx=avec[i].feat_index;
00416 
00417                         while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00418                             j++;
00419 
00420                         if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00421                         {
00422                             result+= avec[i].entry * bvec[j].entry;
00423                             j++;
00424                         }
00425                     }
00426                 }
00427                 else
00428                 {
00429                     int32_t j=0;
00430                     for (int32_t i=0; i<blen; i++)
00431                     {
00432                         int32_t b_feat_idx=bvec[i].feat_index;
00433 
00434                         while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00435                             j++;
00436 
00437                         if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00438                         {
00439                             result+= bvec[i].entry * avec[j].entry;
00440                             j++;
00441                         }
00442                     }
00443                 }
00444 
00445                 result*=alpha;
00446             }
00447 
00448             return result;
00449         }
00450 
00463         ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00464         {
00465             ASSERT(vec);
00466             ASSERT(dim==num_features);
00467             ST result=b;
00468 
00469             SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00470 
00471             if (sv.features)
00472             {
00473                 for (int32_t i=0; i<sv.num_feat_entries; i++)
00474                 {
00475                     result+=alpha*vec[sv.features[i].feat_index]
00476                         *sv.features[i].entry;
00477                 }
00478             }
00479 
00480             free_sparse_feature_vector(sv, num);
00481             return result;
00482         }
00483 
00495         void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false)
00496         {
00497             ASSERT(vec);
00498             if (dim!=num_features)
00499             {
00500                 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00501                         dim, num_features);
00502             }
00503 
00504             SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00505 
00506             if (sv.features)
00507             {
00508                 if (abs_val)
00509                 {
00510                     for (int32_t i=0; i<sv.num_feat_entries; i++)
00511                     {
00512                         vec[sv.features[i].feat_index]+=alpha
00513                             *CMath::abs(sv.features[i].entry);
00514                     }
00515                 }
00516                 else
00517                 {
00518                     for (int32_t i=0; i<sv.num_feat_entries; i++)
00519                     {
00520                         vec[sv.features[i].feat_index]+=alpha
00521                                 *sv.features[i].entry;
00522                     }
00523                 }
00524             }
00525 
00526             free_sparse_feature_vector(sv, num);
00527         }
00528 
00536         void free_sparse_feature_vector(SGSparseVector<ST> vec, int32_t num)
00537         {
00538             if (feature_cache)
00539                 feature_cache->unlock_entry(subset_idx_conversion(num));
00540 
00541             vec.free_vector();
00542         } 
00543 
00553         SGSparseVector<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00554         {
00555             if (m_subset)
00556                 SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
00557 
00558             num_feat=num_features;
00559             num_vec=num_vectors;
00560 
00561             return sparse_feature_matrix;
00562         }
00563 
00571         SGSparseMatrix<ST> get_sparse_feature_matrix()
00572         {
00573             if (m_subset)
00574                 SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
00575 
00576             SGSparseMatrix<ST> sm;
00577             sm.sparse_matrix=get_sparse_feature_matrix(sm.num_features, sm.num_vectors);
00578             return sm;
00579         }
00580 
00586         static void clean_tsparse(SGSparseVector<ST>* sfm, int32_t num_vec)
00587         {
00588             if (sfm)
00589             {
00590                 for (int32_t i=0; i<num_vec; i++)
00591                     SG_FREE(sfm[i].features);
00592 
00593                 SG_FREE(sfm);
00594             }
00595         }
00596 
00603         CSparseFeatures<ST>* get_transposed()
00604         {
00605             int32_t num_feat;
00606             int32_t num_vec;
00607             SGSparseVector<ST>* s=get_transposed(num_feat, num_vec);
00608             return new CSparseFeatures<ST>(s, num_feat, num_vec);
00609         }
00610 
        /** compute the transpose of the sparse matrix (subset-aware)
         *
         * Two passes: first a histogram of entries per feature determines
         * the length of each transposed vector, then the entries are
         * scattered into the freshly allocated vectors.
         *
         * @param num_feat set to the transposed feature count
         *        (= get_num_vectors())
         * @param num_vec set to the transposed vector count (= num_features)
         * @return newly allocated array of sparse vectors (caller frees via
         *         clean_tsparse())
         */
        SGSparseVector<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
        {
            num_feat=get_num_vectors();
            num_vec=num_features;

            int32_t* hist=SG_MALLOC(int32_t, num_features);
            memset(hist, 0, sizeof(int32_t)*num_features);

            // pass 1: count the lengths of the future feature vectors
            for (int32_t v=0; v<num_feat; v++)
            {
                SGSparseVector<ST> sv=get_sparse_feature_vector(v);

                for (int32_t i=0; i<sv.num_feat_entries; i++)
                    hist[sv.features[i].feat_index]++;

                free_sparse_feature_vector(sv, v);
            }

            // allocate room for the future feature vectors
            SGSparseVector<ST>* sfm=SG_MALLOC(SGSparseVector<ST>, num_vec);
            for (int32_t v=0; v<num_vec; v++)
            {
                sfm[v].features= SG_MALLOC(SGSparseVectorEntry<ST>, hist[v]);
                sfm[v].num_feat_entries=hist[v];
                sfm[v].vec_index=v;
            }

            // pass 2: fill the future feature vectors with content
            // (hist is reused as a per-row write cursor)
            memset(hist,0,sizeof(int32_t)*num_features);
            for (int32_t v=0; v<num_feat; v++)
            {
                SGSparseVector<ST> sv=get_sparse_feature_vector(v);

                for (int32_t i=0; i<sv.num_feat_entries; i++)
                {
                    int32_t vidx=sv.features[i].feat_index;
                    int32_t fidx=v;
                    sfm[vidx].features[hist[vidx]].feat_index=fidx;
                    sfm[vidx].features[hist[vidx]].entry=sv.features[i].entry;
                    hist[vidx]++;
                }

                free_sparse_feature_vector(sv, v);
            }

            SG_FREE(hist);
            return sfm;
        }
00671 
00679         void set_sparse_feature_matrix(SGSparseMatrix<ST> sm)
00680         {
00681             if (m_subset)
00682                 SG_ERROR("set_sparse_feature_matrix() not allowed with subset\n");
00683 
00684 
00685             free_sparse_feature_matrix();
00686             sm.own_matrix();
00687 
00688             sparse_feature_matrix=sm.sparse_matrix;
00689             num_features=sm.num_features;
00690             num_vectors=sm.num_vectors;
00691         }
00692 
00699         SGMatrix<ST> get_full_feature_matrix()
00700         {
00701             SGMatrix<ST> full;
00702 
00703             SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00704             full.num_rows=num_features;
00705             full.num_cols=get_num_vectors();
00706             full.do_free=true;
00707             full.matrix=SG_MALLOC(ST, int64_t(num_features)*get_num_vectors());
00708 
00709             memset(full.matrix, 0, size_t(num_features)*size_t(get_num_vectors())*sizeof(ST));
00710 
00711             for (int32_t v=0; v<full.num_cols; v++)
00712             {
00713                 SGSparseVector<ST> current=
00714                     sparse_feature_matrix[subset_idx_conversion(v)];
00715 
00716                 for (int32_t f=0; f<current.num_feat_entries; f++)
00717                 {
00718                     int64_t offs=(current.vec_index*num_features)
00719                             +current.features[f].feat_index;
00720 
00721                     full.matrix[offs]=current.features[f].entry;
00722                 }
00723             }
00724 
00725             return full;
00726         }
00727 
00737         virtual bool set_full_feature_matrix(SGMatrix<ST> full)
00738         {
00739             remove_subset();
00740 
00741             ST* src=full.matrix;
00742             int32_t num_feat=full.num_rows;
00743             int32_t num_vec=full.num_cols;
00744 
00745             free_sparse_feature_matrix();
00746             bool result=true;
00747             num_features=num_feat;
00748             num_vectors=num_vec;
00749 
00750             SG_INFO("converting dense feature matrix to sparse one\n");
00751             int32_t* num_feat_entries=SG_MALLOC(int, num_vectors);
00752 
00753             if (num_feat_entries)
00754             {
00755                 int64_t num_total_entries=0;
00756 
00757                 // count nr of non sparse features
00758                 for (int32_t i=0; i< num_vec; i++)
00759                 {
00760                     num_feat_entries[i]=0;
00761                     for (int32_t j=0; j< num_feat; j++)
00762                     {
00763                         if (src[i*((int64_t) num_feat) + j] != 0)
00764                             num_feat_entries[i]++;
00765                     }
00766                 }
00767 
00768                 if (num_vec>0)
00769                 {
00770                     sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vec);
00771 
00772                     if (sparse_feature_matrix)
00773                     {
00774                         for (int32_t i=0; i< num_vec; i++)
00775                         {
00776                             sparse_feature_matrix[i].vec_index=i;
00777                             sparse_feature_matrix[i].num_feat_entries=0;
00778                             sparse_feature_matrix[i].features= NULL;
00779 
00780                             if (num_feat_entries[i]>0)
00781                             {
00782                                 sparse_feature_matrix[i].features= SG_MALLOC(SGSparseVectorEntry<ST>, num_feat_entries[i]);
00783 
00784                                 if (!sparse_feature_matrix[i].features)
00785                                 {
00786                                     SG_INFO( "allocation of features failed\n");
00787                                     return false;
00788                                 }
00789 
00790                                 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00791                                 int32_t sparse_feat_idx=0;
00792 
00793                                 for (int32_t j=0; j< num_feat; j++)
00794                                 {
00795                                     int64_t pos= i*num_feat + j;
00796 
00797                                     if (src[pos] != 0)
00798                                     {
00799                                         sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00800                                         sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00801                                         sparse_feat_idx++;
00802                                         num_total_entries++;
00803                                     }
00804                                 }
00805                             }
00806                         }
00807                     }
00808                     else
00809                     {
00810                         SG_ERROR( "allocation of sparse feature matrix failed\n");
00811                         result=false;
00812                     }
00813 
00814                     SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00815                             num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
00816                 }
00817                 else
00818                 {
00819                     SG_ERROR( "huh ? zero size matrix given ?\n");
00820                     result=false;
00821                 }
00822             }
00823             SG_FREE(num_feat_entries);
00824             return result;
00825         }
00826 
00834         virtual bool apply_preprocessor(bool force_preprocessing=false)
00835         {
00836             SG_INFO( "force: %d\n", force_preprocessing);
00837 
00838             if ( sparse_feature_matrix && get_num_preprocessors() )
00839             {
00840                 for (int32_t i=0; i<get_num_preprocessors(); i++)
00841                 {
00842                     if ( (!is_preprocessed(i) || force_preprocessing) )
00843                     {
00844                         set_preprocessed(i);
00845                         SG_INFO( "preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
00846                         if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL)
00847                             return false;
00848                     }
00849                     return true;
00850                 }
00851                 return true;
00852             }
00853             else
00854             {
00855                 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00856                 return false;
00857             }
00858         }
00859 
00864         virtual int32_t get_size() { return sizeof(ST); }
00865 
00873         bool obtain_from_simple(CSimpleFeatures<ST>* sf)
00874         {
00875             SGMatrix<ST> fm=sf->get_feature_matrix();
00876             ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0);
00877 
00878             return set_full_feature_matrix(fm);
00879         }
00880 
00885         virtual inline int32_t  get_num_vectors() const
00886         {
00887             return m_subset ? m_subset->get_size() : num_vectors;
00888         }
00889 
00894         inline int32_t  get_num_features() { return num_features; }
00895 
00907         inline int32_t set_num_features(int32_t num)
00908         {
00909             int32_t n=num_features;
00910             ASSERT(n<=num);
00911             num_features=num;
00912             return num_features;
00913         }
00914 
00919         inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }
00920 
00925         inline virtual EFeatureType get_feature_type();
00926 
00934         void free_feature_vector(SGSparseVector<ST> vec, int32_t num)
00935         {
00936             if (feature_cache)
00937                 feature_cache->unlock_entry(subset_idx_conversion(num));
00938 
00939             vec.free_vector();
00940         }
00941 
00946         int64_t get_num_nonzero_entries()
00947         {
00948             int64_t num=0;
00949             index_t num_vec=get_num_vectors();
00950             for (int32_t i=0; i<num_vec; i++)
00951                 num+=sparse_feature_matrix[subset_idx_conversion(i)].num_feat_entries;
00952 
00953             return num;
00954         }
00955 
00963         float64_t* compute_squared(float64_t* sq)
00964         {
00965             ASSERT(sq);
00966 
00967             index_t num_vec=get_num_vectors();
00968             for (int32_t i=0; i<num_vec; i++)
00969             {
00970                 sq[i]=0;
00971                 SGSparseVector<ST> vec=get_sparse_feature_vector(i);
00972 
00973                 for (int32_t j=0; j<vec.num_feat_entries; j++)
00974                     sq[i]+=vec.features[j].entry*vec.features[j].entry;
00975 
00976                 free_feature_vector(vec, i);
00977             }
00978 
00979             return sq;
00980         }
00981 
        /** compute ||a - b||^2 = ||a||^2 + ||b||^2 - 2<a,b>
         *
         * Uses the precomputed squared norms and a merge-join over the two
         * sorted sparse vectors for the cross term (iterating over the
         * shorter vector, scanning the longer one).
         *
         * @param lhs features holding vector idx_a
         * @param sq_lhs precomputed squared norms of lhs (see
         *        compute_squared())
         * @param idx_a index into lhs
         * @param rhs features holding vector idx_b
         * @param sq_rhs precomputed squared norms of rhs
         * @param idx_b index into rhs
         * @return absolute value of the squared distance (abs guards
         *         against tiny negative results from rounding)
         */
        float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
        {
            int32_t i,j;
            ASSERT(lhs);
            ASSERT(rhs);

            SGSparseVector<float64_t> avec=lhs->get_sparse_feature_vector(idx_a);
            SGSparseVector<float64_t> bvec=rhs->get_sparse_feature_vector(idx_b);
            ASSERT(avec.features);
            ASSERT(bvec.features);

            // start from ||a||^2 + ||b||^2, then subtract 2<a,b>
            float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];

            if (avec.num_feat_entries<=bvec.num_feat_entries)
            {
                j=0;
                for (i=0; i<avec.num_feat_entries; i++)
                {
                    int32_t a_feat_idx=avec.features[i].feat_index;

                    // advance b's cursor up to a's current index
                    while ((j<bvec.num_feat_entries)
                            &&(bvec.features[j].feat_index<a_feat_idx))
                        j++;

                    if ((j<bvec.num_feat_entries)
                            &&(bvec.features[j].feat_index==a_feat_idx))
                    {
                        result-=2*(avec.features[i].entry*bvec.features[j].entry);
                        j++;
                    }
                }
            }
            else
            {
                // symmetric case: iterate over b, scan a
                j=0;
                for (i=0; i<bvec.num_feat_entries; i++)
                {
                    int32_t b_feat_idx=bvec.features[i].feat_index;

                    while ((j<avec.num_feat_entries)
                            &&(avec.features[j].feat_index<b_feat_idx))
                        j++;

                    if ((j<avec.num_feat_entries)
                            &&(avec.features[j].feat_index==b_feat_idx))
                    {
                        result-=2*(bvec.features[i].entry*avec.features[j].entry);
                        j++;
                    }
                }
            }

            ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a);
            ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b);

            return CMath::abs(result);
        }
01053 
01060         void load(CFile* loader);
01061 
01068         void save(CFile* writer);
01069 
01079         CLabels* load_svmlight_file(char* fname, bool do_sort_features=true)
01080         {
01081             remove_subset();
01082 
01083             CLabels* lab=NULL;
01084 
01085             size_t blocksize=1024*1024;
01086             size_t required_blocksize=blocksize;
01087             uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
01088             FILE* f=fopen(fname, "ro");
01089 
01090             if (f)
01091             {
01092                 free_sparse_feature_matrix();
01093                 num_vectors=0;
01094                 num_features=0;
01095 
01096                 SG_INFO("counting line numbers in file %s\n", fname);
01097                 size_t sz=blocksize;
01098                 size_t block_offs=0;
01099                 size_t old_block_offs=0;
01100                 fseek(f, 0, SEEK_END);
01101                 size_t fsize=ftell(f);
01102                 rewind(f);
01103 
01104                 while (sz == blocksize)
01105                 {
01106                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01107                     bool contains_cr=false;
01108                     for (size_t i=0; i<sz; i++)
01109                     {
01110                         block_offs++;
01111                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01112                         {
01113                             num_vectors++;
01114                             contains_cr=true;
01115                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
01116                             old_block_offs=block_offs;
01117                         }
01118                     }
01119                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
01120                 }
01121 
01122                 SG_INFO("found %d feature vectors\n", num_vectors);
01123                 SG_FREE(dummy);
01124                 blocksize=required_blocksize;
01125                 dummy = SG_MALLOC(uint8_t, blocksize+1); //allow setting of '\0' at EOL
01126 
01127                 lab=new CLabels(num_vectors);
01128                 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
01129 
01130                 rewind(f);
01131                 sz=blocksize;
01132                 int32_t lines=0;
01133                 while (sz == blocksize)
01134                 {
01135                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01136 
01137                     size_t old_sz=0;
01138                     for (size_t i=0; i<sz; i++)
01139                     {
01140                         if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
01141                         {
01142                             size_t len=i-old_sz+1;
01143                             uint8_t* data=&dummy[old_sz];
01144 
01145                             for (int32_t j=0; j<len; j++)
01146                                 dummy[j]=data[j];
01147 
01148                             sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
01149                             i=0;
01150                             old_sz=0;
01151                             sz+=len;
01152                         }
01153 
01154                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01155                         {
01156 
01157                             size_t len=i-old_sz;
01158                             uint8_t* data=&dummy[old_sz];
01159 
01160                             int32_t dims=0;
01161                             for (int32_t j=0; j<len; j++)
01162                             {
01163                                 if (data[j]==':')
01164                                     dims++;
01165                             }
01166 
01167                             if (dims<=0)
01168                             {
01169                                 SG_ERROR("Error in line %d - number of"
01170                                         " dimensions is %d line is %d characters"
01171                                         " long\n line_content:'%.*s'\n", lines,
01172                                         dims, len, len, (const char*) data);
01173                             }
01174 
01175                             SGSparseVectorEntry<ST>* feat=SG_MALLOC(SGSparseVectorEntry<ST>, dims);
01176                             int32_t j=0;
01177                             for (; j<len; j++)
01178                             {
01179                                 if (data[j]==' ')
01180                                 {
01181                                     data[j]='\0';
01182 
01183                                     lab->set_label(lines, atof((const char*) data));
01184                                     break;
01185                                 }
01186                             }
01187 
01188                             int32_t d=0;
01189                             j++;
01190                             uint8_t* start=&data[j];
01191                             for (; j<len; j++)
01192                             {
01193                                 if (data[j]==':')
01194                                 {
01195                                     data[j]='\0';
01196 
01197                                     feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
01198                                     num_features=CMath::max(num_features, feat[d].feat_index+1);
01199 
01200                                     j++;
01201                                     start=&data[j];
01202                                     for (; j<len; j++)
01203                                     {
01204                                         if (data[j]==' ' || data[j]=='\n')
01205                                         {
01206                                             data[j]='\0';
01207                                             feat[d].entry=(ST) atof((const char*) start);
01208                                             d++;
01209                                             break;
01210                                         }
01211                                     }
01212 
01213                                     if (j==len)
01214                                     {
01215                                         data[j]='\0';
01216                                         feat[dims-1].entry=(ST) atof((const char*) start);
01217                                     }
01218 
01219                                     j++;
01220                                     start=&data[j];
01221                                 }
01222                             }
01223 
01224                             sparse_feature_matrix[lines].vec_index=lines;
01225                             sparse_feature_matrix[lines].num_feat_entries=dims;
01226                             sparse_feature_matrix[lines].features=feat;
01227 
01228                             old_sz=i+1;
01229                             lines++;
01230                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01231                         }
01232                     }
01233                 }
01234                 SG_INFO("file successfully read\n");
01235                 fclose(f);
01236             }
01237 
01238             SG_FREE(dummy);
01239 
01240             if (do_sort_features)
01241                 sort_features();
01242 
01243             return lab;
01244         }
01245 
01251         void sort_features()
01252         {
01253             if (m_subset)
01254                 SG_ERROR("sort_features() not allowed with subset\n");
01255 
01256             ASSERT(get_num_preprocessors()==0);
01257 
01258             if (!sparse_feature_matrix)
01259                 SG_ERROR("Requires sparse feature matrix to be available in-memory\n");
01260 
01261             for (int32_t i=0; i<num_vectors; i++)
01262             {
01263                 int32_t len=sparse_feature_matrix[i].num_feat_entries;
01264 
01265                 if (!len)
01266                     continue;
01267 
01268                 SGSparseVectorEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
01269                 int32_t* feat_idx=SG_MALLOC(int32_t, len);
01270                 int32_t* orig_idx=SG_MALLOC(int32_t, len);
01271 
01272                 for (int j=0; j<len; j++)
01273                 {
01274                     feat_idx[j]=sf_orig[j].feat_index;
01275                     orig_idx[j]=j;
01276                 }
01277 
01278                 CMath::qsort_index(feat_idx, orig_idx, len);
01279 
01280                 SGSparseVectorEntry<ST>* sf_new= SG_MALLOC(SGSparseVectorEntry<ST>, len);
01281                 for (int j=0; j<len; j++)
01282                     sf_new[j]=sf_orig[orig_idx[j]];
01283 
01284                 sparse_feature_matrix[i].features=sf_new;
01285 
01286                 // sanity check
01287                 for (int j=0; j<len-1; j++)
01288                     ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
01289 
01290                 SG_FREE(orig_idx);
01291                 SG_FREE(feat_idx);
01292                 SG_FREE(sf_orig);
01293             }
01294         }
01295 
01304         bool write_svmlight_file(char* fname, CLabels* label)
01305         {
01306             if (m_subset)
01307                 SG_ERROR("write_svmlight_file() not allowed with subset\n");
01308 
01309             ASSERT(label);
01310             int32_t num=label->get_num_labels();
01311             ASSERT(num>0);
01312             ASSERT(num==num_vectors);
01313 
01314             FILE* f=fopen(fname, "wb");
01315 
01316             if (f)
01317             {
01318                 for (int32_t i=0; i<num; i++)
01319                 {
01320                     fprintf(f, "%d ", (int32_t) label->get_int_label(i));
01321 
01322                     SGSparseVectorEntry<ST>* vec = sparse_feature_matrix[i].features;
01323                     int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
01324 
01325                     for (int32_t j=0; j<num_feat; j++)
01326                     {
01327                         if (j<num_feat-1)
01328                             fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01329                         else
01330                             fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01331                     }
01332                 }
01333 
01334                 fclose(f);
01335                 return true;
01336             }
01337             return false;
01338         }
01339 
        /** obtain the dimensionality of the feature space
         *
         * @return dimensionality, i.e. the current total number of features
         */
        virtual int32_t get_dim_feature_space() const
        {
            return num_features;
        }
01351 
01361         virtual float64_t dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
01362         {
01363             ASSERT(df);
01364             ASSERT(df->get_feature_type() == get_feature_type());
01365             ASSERT(df->get_feature_class() == get_feature_class());
01366             CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df;
01367 
01368             SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1);
01369             SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2);
01370 
01371             float64_t result=sparse_dot(1, avec.features, avec.num_feat_entries,
01372                 bvec.features, bvec.num_feat_entries);
01373 
01374             free_sparse_feature_vector(avec, vec_idx1);
01375             sf->free_sparse_feature_vector(bvec, vec_idx2);
01376 
01377             return result;
01378         }
01379 
01388         virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01389         {
01390             ASSERT(vec2);
01391             if (vec2_len!=num_features)
01392             {
01393                 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
01394                         vec2_len, num_features);
01395             }
01396             float64_t result=0;
01397 
01398             SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1);
01399 
01400             if (sv.features)
01401             {
01402                 for (int32_t i=0; i<sv.num_feat_entries; i++)
01403                     result+=vec2[sv.features[i].feat_index]*sv.features[i].entry;
01404             }
01405 
01406             free_sparse_feature_vector(sv, vec_idx1);
01407 
01408             return result;
01409         }
01410 
        #ifndef DOXYGEN_SHOULD_SKIP_THIS
        /** iterator state used by get_feature_iterator()/get_next_feature() */
        struct sparse_feature_iterator
        {
            /** the sparse vector being iterated over */
            SGSparseVector<ST> sv;

            /** index of the next entry of sv to return */
            int32_t index;

            /** print the iterator state (for debugging) */
            void print_info()
            {
                SG_SPRINT("sv=%p, vidx=%d, num_feat_entries=%d, index=%d\n",
                        sv.features, sv.vec_index, sv.num_feat_entries, index);
            }
        };
        #endif
01429 
01441         virtual void* get_feature_iterator(int32_t vector_index)
01442         {
01443             if (vector_index>=get_num_vectors())
01444             {
01445                 SG_ERROR("Index out of bounds (number of vectors %d, you "
01446                         "requested %d)\n", get_num_vectors(), vector_index);
01447             }
01448 
01449             if (!sparse_feature_matrix)
01450                 SG_ERROR("Requires a in-memory feature matrix\n");
01451 
01452             sparse_feature_iterator* it=SG_MALLOC(sparse_feature_iterator, 1);
01453             it->sv=get_sparse_feature_vector(vector_index);
01454             it->index=0;
01455 
01456             return it;
01457         }
01458 
01469         virtual bool get_next_feature(int32_t& index, float64_t& value, void* iterator)
01470         {
01471             sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01472             if (!it || it->index>=it->sv.num_feat_entries)
01473                 return false;
01474 
01475             int32_t i=it->index++;
01476 
01477             index=it->sv.features[i].feat_index;
01478             value=(float64_t) it->sv.features[i].entry;
01479 
01480             return true;
01481         }
01482 
01488         virtual void free_feature_iterator(void* iterator)
01489         {
01490             if (!iterator)
01491                 return;
01492 
01493             sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01494             free_sparse_feature_vector(it->sv, it->sv.vec_index);
01495             SG_FREE(it);
01496         }
01497 
01504         virtual CFeatures* copy_subset(SGVector<index_t> indices)
01505         {
01506             SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(indices.vlen,
01507                 get_dim_feature_space());
01508 
01509             for (index_t i=0; i<indices.vlen; ++i)
01510             {
01511                 /* index to copy */
01512                 index_t index=indices.vector[i];
01513 
01514                 /* copy sparse vector TODO THINK ABOUT VECTOR INDEX (i or vec.index*/
01515                 SGSparseVector<ST> current=get_sparse_feature_vector(index);
01516                 matrix_copy.sparse_matrix[i]=SGSparseVector<ST>(
01517                     current.num_feat_entries, current.vec_index);
01518 
01519                 /* copy entries */
01520                 memcpy(matrix_copy.sparse_matrix[i].features, current.features,
01521                     sizeof(SGSparseVectorEntry<ST>)*current.num_feat_entries);
01522 
01523                 free_sparse_feature_vector(current, index);
01524             }
01525 
01526             return new CSparseFeatures<ST>(matrix_copy);
01527         }
01528 
01530         inline virtual const char* get_name() const { return "SparseFeatures"; }
01531 
01532     protected:
        /** compute the sparse feature vector for sample num; if target is
         * set the vector is written to target; len is returned by reference
         *
         * NOT IMPLEMENTED in this base class: always reports len=0 and
         * returns NULL; derived classes may override to compute vectors
         * on the fly
         *
         * @param num index of the feature vector
         * @param len written with the number of entries (always 0 here)
         * @param target optional preallocated target buffer
         * @return the computed entries (always NULL here)
         */
        virtual SGSparseVectorEntry<ST>* compute_sparse_feature_vector(int32_t num,
            int32_t& len, SGSparseVectorEntry<ST>* target=NULL)
        {
            SG_NOTIMPLEMENTED;

            len=0;
            return NULL;
        }
01551 
    private:
        /** set the generic element type and register the members that are
         * serialized (sparse_feature_matrix with its length, num_features) */
        void init(void)
        {
            set_generic<ST>();

            m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
                    "sparse_feature_matrix",
                    "Array of sparse vectors.");
            m_parameters->add(&num_features, "num_features",
                    "Total number of features.");
        }
01563 
01564 
    protected:

        /** total number of feature vectors */
        int32_t num_vectors;

        /** total number of features (dimensionality of the feature space) */
        int32_t num_features;

        /** array of num_vectors sparse vectors */
        SGSparseVector<ST>* sparse_feature_matrix;

        /** cache for feature vectors */
        CCache< SGSparseVectorEntry<ST> >* feature_cache;
};
01579 
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/** generate template specializations mapping each element type sg_type to
 * its EFeatureType constant f_type */
#define GET_FEATURE_TYPE(sg_type, f_type)                                   \
template<> inline EFeatureType CSparseFeatures<sg_type>::get_feature_type() \
{                                                                           \
    return f_type;                                                          \
}
GET_FEATURE_TYPE(bool, F_BOOL)
GET_FEATURE_TYPE(char, F_CHAR)
GET_FEATURE_TYPE(uint8_t, F_BYTE)
GET_FEATURE_TYPE(int8_t, F_BYTE)
GET_FEATURE_TYPE(int16_t, F_SHORT)
GET_FEATURE_TYPE(uint16_t, F_WORD)
GET_FEATURE_TYPE(int32_t, F_INT)
GET_FEATURE_TYPE(uint32_t, F_UINT)
GET_FEATURE_TYPE(int64_t, F_LONG)
GET_FEATURE_TYPE(uint64_t, F_ULONG)
GET_FEATURE_TYPE(float32_t, F_SHORTREAL)
GET_FEATURE_TYPE(float64_t, F_DREAL)
GET_FEATURE_TYPE(floatmax_t, F_LONGREAL)
#undef GET_FEATURE_TYPE
01600 
/** generate load() specializations: read a sparse matrix from the CFile via
 * the type-specific getter fname and install it as the feature matrix; any
 * active subset is removed first */
#define LOAD(fname, sg_type)                                            \
template<> inline void CSparseFeatures<sg_type>::load(CFile* loader)    \
{                                                                       \
    remove_subset();                                                    \
    SG_SET_LOCALE_C;                                                    \
    ASSERT(loader);                                                     \
    SGSparseVector<sg_type>* matrix=NULL;                                       \
    int32_t num_feat=0;                                                 \
    int32_t num_vec=0;                                                  \
    loader->fname(matrix, num_feat, num_vec);                           \
    set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec));              \
    SG_RESET_LOCALE;                                                    \
}
LOAD(get_sparse_matrix, bool)
LOAD(get_sparse_matrix, char)
LOAD(get_sparse_matrix, uint8_t)
LOAD(get_int8_sparsematrix, int8_t)
LOAD(get_sparse_matrix, int16_t)
LOAD(get_sparse_matrix, uint16_t)
LOAD(get_sparse_matrix, int32_t)
LOAD(get_uint_sparsematrix, uint32_t)
LOAD(get_long_sparsematrix, int64_t)
LOAD(get_ulong_sparsematrix, uint64_t)
LOAD(get_sparse_matrix, float32_t)
LOAD(get_sparse_matrix, float64_t)
LOAD(get_longreal_sparsematrix, floatmax_t)
#undef LOAD
01628 
/** generate save() specializations: write the sparse feature matrix to the
 * CFile via the type-specific setter fname; not allowed with subset */
#define WRITE(fname, sg_type)                                           \
template<> inline void CSparseFeatures<sg_type>::save(CFile* writer)    \
{                                                                       \
    if (m_subset)                                                       \
        SG_ERROR("save() not allowed with subset\n");                   \
    SG_SET_LOCALE_C;                                                    \
    ASSERT(writer);                                                     \
    writer->fname(sparse_feature_matrix, num_features, num_vectors);    \
    SG_RESET_LOCALE;                                                    \
}
WRITE(set_sparse_matrix, bool)
WRITE(set_sparse_matrix, char)
WRITE(set_sparse_matrix, uint8_t)
WRITE(set_int8_sparsematrix, int8_t)
WRITE(set_sparse_matrix, int16_t)
WRITE(set_sparse_matrix, uint16_t)
WRITE(set_sparse_matrix, int32_t)
WRITE(set_uint_sparsematrix, uint32_t)
WRITE(set_long_sparsematrix, int64_t)
WRITE(set_ulong_sparsematrix, uint64_t)
WRITE(set_sparse_matrix, float32_t)
WRITE(set_sparse_matrix, float64_t)
WRITE(set_longreal_sparsematrix, floatmax_t)
#undef WRITE
#endif // DOXYGEN_SHOULD_SKIP_THIS
01654 }
01655 #endif /* _SPARSEFEATURES__H__ */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation