00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _SPARSEFEATURES__H__
00014 #define _SPARSEFEATURES__H__
00015
00016 #include <string.h>
00017 #include <stdlib.h>
00018
00019 #include "lib/common.h"
00020 #include "lib/Mathematics.h"
00021 #include "lib/Cache.h"
00022 #include "lib/io.h"
00023 #include "lib/Cache.h"
00024 #include "lib/File.h"
00025 #include "lib/DataType.h"
00026
00027 #include "features/Labels.h"
00028 #include "features/Features.h"
00029 #include "features/DotFeatures.h"
00030 #include "features/SimpleFeatures.h"
00031 #include "preproc/SparsePreProc.h"
00032
00033 namespace shogun
00034 {
00035
00036 class CFile;
00037 class CLabels;
00038 class CFeatures;
00039 class CDotFeatures;
00040 template <class ST> class CSimpleFeatures;
00041 template <class ST> class CSparsePreProc;
00042
00055 template <class ST> class CSparseFeatures : public CDotFeatures
00056 {
/** Register this object's members with the parameter framework
 * (used by Shogun's serialization machinery). */
void init(void) {
	set_generic<ST>();

	// the sparse matrix is serialized together with its vector count
	m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
			"sparse_feature_matrix",
			"Array of sparse vectors.");
	m_parameters->add(&num_features, "num_features",
			"Total number of features.");
}
00066
00067 public:
/** default constructor
 * @param size cache size (number of feature vectors to cache)
 */
CSparseFeatures(int32_t size=0)
: CDotFeatures(size), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{ init(); }

/** construct from an existing array of sparse vectors
 * @param src array of sparse vectors
 * @param num_feat number of features per vector
 * @param num_vec number of vectors
 * @param copy if false, take ownership of src; if true, deep-copy it
 */
CSparseFeatures(TSparse<ST>* src, int32_t num_feat, int32_t num_vec, bool copy=false)
: CDotFeatures(0), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{
	init();

	if (!copy)
		set_sparse_feature_matrix(src, num_feat, num_vec);
	else
	{
		// deep copy: duplicate the vector array and every entry array
		sparse_feature_matrix = new TSparse<ST>[num_vec];
		memcpy(sparse_feature_matrix, src, sizeof(TSparse<ST>)*num_vec);
		for (int32_t i=0; i< num_vec; i++)
		{
			sparse_feature_matrix[i].features = new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
			memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);

		}
	}
	// NOTE(review): num_features/num_vectors are only set on the !copy
	// path (via set_sparse_feature_matrix) — on the copy path they stay
	// 0; TODO confirm whether callers rely on this
}

/** construct a sparse representation from a dense feature matrix
 * @param src dense matrix (column = one vector of num_feat entries)
 * @param num_feat number of features per vector
 * @param num_vec number of vectors
 */
CSparseFeatures(ST* src, int32_t num_feat, int32_t num_vec)
: CDotFeatures(0), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{
	init();

	set_full_feature_matrix(src, num_feat, num_vec);
}
00121
00123 CSparseFeatures(const CSparseFeatures & orig)
00124 : CDotFeatures(orig), num_vectors(orig.num_vectors),
00125 num_features(orig.num_features),
00126 sparse_feature_matrix(orig.sparse_feature_matrix),
00127 feature_cache(orig.feature_cache)
00128 {
00129 init();
00130
00131 if (orig.sparse_feature_matrix)
00132 {
00133 free_sparse_feature_matrix();
00134 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00135 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00136 for (int32_t i=0; i< num_vectors; i++)
00137 {
00138 sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00139 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00140
00141 }
00142 }
00143 }
00144
/** construct by loading features from a file
 * @param loader file to load features from
 */
CSparseFeatures(CFile* loader)
: CDotFeatures(loader), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{
	init();

	load(loader);
}

/** destructor — frees the matrix and the feature cache */
virtual ~CSparseFeatures()
{
	free_sparse_features();
}
00163
/** free the sparse feature matrix (all entry arrays and the vector
 * array) and reset counts to zero */
void free_sparse_feature_matrix()
{
	clean_tsparse(sparse_feature_matrix, num_vectors);
	sparse_feature_matrix = NULL;
	num_vectors=0;
	num_features=0;
}

/** free the sparse feature matrix and the feature cache */
void free_sparse_features()
{
	free_sparse_feature_matrix();
	delete feature_cache;
	feature_cache = NULL;
}

/** duplicate this feature object (deep copy via copy constructor)
 * @return newly allocated copy; caller takes ownership
 */
virtual CFeatures* duplicate() const
{
	return new CSparseFeatures<ST>(*this);
}
00193
/** get a single feature value
 * @param num index of the feature vector
 * @param index feature (dimension) index
 * @return sum of all entries of vector num whose feat_index equals
 *         index (duplicate indices add up)
 */
ST get_feature(int32_t num, int32_t index)
{
	ASSERT(index>=0 && index<num_features) ;
	ASSERT(num>=0 && num<num_vectors) ;

	bool vfree;
	int32_t num_feat;
	int32_t i;
	TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
	ST ret = 0 ;

	if (sv)
	{
		// accumulate over all entries matching the requested index
		for (i=0; i<num_feat; i++)
			if (sv[i].feat_index==index)
				ret += sv[i].entry ;
	}

	free_sparse_feature_vector(sv, num, vfree);

	return ret ;
}


/** densify one feature vector
 * @param num index of the feature vector
 * @param len returns length of the dense vector (num_features)
 * @return newly allocated dense vector (caller frees with delete[]),
 *         or NULL if the sparse vector was unavailable
 */
ST* get_full_feature_vector(int32_t num, int32_t& len)
{
	bool vfree;
	int32_t num_feat;
	int32_t i;
	len=0;
	TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
	ST* fv=NULL;

	if (sv)
	{
		len=num_features;
		fv=new ST[num_features];

		// zero-fill, then scatter the sparse entries
		for (i=0; i<num_features; i++)
			fv[i]=0;

		for (i=0; i<num_feat; i++)
			fv[sv[i].feat_index]= sv[i].entry;
	}

	free_sparse_feature_vector(sv, num, vfree);

	return fv;
}

/** densify one feature vector (pointer-out variant; buffer is
 * malloc'd, so the caller must free() it)
 * @param dst returns the dense vector
 * @param len returns its length
 * @param num index of the feature vector
 */
void get_full_feature_vector(ST** dst, int32_t* len, int32_t num)
{
	if (num>=num_vectors)
	{
		SG_ERROR("Index out of bounds (number of vectors %d, you "
				"requested %d)\n", num_vectors, num);
	}

	bool vfree;
	int32_t num_feat=0;
	*len=0;
	TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);

	if (sv)
	{
		*len=num_features;
		*dst= (ST*) malloc(sizeof(ST)*num_features);
		memset(*dst, 0, sizeof(ST)*num_features);

		for (int32_t i=0; i<num_feat; i++)
			(*dst)[sv[i].feat_index]= sv[i].entry;
	}

	free_sparse_feature_vector(sv, num, vfree);
}


/** get number of non-zero entries in one feature vector
 * @param num index of the feature vector
 * @return number of stored (non-zero) entries
 */
virtual inline int32_t get_nnz_features_for_vector(int32_t num)
{
	bool vfree;
	int32_t len;
	TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
	free_sparse_feature_vector(sv, num, vfree);
	return len;
}
00305
/** get the sparse feature vector for sample num.
 *
 * If an in-memory matrix exists, a pointer into it is returned and
 * vfree is false (do NOT delete it). Otherwise the vector is fetched
 * from the cache or computed on the fly; vfree tells the caller
 * (via free_sparse_feature_vector) whether it must be deleted.
 *
 * @param num index of the feature vector
 * @param len returns number of entries
 * @param vfree returns whether the caller owns the returned array
 * @return pointer to the entries of vector num
 */
TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree)
{
	ASSERT(num<num_vectors);

	if (sparse_feature_matrix)
	{
		len= sparse_feature_matrix[num].num_feat_entries;
		vfree=false ;
		return sparse_feature_matrix[num].features;
	}
	else
	{
		TSparseEntry<ST>* feat=NULL;
		vfree=false;

		if (feature_cache)
		{
			feat=feature_cache->lock_entry(num);

			if (feat)
				return feat;
			else
			{
				// cache miss: reserve a slot to compute into
				feat=feature_cache->set_entry(num);
			}
		}

		// no cache slot -> computed vector belongs to the caller
		if (!feat)
			vfree=true;

		feat=compute_sparse_feature_vector(num, len, feat);


		if (get_num_preproc())
		{
			int32_t tmp_len=len;
			TSparseEntry<ST>* tmp_feat_before = feat;
			TSparseEntry<ST>* tmp_feat_after = NULL;

			for (int32_t i=0; i<get_num_preproc(); i++)
			{
				// NOTE(review): the call that applies preprocessor i to
				// tmp_feat_before (assigning tmp_feat_after) is absent
				// here; as written tmp_feat_after stays NULL and the
				// memcpy below would dereference NULL if any
				// preprocessor is attached — TODO confirm against the
				// upstream version of this file
				if (i!=0)
					delete[] tmp_feat_before;
				tmp_feat_before=tmp_feat_after;
			}

			memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
			delete[] tmp_feat_after;
			len=tmp_len ;
			SG_DEBUG( "len: %d len2: %d\n", len, num_features);
		}
		return feat ;
	}
}
00372
00373
00384 static ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen)
00385 {
00386 ST result=0;
00387
00388
00389 if (avec && bvec)
00390 {
00391 if (alen<=blen)
00392 {
00393 int32_t j=0;
00394 for (int32_t i=0; i<alen; i++)
00395 {
00396 int32_t a_feat_idx=avec[i].feat_index;
00397
00398 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00399 j++;
00400
00401 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00402 {
00403 result+= avec[i].entry * bvec[j].entry;
00404 j++;
00405 }
00406 }
00407 }
00408 else
00409 {
00410 int32_t j=0;
00411 for (int32_t i=0; i<blen; i++)
00412 {
00413 int32_t b_feat_idx=bvec[i].feat_index;
00414
00415 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00416 j++;
00417
00418 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00419 {
00420 result+= bvec[i].entry * avec[j].entry;
00421 j++;
00422 }
00423 }
00424 }
00425
00426 result*=alpha;
00427 }
00428
00429 return result;
00430 }
00431
/** compute alpha * (vector[num] . vec) + b against a dense vector
 * @param alpha scaling factor
 * @param num index of the sparse feature vector
 * @param vec dense vector of length dim
 * @param dim length of vec (must equal num_features)
 * @param b bias added to the result
 * @return alpha * dot + b
 */
ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
{
	ASSERT(vec);
	ASSERT(dim==num_features);
	ST result=b;

	bool vfree;
	int32_t num_feat;
	TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);

	if (sv)
	{
		// only non-zero positions contribute
		for (int32_t i=0; i<num_feat; i++)
			result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
	}

	free_sparse_feature_vector(sv, num, vfree);
	return result;
}

/** add a sparse feature vector (scaled by alpha) onto a dense vector
 * @param alpha scaling factor
 * @param num index of the sparse feature vector
 * @param vec dense accumulator of length dim (modified in place)
 * @param dim length of vec (must equal num_features)
 * @param abs_val if true, add absolute values of the entries
 */
void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false)
{
	ASSERT(vec);
	if (dim!=num_features)
	{
		SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
				dim, num_features);
	}

	bool vfree;
	int32_t num_feat;
	TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);

	if (sv)
	{
		if (abs_val)
		{
			for (int32_t i=0; i<num_feat; i++)
				vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
		}
		else
		{
			for (int32_t i=0; i<num_feat; i++)
				vec[sv[i].feat_index]+= alpha*sv[i].entry;
		}
	}

	free_sparse_feature_vector(sv, num, vfree);
}

/** release a vector obtained from get_sparse_feature_vector
 * @param feat_vec the vector to release
 * @param num its index (used to unlock the cache entry)
 * @param free whether the caller owns the array (vfree out-param)
 */
void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
{
	if (feature_cache)
		feature_cache->unlock_entry(num);

	if (free)
		delete[] feat_vec ;
}
00515
/** get the sparse feature matrix (returns the internal pointer — no
 * copy, the object keeps ownership)
 * @param num_feat returns number of features
 * @param num_vec returns number of vectors
 * @return internal array of sparse vectors
 */
TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
{
	num_feat=num_features;
	num_vec=num_vectors;

	return sparse_feature_matrix;
}

/** get the sparse feature matrix (pointer-out variant)
 * @param dst returns the internal matrix pointer (not a copy)
 * @param num_feat returns number of features
 * @param num_vec returns number of vectors
 * @param nnz returns total number of non-zero entries
 */
void get_sparse_feature_matrix(TSparse<ST>** dst, int32_t* num_feat,
		int32_t* num_vec, int64_t* nnz)
{
	*nnz=get_num_nonzero_entries();
	*num_feat=num_features;
	*num_vec=num_vectors;
	*dst=sparse_feature_matrix;
}

/** free a TSparse array together with each vector's entry array
 * @param sfm matrix to free (may be NULL)
 * @param num_vec number of vectors in sfm
 */
void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec)
{
	if (sfm)
	{
		for (int32_t i=0; i<num_vec; i++)
			delete[] sfm[i].features;

		delete[] sfm;
	}
}

/** compute a transposed copy of the feature matrix
 * @return new CSparseFeatures object holding the transpose; the new
 *         object takes ownership of the freshly allocated matrix
 */
CSparseFeatures<ST>* get_transposed()
{
	int32_t num_feat;
	int32_t num_vec;
	TSparse<ST>* s=get_transposed(num_feat, num_vec);
	return new CSparseFeatures<ST>(s, num_feat, num_vec);
}
00575
/** compute the transposed sparse matrix in two passes:
 * pass 1 counts entries per (output) row, pass 2 scatters them.
 * @param num_feat returns number of features of the transpose
 *        (== num_vectors of this object)
 * @param num_vec returns number of vectors of the transpose
 *        (== num_features of this object)
 * @return newly allocated transposed matrix; caller takes ownership
 */
TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
{
	num_feat=num_vectors;
	num_vec=num_features;

	// histogram of entries per feature index
	int32_t* hist=new int32_t[num_features];
	memset(hist, 0, sizeof(int32_t)*num_features);


	for (int32_t v=0; v<num_vectors; v++)
	{
		int32_t vlen;
		bool vfree;
		TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

		for (int32_t i=0; i<vlen; i++)
			hist[sv[i].feat_index]++;

		free_sparse_feature_vector(sv, v, vfree);
	}


	// allocate output rows according to the histogram
	TSparse<ST>* sfm=new TSparse<ST>[num_vec];
	for (int32_t v=0; v<num_vec; v++)
	{
		sfm[v].features= new TSparseEntry<ST>[hist[v]];
		sfm[v].num_feat_entries=hist[v];
		sfm[v].vec_index=v;
	}


	// reuse hist as per-row write cursor while scattering entries
	memset(hist,0,sizeof(int32_t)*num_features);
	for (int32_t v=0; v<num_vectors; v++)
	{
		int32_t vlen;
		bool vfree;
		TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

		for (int32_t i=0; i<vlen; i++)
		{
			int32_t vidx=sv[i].feat_index;
			int32_t fidx=v;
			sfm[vidx].features[hist[vidx]].feat_index=fidx;
			sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
			hist[vidx]++;
		}

		free_sparse_feature_vector(sv, v, vfree);
	}

	delete[] hist;
	return sfm;
}
00638
/** set the sparse feature matrix; takes ownership of src (the previous
 * matrix is freed first)
 * @param src array of sparse vectors
 * @param num_feat number of features
 * @param num_vec number of vectors
 */
virtual void set_sparse_feature_matrix(TSparse<ST>* src, int32_t num_feat, int32_t num_vec)
{
	free_sparse_feature_matrix();

	sparse_feature_matrix=src;
	num_features=num_feat;
	num_vectors=num_vec;
}
00656
00664 ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00665 {
00666 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00667 num_feat=num_features;
00668 num_vec=num_vectors;
00669
00670 ST* fm=new ST[num_feat*num_vec];
00671
00672 if (fm)
00673 {
00674 for (int64_t i=0; i<num_feat*num_vec; i++)
00675 fm[i]=0;
00676
00677 for (int32_t v=0; v<num_vec; v++)
00678 {
00679 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00680 {
00681 int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00682 fm[offs]= sparse_feature_matrix[v].features[f].entry;
00683 }
00684 }
00685 }
00686 else
00687 SG_ERROR( "error allocating memory for dense feature matrix\n");
00688
00689 return fm;
00690 }
00691
00699 void get_full_feature_matrix(ST** dst, int32_t* num_feat, int32_t* num_vec)
00700 {
00701 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00702 *num_feat=num_features;
00703 *num_vec=num_vectors;
00704
00705 *dst= (ST*) malloc(sizeof(ST)*num_features*num_vectors);
00706
00707 if (*dst)
00708 {
00709 for (int64_t i=0; i<num_features*num_vectors; i++)
00710 (*dst)[i]=0;
00711
00712 for (int32_t v=0; v<num_vectors; v++)
00713 {
00714 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00715 {
00716 int64_t offs= (sparse_feature_matrix[v].vec_index * num_features) + sparse_feature_matrix[v].features[f].feat_index;
00717 (*dst)[offs]= sparse_feature_matrix[v].features[f].entry;
00718 }
00719 }
00720 }
00721 else
00722 SG_ERROR( "error allocating memory for dense feature matrix\n");
00723 }
00724
/** build the sparse representation from a dense matrix in two passes:
 * pass 1 counts non-zeros per vector, pass 2 allocates and fills.
 * @param src dense matrix (vector i occupies entries i*num_feat..)
 * @param num_feat number of features per vector
 * @param num_vec number of vectors
 * @return true on success
 */
virtual bool set_full_feature_matrix(ST* src, int32_t num_feat, int32_t num_vec)
{
	free_sparse_feature_matrix();
	bool result=true;
	num_features=num_feat;
	num_vectors=num_vec;

	SG_INFO("converting dense feature matrix to sparse one\n");
	int32_t* num_feat_entries=new int[num_vectors];

	if (num_feat_entries)
	{
		int64_t num_total_entries=0;

		// pass 1: count non-zero entries per vector
		for (int32_t i=0; i< num_vec; i++)
		{
			num_feat_entries[i]=0;
			for (int32_t j=0; j< num_feat; j++)
			{
				if (src[i*((int64_t) num_feat) + j] != 0)
					num_feat_entries[i]++;
			}
		}

		if (num_vec>0)
		{
			sparse_feature_matrix=new TSparse<ST>[num_vec];

			if (sparse_feature_matrix)
			{
				// pass 2: allocate and copy the non-zero entries
				for (int32_t i=0; i< num_vec; i++)
				{
					sparse_feature_matrix[i].vec_index=i;
					sparse_feature_matrix[i].num_feat_entries=0;
					sparse_feature_matrix[i].features= NULL;

					if (num_feat_entries[i]>0)
					{
						sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];

						if (!sparse_feature_matrix[i].features)
						{
							// NOTE(review): this early return leaks
							// num_feat_entries and the partially built
							// matrix; with standard operator new[] this
							// branch is unreachable anyway (new throws
							// instead of returning NULL) — TODO confirm
							SG_INFO( "allocation of features failed\n");
							return false;
						}

						sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
						int32_t sparse_feat_idx=0;

						for (int32_t j=0; j< num_feat; j++)
						{
							// NOTE(review): unlike pass 1, this index is
							// computed in int32 arithmetic (i*num_feat)
							// before widening — overflow for very large
							// matrices; TODO confirm
							int64_t pos= i*num_feat + j;

							if (src[pos] != 0)
							{
								sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
								sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
								sparse_feat_idx++;
								num_total_entries++;
							}
						}
					}
				}
			}
			else
			{
				SG_ERROR( "allocation of sparse feature matrix failed\n");
				result=false;
			}

			SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
					num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
		}
		else
		{
			SG_ERROR( "huh ? zero size matrix given ?\n");
			result=false;
		}
	}
	delete[] num_feat_entries;
	return result;
}
00817
00823 virtual bool apply_preproc(bool force_preprocessing=false)
00824 {
00825 SG_INFO( "force: %d\n", force_preprocessing);
00826
00827 if ( sparse_feature_matrix && get_num_preproc() )
00828 {
00829 for (int32_t i=0; i<get_num_preproc(); i++)
00830 {
00831 if ( (!is_preprocessed(i) || force_preprocessing) )
00832 {
00833 set_preprocessed(i);
00834 SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00835 if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00836 return false;
00837 }
00838 return true;
00839 }
00840 return true;
00841 }
00842 else
00843 {
00844 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00845 return false;
00846 }
00847 }
00848
/** @return size (in bytes) of one feature element */
virtual int32_t get_size() { return sizeof(ST); }

/** build the sparse representation from a dense feature object
 * @param sf dense features to convert
 * @return true on success
 */
bool obtain_from_simple(CSimpleFeatures<ST>* sf)
{
	int32_t num_feat=0;
	int32_t num_vec=0;
	ST* fm=sf->get_feature_matrix(num_feat, num_vec);
	ASSERT(fm && num_feat>0 && num_vec>0);

	return set_full_feature_matrix(fm, num_feat, num_vec);
}

/** @return number of feature vectors */
virtual inline int32_t get_num_vectors() { return num_vectors; }

/** @return dimensionality of the feature space */
inline int32_t get_num_features() { return num_features; }

/** grow the feature space dimensionality; only enlarging is allowed
 * (sparse vectors stay valid since missing indices are implicit zeros)
 * @param num new number of features (must be >= current)
 * @return the new number of features
 */
inline int32_t set_num_features(int32_t num)
{
	int32_t n=num_features;
	ASSERT(n<=num);
	num_features=num;
	return num_features;
}

/** @return feature class C_SPARSE */
inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }

/** @return feature type of ST (specialized per type below) */
inline virtual EFeatureType get_feature_type();
00912
/** release a feature vector obtained from get_sparse_feature_vector
 * (same behavior as free_sparse_feature_vector; kept for the
 * CDotFeatures-style naming)
 * @param feat_vec vector to release
 * @param num its index (used to unlock the cache entry)
 * @param free whether the caller owns the array
 */
void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
{
	if (feature_cache)
		feature_cache->unlock_entry(num);

	if (free)
		delete[] feat_vec ;
}

/** @return total number of stored (non-zero) entries over all vectors;
 * requires the in-memory sparse matrix */
int64_t get_num_nonzero_entries()
{
	int64_t num=0;
	for (int32_t i=0; i<num_vectors; i++)
		num+=sparse_feature_matrix[i].num_feat_entries;

	return num;
}
00940
/** fill sq with the squared Euclidean norm of every feature vector
 * @param sq caller-allocated array of length get_num_vectors()
 * @return sq (for convenience)
 */
float64_t* compute_squared(float64_t* sq)
{
	ASSERT(sq);

	int32_t len=0;
	bool do_free=false;

	for (int32_t i=0; i<this->get_num_vectors(); i++)
	{
		sq[i]=0;
		// NOTE(review): the cast to CSparseFeatures<float64_t> assumes
		// this method is only used with ST==float64_t — TODO confirm
		TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free);

		for (int32_t j=0; j<len; j++)
			sq[i] += vec[j].entry * vec[j].entry;

		((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free);
	}

	return sq;
}
00966
/** compute ||x_a - x_b||^2 between a vector of lhs and a vector of rhs
 * using precomputed squared norms: ||a||^2 + ||b||^2 - 2 a.b.
 * Both vectors must be sorted by feat_index.
 * @param lhs features of the first vector
 * @param sq_lhs precomputed squared norms of lhs (see compute_squared)
 * @param idx_a index into lhs
 * @param rhs features of the second vector
 * @param sq_rhs precomputed squared norms of rhs
 * @param idx_b index into rhs
 * @return squared distance (clamped non-negative via abs)
 */
float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
{
	int32_t i,j;
	int32_t alen, blen;
	bool afree, bfree;
	ASSERT(lhs);
	ASSERT(rhs);

	TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
	TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
	ASSERT(avec);
	ASSERT(bvec);

	float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];

	// merge-join over the shorter vector, subtracting 2*a_i*b_i for
	// every shared feature index
	if (alen<=blen)
	{
		j=0;
		for (i=0; i<alen; i++)
		{
			int32_t a_feat_idx=avec[i].feat_index;

			while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
				j++;

			if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
			{
				result-=2*(avec[i].entry*bvec[j].entry);
				j++;
			}
		}
	}
	else
	{
		j=0;
		for (i=0; i<blen; i++)
		{
			int32_t b_feat_idx=bvec[i].feat_index;

			while ((j<alen) && (avec[j].feat_index<b_feat_idx))
				j++;

			if ((j<alen) && (avec[j].feat_index == b_feat_idx))
			{
				result-=2*(bvec[i].entry*avec[j].entry);
				j++;
			}
		}
	}

	((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree);
	((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree);

	// guard against tiny negative values from floating point round-off
	return CMath::abs(result);
}
01034
/** load features from file (specialized per ST below) */
void load(CFile* loader);

/** save features to file (specialized per ST below) */
void save(CFile* writer);
01046
01054 CLabels* load_svmlight_file(char* fname, bool do_sort_features=true)
01055 {
01056 CLabels* lab=NULL;
01057
01058 size_t blocksize=1024*1024;
01059 size_t required_blocksize=blocksize;
01060 uint8_t* dummy=new uint8_t[blocksize];
01061 FILE* f=fopen(fname, "ro");
01062
01063 if (f)
01064 {
01065 free_sparse_feature_matrix();
01066 num_vectors=0;
01067 num_features=0;
01068
01069 SG_INFO("counting line numbers in file %s\n", fname);
01070 size_t sz=blocksize;
01071 size_t block_offs=0;
01072 size_t old_block_offs=0;
01073 fseek(f, 0, SEEK_END);
01074 size_t fsize=ftell(f);
01075 rewind(f);
01076
01077 while (sz == blocksize)
01078 {
01079 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01080 bool contains_cr=false;
01081 for (size_t i=0; i<sz; i++)
01082 {
01083 block_offs++;
01084 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01085 {
01086 num_vectors++;
01087 contains_cr=true;
01088 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
01089 old_block_offs=block_offs;
01090 }
01091 }
01092 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
01093 }
01094
01095 SG_INFO("found %d feature vectors\n", num_vectors);
01096 delete[] dummy;
01097 blocksize=required_blocksize;
01098 dummy = new uint8_t[blocksize+1];
01099
01100 lab=new CLabels(num_vectors);
01101 sparse_feature_matrix=new TSparse<ST>[num_vectors];
01102
01103 rewind(f);
01104 sz=blocksize;
01105 int32_t lines=0;
01106 while (sz == blocksize)
01107 {
01108 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01109
01110 size_t old_sz=0;
01111 for (size_t i=0; i<sz; i++)
01112 {
01113 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
01114 {
01115 size_t len=i-old_sz+1;
01116 uint8_t* data=&dummy[old_sz];
01117
01118 for (int32_t j=0; j<len; j++)
01119 dummy[j]=data[j];
01120
01121 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
01122 i=0;
01123 old_sz=0;
01124 sz+=len;
01125 }
01126
01127 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01128 {
01129
01130 size_t len=i-old_sz;
01131 uint8_t* data=&dummy[old_sz];
01132
01133 int32_t dims=0;
01134 for (int32_t j=0; j<len; j++)
01135 {
01136 if (data[j]==':')
01137 dims++;
01138 }
01139
01140 if (dims<=0)
01141 {
01142 SG_ERROR("Error in line %d - number of"
01143 " dimensions is %d line is %d characters"
01144 " long\n line_content:'%.*s'\n", lines,
01145 dims, len, len, (const char*) data);
01146 }
01147
01148 TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];
01149 int32_t j=0;
01150 for (; j<len; j++)
01151 {
01152 if (data[j]==' ')
01153 {
01154 data[j]='\0';
01155
01156 lab->set_label(lines, atof((const char*) data));
01157 break;
01158 }
01159 }
01160
01161 int32_t d=0;
01162 j++;
01163 uint8_t* start=&data[j];
01164 for (; j<len; j++)
01165 {
01166 if (data[j]==':')
01167 {
01168 data[j]='\0';
01169
01170 feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
01171 num_features=CMath::max(num_features, feat[d].feat_index+1);
01172
01173 j++;
01174 start=&data[j];
01175 for (; j<len; j++)
01176 {
01177 if (data[j]==' ' || data[j]=='\n')
01178 {
01179 data[j]='\0';
01180 feat[d].entry=(ST) atof((const char*) start);
01181 d++;
01182 break;
01183 }
01184 }
01185
01186 if (j==len)
01187 {
01188 data[j]='\0';
01189 feat[dims-1].entry=(ST) atof((const char*) start);
01190 }
01191
01192 j++;
01193 start=&data[j];
01194 }
01195 }
01196
01197 sparse_feature_matrix[lines].vec_index=lines;
01198 sparse_feature_matrix[lines].num_feat_entries=dims;
01199 sparse_feature_matrix[lines].features=feat;
01200
01201 old_sz=i+1;
01202 lines++;
01203 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01204 }
01205 }
01206 }
01207 SG_INFO("file successfully read\n");
01208 fclose(f);
01209 }
01210
01211 delete[] dummy;
01212
01213 if (do_sort_features)
01214 sort_features();
01215
01216 return lab;
01217 }
01218
/** sort the entries of every feature vector by ascending feat_index
 * (required by sparse_dot / compute_squared_norm). Requires the
 * in-memory matrix and no attached preprocessors. */
void sort_features()
{
	ASSERT(get_num_preproc()==0);

	if (!sparse_feature_matrix)
		SG_ERROR("Requires sparse feature matrix to be available in-memory\n");

	for (int32_t i=0; i<num_vectors; i++)
	{
		int32_t len=sparse_feature_matrix[i].num_feat_entries;

		if (!len)
			continue;

		TSparseEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
		// sort the indices while tracking the permutation in orig_idx
		int32_t* feat_idx=new int32_t[len];
		int32_t* orig_idx=new int32_t[len];

		for (int j=0; j<len; j++)
		{
			feat_idx[j]=sf_orig[j].feat_index;
			orig_idx[j]=j;
		}

		CMath::qsort_index(feat_idx, orig_idx, len);

		// rebuild the entry array in sorted order
		TSparseEntry<ST>* sf_new= new TSparseEntry<ST>[len];
		for (int j=0; j<len; j++)
			sf_new[j]=sf_orig[orig_idx[j]];

		sparse_feature_matrix[i].features=sf_new;


		// sanity check: indices must now be strictly increasing
		for (int j=0; j<len-1; j++)
			ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);

		delete[] orig_idx;
		delete[] feat_idx;
		delete[] sf_orig;
	}
}
01262
/** write features and labels to a file in SVMlight format
 * ("label idx:val idx:val ...\n", indices written 1-based)
 * @param fname output file name
 * @param label labels, one per feature vector
 * @return true on success, false if the file could not be opened
 */
bool write_svmlight_file(char* fname, CLabels* label)
{
	ASSERT(label);
	int32_t num=label->get_num_labels();
	ASSERT(num>0);
	ASSERT(num==num_vectors);

	FILE* f=fopen(fname, "wb");

	if (f)
	{
		for (int32_t i=0; i<num; i++)
		{
			fprintf(f, "%d ", (int32_t) label->get_int_label(i));

			TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
			int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;

			for (int32_t j=0; j<num_feat; j++)
			{
				// last entry ends the line instead of a space
				if (j<num_feat-1)
					fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
				else
					fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
			}
		}

		fclose(f);
		return true;
	}
	return false;
}
01301
/** @return dimensionality of the (dense) feature space */
virtual int32_t get_dim_feature_space()
{
	return num_features;
}

/** dot product between a vector of this object and a vector of
 * another sparse feature object of the same type
 * @param vec_idx1 index into this object
 * @param df the other feature object (must be sparse, same type)
 * @param vec_idx2 index into df
 * @return the dot product
 */
virtual float64_t dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
{
	ASSERT(df);
	ASSERT(df->get_feature_type() == get_feature_type());
	ASSERT(df->get_feature_class() == get_feature_class());
	CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df;

	bool afree, bfree;
	int32_t alen, blen;
	TSparseEntry<ST>* avec=get_sparse_feature_vector(vec_idx1, alen, afree);
	TSparseEntry<ST>* bvec=sf->get_sparse_feature_vector(vec_idx2, blen, bfree);

	float64_t result=sparse_dot(1, avec, alen, bvec, blen);

	free_sparse_feature_vector(avec, vec_idx1, afree);
	sf->free_sparse_feature_vector(bvec, vec_idx2, bfree);

	return result;
}
01340
01347 virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01348 {
01349 ASSERT(vec2);
01350 if (vec2_len!=num_features)
01351 {
01352 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
01353 vec2_len, num_features);
01354 }
01355 float64_t result=0;
01356
01357 bool vfree;
01358 int32_t vlen;
01359 TSparseEntry<ST>* sv=get_sparse_feature_vector(vec_idx1, vlen, vfree);
01360
01361 if (sv)
01362 {
01363 for (int32_t i=0; i<vlen; i++)
01364 result+=vec2[sv[i].feat_index]*sv[i].entry;
01365 }
01366
01367 free_sparse_feature_vector(sv, vec_idx1, vfree);
01368
01369 return result;
01370 }
01371
/** iterator state for get_feature_iterator/get_next_feature */
struct sparse_feature_iterator
{
	// entries of the vector being iterated
	TSparseEntry<ST>* sv;
	// index of that vector
	int32_t vidx;
	// number of entries in sv
	int32_t num_feat_entries;
	// whether sv must be freed when iteration ends
	bool vfree;

	// position of the next entry to return
	int32_t index;

	/** print the iterator state (debugging aid) */
	void print_info()
	{
		SG_SPRINT("sv=%p, vidx=%d, num_feat_entries=%d, index=%d\n",
				sv, vidx, num_feat_entries, index);
	}
};
01394
/** create an iterator over the non-zero entries of one feature vector;
 * release it with free_feature_iterator
 * @param vector_index index of the vector to iterate
 * @return opaque iterator handle
 */
virtual void* get_feature_iterator(int32_t vector_index)
{
	if (vector_index>=num_vectors)
	{
		SG_ERROR("Index out of bounds (number of vectors %d, you "
				"requested %d)\n", num_vectors, vector_index);
	}

	if (!sparse_feature_matrix)
		SG_ERROR("Requires a in-memory feature matrix\n");

	// allocated with new[] to match delete[] in free_feature_iterator
	sparse_feature_iterator* it=new sparse_feature_iterator[1];
	it->sv=get_sparse_feature_vector(vector_index, it->num_feat_entries, it->vfree);
	it->vidx=vector_index;
	it->index=0;

	return it;
}

/** fetch the next non-zero entry from an iterator
 * @param index returns the feature index of the entry
 * @param value returns its value
 * @param iterator handle from get_feature_iterator
 * @return false when the iterator is exhausted (or NULL)
 */
virtual bool get_next_feature(int32_t& index, float64_t& value, void* iterator)
{
	sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
	if (!it || it->index >= it->num_feat_entries)
		return false;

	int32_t i=it->index++;

	index = it->sv[i].feat_index;
	value = (float64_t) it->sv[i].entry;

	return true;
}

/** release an iterator created by get_feature_iterator
 * @param iterator handle to release (NULL is a no-op)
 */
virtual void free_feature_iterator(void* iterator)
{
	if (!iterator)
		return;

	sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
	free_sparse_feature_vector(it->sv, it->vidx, it->vfree);
	delete[] it;
}
01461
/** @return object name */
inline virtual const char* get_name() const { return "SparseFeatures"; }

protected:
/** compute one feature vector on the fly (for vectors not held
 * in-memory); base implementation returns an empty vector — derived
 * classes override this
 * @param num index of the vector to compute
 * @param len returns number of entries
 * @param target optional preallocated storage (e.g. a cache slot)
 * @return the computed entries, or NULL
 */
virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL)
{
	len=0;
	return NULL;
}
01481
protected:

/// number of feature vectors
int32_t num_vectors;

/// dimensionality of the feature space
int32_t num_features;

/// array of sparse vectors (may be NULL when computed on the fly)
TSparse<ST>* sparse_feature_matrix;

/// cache for on-the-fly computed feature vectors (may be NULL)
CCache< TSparseEntry<ST> >* feature_cache;
01495 };
01496
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/* specialize get_feature_type() for every supported element type */
#define GET_FEATURE_TYPE(sg_type, f_type) \
template<> inline EFeatureType CSparseFeatures<sg_type>::get_feature_type() \
{ \
	return f_type; \
}
GET_FEATURE_TYPE(bool, F_BOOL)
GET_FEATURE_TYPE(char, F_CHAR)
GET_FEATURE_TYPE(uint8_t, F_BYTE)
GET_FEATURE_TYPE(int8_t, F_BYTE)
GET_FEATURE_TYPE(int16_t, F_SHORT)
GET_FEATURE_TYPE(uint16_t, F_WORD)
GET_FEATURE_TYPE(int32_t, F_INT)
GET_FEATURE_TYPE(uint32_t, F_UINT)
GET_FEATURE_TYPE(int64_t, F_LONG)
GET_FEATURE_TYPE(uint64_t, F_ULONG)
GET_FEATURE_TYPE(float32_t, F_SHORTREAL)
GET_FEATURE_TYPE(float64_t, F_DREAL)
GET_FEATURE_TYPE(floatmax_t, F_LONGREAL)
#undef GET_FEATURE_TYPE
01517
/* specialize load() per element type: reads a sparse matrix from the
 * CFile using the matching typed reader and installs it */
#define LOAD(fname, sg_type) \
template<> inline void CSparseFeatures<sg_type>::load(CFile* loader) \
{ \
	SG_SET_LOCALE_C; \
	ASSERT(loader); \
	TSparse<sg_type>* matrix=NULL; \
	int32_t num_feat=0; \
	int32_t num_vec=0; \
	loader->fname(matrix, num_feat, num_vec); \
	set_sparse_feature_matrix(matrix, num_feat, num_vec); \
	SG_RESET_LOCALE; \
}
LOAD(get_bool_sparsematrix, bool)
LOAD(get_char_sparsematrix, char)
LOAD(get_byte_sparsematrix, uint8_t)
LOAD(get_int8_sparsematrix, int8_t)
LOAD(get_short_sparsematrix, int16_t)
LOAD(get_word_sparsematrix, uint16_t)
LOAD(get_int_sparsematrix, int32_t)
LOAD(get_uint_sparsematrix, uint32_t)
LOAD(get_long_sparsematrix, int64_t)
LOAD(get_ulong_sparsematrix, uint64_t)
LOAD(get_shortreal_sparsematrix, float32_t)
LOAD(get_real_sparsematrix, float64_t)
LOAD(get_longreal_sparsematrix, floatmax_t)
#undef LOAD
01544
/* specialize save() per element type: writes the in-memory sparse
 * matrix through the matching typed writer on the CFile */
#define WRITE(fname, sg_type) \
template<> inline void CSparseFeatures<sg_type>::save(CFile* writer) \
{ \
	SG_SET_LOCALE_C; \
	ASSERT(writer); \
	writer->fname(sparse_feature_matrix, num_features, num_vectors); \
	SG_RESET_LOCALE; \
}
WRITE(set_bool_sparsematrix, bool)
WRITE(set_char_sparsematrix, char)
WRITE(set_byte_sparsematrix, uint8_t)
WRITE(set_int8_sparsematrix, int8_t)
WRITE(set_short_sparsematrix, int16_t)
WRITE(set_word_sparsematrix, uint16_t)
WRITE(set_int_sparsematrix, int32_t)
WRITE(set_uint_sparsematrix, uint32_t)
WRITE(set_long_sparsematrix, int64_t)
WRITE(set_ulong_sparsematrix, uint64_t)
WRITE(set_shortreal_sparsematrix, float32_t)
WRITE(set_real_sparsematrix, float64_t)
WRITE(set_longreal_sparsematrix, floatmax_t)
#undef WRITE
01567 #endif // DOXYGEN_SHOULD_SKIP_THIS
01568 }
01569 #endif