SHOGUN: SparseFeatures.cpp Source File

Go to the documentation of this file.
00001 #include <shogun/features/SparseFeatures.h>
00002 #include <shogun/preprocessor/SparsePreprocessor.h>
00003 #include <shogun/mathematics/Math.h>
00004 #include <shogun/lib/DataType.h>
00005 #include <shogun/io/SGIO.h>
00006 
00007 #include <string.h>
00008 #include <stdlib.h>
00009 
00010 namespace shogun
00011 {
00012 
00013 template<class ST> CSparseFeatures<ST>::CSparseFeatures(int32_t size)
00014 : CDotFeatures(size), num_vectors(0), num_features(0),
00015     sparse_feature_matrix(NULL), feature_cache(NULL)
00016 {
00017     init();
00018 }
00019 
00020 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseVector<ST>* src,
00021         int32_t num_feat, int32_t num_vec, bool copy)
00022 : CDotFeatures(0), num_vectors(0), num_features(0),
00023     sparse_feature_matrix(NULL), feature_cache(NULL)
00024 {
00025     init();
00026 
00027     if (!copy)
00028         set_sparse_feature_matrix(SGSparseMatrix<ST>(src, num_feat, num_vec));
00029     else
00030     {
00031         sparse_feature_matrix = SG_MALLOC(SGSparseVector<ST>, num_vec);
00032         memcpy(sparse_feature_matrix, src, sizeof(SGSparseVector<ST>)*num_vec);
00033         for (int32_t i=0; i< num_vec; i++)
00034         {
00035             sparse_feature_matrix[i].features = SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries);
00036             memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00037 
00038         }
00039     }
00040 }
00041 
00042 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseMatrix<ST> sparse)
00043 : CDotFeatures(0), num_vectors(0), num_features(0),
00044     sparse_feature_matrix(NULL), feature_cache(NULL)
00045 {
00046     init();
00047 
00048     set_sparse_feature_matrix(sparse);
00049 }
00050 
00051 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGMatrix<ST> dense)
00052 : CDotFeatures(0), num_vectors(0), num_features(0),
00053     sparse_feature_matrix(NULL), feature_cache(NULL)
00054 {
00055     init();
00056 
00057     set_full_feature_matrix(dense);
00058 }
00059 
00060 template<class ST> CSparseFeatures<ST>::CSparseFeatures(const CSparseFeatures & orig)
00061 : CDotFeatures(orig), num_vectors(orig.num_vectors),
00062     num_features(orig.num_features),
00063     sparse_feature_matrix(orig.sparse_feature_matrix),
00064     feature_cache(orig.feature_cache)
00065 {
00066     init();
00067 
00068     if (orig.sparse_feature_matrix)
00069     {
00070         free_sparse_feature_matrix();
00071         sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
00072         memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(SGSparseVector<ST>)*num_vectors);
00073         for (int32_t i=0; i< num_vectors; i++)
00074         {
00075             sparse_feature_matrix[i].features=SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries);
00076             memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00077 
00078         }
00079     }
00080 
00081     m_subset=orig.m_subset->duplicate();
00082 }
00083 template<class ST> CSparseFeatures<ST>::CSparseFeatures(CFile* loader)
00084 : CDotFeatures(loader), num_vectors(0), num_features(0),
00085     sparse_feature_matrix(NULL), feature_cache(NULL)
00086 {
00087     init();
00088 
00089     load(loader);
00090 }
00091 
00092 template<class ST> CSparseFeatures<ST>::~CSparseFeatures()
00093 {
00094     free_sparse_features();
00095 }
00096 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_matrix()
00097 {
00098     clean_tsparse(sparse_feature_matrix, num_vectors);
00099     sparse_feature_matrix = NULL;
00100     num_vectors=0;
00101     num_features=0;
00102     remove_subset();
00103 }
00104 template<class ST> void CSparseFeatures<ST>::free_sparse_features()
00105 {
00106     free_sparse_feature_matrix();
00107     delete feature_cache;
00108     feature_cache = NULL;
00109 }
00110 template<class ST> CFeatures* CSparseFeatures<ST>::duplicate() const
00111 {
00112     return new CSparseFeatures<ST>(*this);
00113 }
00114 
00115 template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index)
00116 {
00117     ASSERT(index>=0 && index<num_features) ;
00118     ASSERT(num>=0 && num<get_num_vectors()) ;
00119 
00120     int32_t i;
00121     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00122     ST ret = 0 ;
00123     
00124     if (sv.features)
00125     {
00126         for (i=0; i<sv.num_feat_entries; i++)
00127             if (sv.features[i].feat_index==index)
00128                 ret+=sv.features[i].entry ;
00129     }
00130     
00131     free_sparse_feature_vector(sv, num);
00132     
00133     return ret ;
00134 }
00135 
00136 template<class ST> ST* CSparseFeatures<ST>::get_full_feature_vector(int32_t num, int32_t& len)
00137 {
00138     int32_t i;
00139     len=0;
00140     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00141     ST* fv=NULL;
00142 
00143     if (sv.features)
00144     {
00145         len=num_features;
00146         fv=SG_MALLOC(ST, num_features);
00147 
00148         for (i=0; i<num_features; i++)
00149             fv[i]=0;
00150 
00151         for (i=0; i<sv.num_feat_entries; i++)
00152             fv[sv.features[i].feat_index]= sv.features[i].entry;
00153     }
00154 
00155     free_sparse_feature_vector(sv, num);
00156 
00157     return fv;
00158 }
00159 
00160 template<class ST> SGVector<ST> CSparseFeatures<ST>::get_full_feature_vector(int32_t num)
00161 {
00162     if (num>=num_vectors)
00163     {
00164         SG_ERROR("Index out of bounds (number of vectors %d, you "
00165                 "requested %d)\n", num_vectors, num);
00166     }
00167 
00168     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00169 
00170     SGVector<ST> dense;
00171 
00172     if (sv.features)
00173     {
00174         dense.do_free=true;
00175         dense.vlen=num_features;
00176         dense.vector=SG_MALLOC(ST, num_features);
00177         memset(dense.vector, 0, sizeof(ST)*num_features);
00178 
00179         for (int32_t i=0; i<sv.num_feat_entries; i++)
00180         {
00181             dense.vector[sv.features[i].feat_index]= sv.features[i].entry;
00182         }
00183     }
00184 
00185     free_sparse_feature_vector(sv, num);
00186 
00187     return dense;
00188 }
00189 
00190 template<class ST> int32_t CSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num)
00191 {
00192     SGSparseVector<ST> sv = get_sparse_feature_vector(num);
00193     int32_t len=sv.num_feat_entries;
00194     free_sparse_feature_vector(sv, num);
00195     return len;
00196 }
00197 
00198 template<class ST> SGSparseVector<ST> CSparseFeatures<ST>::get_sparse_feature_vector(int32_t num)
00199 {
00200     ASSERT(num<get_num_vectors());
00201 
00202     index_t real_num=subset_idx_conversion(num);
00203 
00204     SGSparseVector<ST> result;
00205 
00206     if (sparse_feature_matrix)
00207     {
00208         result=sparse_feature_matrix[real_num];
00209         result.do_free=false;
00210         return result;
00211     } 
00212     else
00213     {
00214         result.do_free=false;
00215 
00216         if (feature_cache)
00217         {
00218             result.features=feature_cache->lock_entry(num);
00219 
00220             if (result.features)
00221                 return result;
00222             else
00223             {
00224                 result.features=feature_cache->set_entry(num);
00225             }
00226         }
00227 
00228         if (!result.features)
00229             result.do_free=true;
00230 
00231         result.features=compute_sparse_feature_vector(num,
00232             result.num_feat_entries, result.features);
00233 
00234 
00235         if (get_num_preprocessors())
00236         {
00237             int32_t tmp_len=result.num_feat_entries;
00238             SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
00239             SGSparseVectorEntry<ST>* tmp_feat_after = NULL;
00240 
00241             for (int32_t i=0; i<get_num_preprocessors(); i++)
00242             {
00243                 //tmp_feat_after=((CSparsePreprocessor<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);
00244 
00245                 if (i!=0)   // delete feature vector, except for the the first one, i.e., feat
00246                     SG_FREE(tmp_feat_before);
00247                 tmp_feat_before=tmp_feat_after;
00248             }
00249 
00250             memcpy(result.features, tmp_feat_after,
00251                     sizeof(SGSparseVectorEntry<ST>)*tmp_len);
00252 
00253             SG_FREE(tmp_feat_after);
00254             result.num_feat_entries=tmp_len ;
00255             SG_DEBUG( "len: %d len2: %d\n", result.num_feat_entries, num_features);
00256         }
00257         result.vec_index=num;
00258         return result ;
00259     }
00260 }
00261 
00262 template<class ST> ST CSparseFeatures<ST>::sparse_dot(ST alpha, SGSparseVectorEntry<ST>* avec, int32_t alen, SGSparseVectorEntry<ST>* bvec, int32_t blen)
00263 {
00264     ST result=0;
00265 
00266     //result remains zero when one of the vectors is non existent
00267     if (avec && bvec)
00268     {
00269         if (alen<=blen)
00270         {
00271             int32_t j=0;
00272             for (int32_t i=0; i<alen; i++)
00273             {
00274                 int32_t a_feat_idx=avec[i].feat_index;
00275 
00276                 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00277                     j++;
00278 
00279                 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00280                 {
00281                     result+= avec[i].entry * bvec[j].entry;
00282                     j++;
00283                 }
00284             }
00285         }
00286         else
00287         {
00288             int32_t j=0;
00289             for (int32_t i=0; i<blen; i++)
00290             {
00291                 int32_t b_feat_idx=bvec[i].feat_index;
00292 
00293                 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00294                     j++;
00295 
00296                 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00297                 {
00298                     result+= bvec[i].entry * avec[j].entry;
00299                     j++;
00300                 }
00301             }
00302         }
00303 
00304         result*=alpha;
00305     }
00306 
00307     return result;
00308 }
00309 
00310 template<class ST> ST CSparseFeatures<ST>::dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00311 {
00312     ASSERT(vec);
00313     ASSERT(dim==num_features);
00314     ST result=b;
00315 
00316     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00317 
00318     if (sv.features)
00319     {
00320         for (int32_t i=0; i<sv.num_feat_entries; i++)
00321         {
00322             result+=alpha*vec[sv.features[i].feat_index]
00323                 *sv.features[i].entry;
00324         }
00325     }
00326 
00327     free_sparse_feature_vector(sv, num);
00328     return result;
00329 }
00330 
00331 template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val)
00332 {
00333     ASSERT(vec);
00334     if (dim!=num_features)
00335     {
00336         SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00337                 dim, num_features);
00338     }
00339 
00340     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00341 
00342     if (sv.features)
00343     {
00344         if (abs_val)
00345         {
00346             for (int32_t i=0; i<sv.num_feat_entries; i++)
00347             {
00348                 vec[sv.features[i].feat_index]+=alpha
00349                     *CMath::abs(sv.features[i].entry);
00350             }
00351         }
00352         else
00353         {
00354             for (int32_t i=0; i<sv.num_feat_entries; i++)
00355             {
00356                 vec[sv.features[i].feat_index]+=alpha
00357                         *sv.features[i].entry;
00358             }
00359         }
00360     }
00361 
00362     free_sparse_feature_vector(sv, num);
00363 }
00364 
00365 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_vector(SGSparseVector<ST> vec, int32_t num)
00366 {
00367     if (feature_cache)
00368         feature_cache->unlock_entry(subset_idx_conversion(num));
00369 
00370     vec.free_vector();
00371 } 
00372 
00373 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00374 {
00375     if (m_subset)
00376         SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
00377 
00378     num_feat=num_features;
00379     num_vec=num_vectors;
00380 
00381     return sparse_feature_matrix;
00382 }
00383 
00384 template<class ST> SGSparseMatrix<ST> CSparseFeatures<ST>::get_sparse_feature_matrix()
00385 {
00386     if (m_subset)
00387         SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
00388 
00389     SGSparseMatrix<ST> sm;
00390     sm.sparse_matrix=get_sparse_feature_matrix(sm.num_features, sm.num_vectors);
00391     return sm;
00392 }
00393 
00394 template<class ST> void CSparseFeatures<ST>::clean_tsparse(SGSparseVector<ST>* sfm, int32_t num_vec)
00395 {
00396     if (sfm)
00397     {
00398         for (int32_t i=0; i<num_vec; i++)
00399             SG_FREE(sfm[i].features);
00400 
00401         SG_FREE(sfm);
00402     }
00403 }
00404 
00405 template<class ST> CSparseFeatures<ST>* CSparseFeatures<ST>::get_transposed()
00406 {
00407     int32_t num_feat;
00408     int32_t num_vec;
00409     SGSparseVector<ST>* s=get_transposed(num_feat, num_vec);
00410     return new CSparseFeatures<ST>(s, num_feat, num_vec);
00411 }
00412 
00413 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
00414 {
00415     num_feat=get_num_vectors();
00416     num_vec=num_features;
00417 
00418     int32_t* hist=SG_MALLOC(int32_t, num_features);
00419     memset(hist, 0, sizeof(int32_t)*num_features);
00420 
00421     // count how lengths of future feature vectors
00422     for (int32_t v=0; v<num_feat; v++)
00423     {
00424         SGSparseVector<ST> sv=get_sparse_feature_vector(v);
00425 
00426         for (int32_t i=0; i<sv.num_feat_entries; i++)
00427             hist[sv.features[i].feat_index]++;
00428 
00429         free_sparse_feature_vector(sv, v);
00430     }
00431 
00432     // allocate room for future feature vectors
00433     SGSparseVector<ST>* sfm=SG_MALLOC(SGSparseVector<ST>, num_vec);
00434     for (int32_t v=0; v<num_vec; v++)
00435     {
00436         sfm[v].features= SG_MALLOC(SGSparseVectorEntry<ST>, hist[v]);
00437         sfm[v].num_feat_entries=hist[v];
00438         sfm[v].vec_index=v;
00439     }
00440 
00441     // fill future feature vectors with content
00442     memset(hist,0,sizeof(int32_t)*num_features);
00443     for (int32_t v=0; v<num_feat; v++)
00444     {
00445         SGSparseVector<ST> sv=get_sparse_feature_vector(v);
00446 
00447         for (int32_t i=0; i<sv.num_feat_entries; i++)
00448         {
00449             int32_t vidx=sv.features[i].feat_index;
00450             int32_t fidx=v;
00451             sfm[vidx].features[hist[vidx]].feat_index=fidx;
00452             sfm[vidx].features[hist[vidx]].entry=sv.features[i].entry;
00453             hist[vidx]++;
00454         }
00455 
00456         free_sparse_feature_vector(sv, v);
00457     }
00458 
00459     SG_FREE(hist);
00460     return sfm;
00461 }
00462 
00463 template<class ST> void CSparseFeatures<ST>::set_sparse_feature_matrix(SGSparseMatrix<ST> sm)
00464 {
00465     if (m_subset)
00466         SG_ERROR("set_sparse_feature_matrix() not allowed with subset\n");
00467 
00468 
00469     free_sparse_feature_matrix();
00470     sm.own_matrix();
00471 
00472     sparse_feature_matrix=sm.sparse_matrix;
00473     num_features=sm.num_features;
00474     num_vectors=sm.num_vectors;
00475 }
00476 
00477 template<class ST> SGMatrix<ST> CSparseFeatures<ST>::get_full_feature_matrix()
00478 {
00479     SGMatrix<ST> full;
00480 
00481     SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00482     full.num_rows=num_features;
00483     full.num_cols=get_num_vectors();
00484     full.do_free=true;
00485     full.matrix=SG_MALLOC(ST, int64_t(num_features)*get_num_vectors());
00486 
00487     memset(full.matrix, 0, size_t(num_features)*size_t(get_num_vectors())*sizeof(ST));
00488 
00489     for (int32_t v=0; v<full.num_cols; v++)
00490     {
00491         SGSparseVector<ST> current=
00492             sparse_feature_matrix[subset_idx_conversion(v)];
00493 
00494         for (int32_t f=0; f<current.num_feat_entries; f++)
00495         {
00496             int64_t offs=(current.vec_index*num_features)
00497                     +current.features[f].feat_index;
00498 
00499             full.matrix[offs]=current.features[f].entry;
00500         }
00501     }
00502 
00503     return full;
00504 }
00505 
00506 template<class ST> bool CSparseFeatures<ST>::set_full_feature_matrix(SGMatrix<ST> full)
00507 {
00508     remove_subset();
00509 
00510     ST* src=full.matrix;
00511     int32_t num_feat=full.num_rows;
00512     int32_t num_vec=full.num_cols;
00513 
00514     free_sparse_feature_matrix();
00515     bool result=true;
00516     num_features=num_feat;
00517     num_vectors=num_vec;
00518 
00519     SG_INFO("converting dense feature matrix to sparse one\n");
00520     int32_t* num_feat_entries=SG_MALLOC(int, num_vectors);
00521 
00522     if (num_feat_entries)
00523     {
00524         int64_t num_total_entries=0;
00525 
00526         // count nr of non sparse features
00527         for (int32_t i=0; i< num_vec; i++)
00528         {
00529             num_feat_entries[i]=0;
00530             for (int32_t j=0; j< num_feat; j++)
00531             {
00532                 if (src[i*((int64_t) num_feat) + j] != 0)
00533                     num_feat_entries[i]++;
00534             }
00535         }
00536 
00537         if (num_vec>0)
00538         {
00539             sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vec);
00540 
00541             if (sparse_feature_matrix)
00542             {
00543                 for (int32_t i=0; i< num_vec; i++)
00544                 {
00545                     sparse_feature_matrix[i].vec_index=i;
00546                     sparse_feature_matrix[i].num_feat_entries=0;
00547                     sparse_feature_matrix[i].features= NULL;
00548 
00549                     if (num_feat_entries[i]>0)
00550                     {
00551                         sparse_feature_matrix[i].features= SG_MALLOC(SGSparseVectorEntry<ST>, num_feat_entries[i]);
00552 
00553                         if (!sparse_feature_matrix[i].features)
00554                         {
00555                             SG_INFO( "allocation of features failed\n");
00556                             return false;
00557                         }
00558 
00559                         sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00560                         int32_t sparse_feat_idx=0;
00561 
00562                         for (int32_t j=0; j< num_feat; j++)
00563                         {
00564                             int64_t pos= i*num_feat + j;
00565 
00566                             if (src[pos] != 0)
00567                             {
00568                                 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00569                                 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00570                                 sparse_feat_idx++;
00571                                 num_total_entries++;
00572                             }
00573                         }
00574                     }
00575                 }
00576             }
00577             else
00578             {
00579                 SG_ERROR( "allocation of sparse feature matrix failed\n");
00580                 result=false;
00581             }
00582 
00583             SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00584                     num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
00585         }
00586         else
00587         {
00588             SG_ERROR( "huh ? zero size matrix given ?\n");
00589             result=false;
00590         }
00591     }
00592     SG_FREE(num_feat_entries);
00593     return result;
00594 }
00595 
00596 template<class ST> bool CSparseFeatures<ST>::apply_preprocessor(bool force_preprocessing)
00597 {
00598     SG_INFO( "force: %d\n", force_preprocessing);
00599 
00600     if ( sparse_feature_matrix && get_num_preprocessors() )
00601     {
00602         for (int32_t i=0; i<get_num_preprocessors(); i++)
00603         {
00604             if ( (!is_preprocessed(i) || force_preprocessing) )
00605             {
00606                 set_preprocessed(i);
00607                 SG_INFO( "preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
00608                 if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL)
00609                     return false;
00610             }
00611             return true;
00612         }
00613         return true;
00614     }
00615     else
00616     {
00617         SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00618         return false;
00619     }
00620 }
00621 
00622 template<class ST> int32_t CSparseFeatures<ST>::get_size()
00623 {
00624     return sizeof(ST);
00625 }
00626 
00627 template<class ST> bool CSparseFeatures<ST>::obtain_from_simple(CSimpleFeatures<ST>* sf)
00628 {
00629     SGMatrix<ST> fm=sf->get_feature_matrix();
00630     ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0);
00631 
00632     return set_full_feature_matrix(fm);
00633 }
00634 
00635 template<class ST> int32_t  CSparseFeatures<ST>::get_num_vectors() const
00636 {
00637     return m_subset ? m_subset->get_size() : num_vectors;
00638 }
00639 
00640 template<class ST> int32_t  CSparseFeatures<ST>::get_num_features()
00641 {
00642     return num_features;
00643 }
00644 
00645 template<class ST> int32_t CSparseFeatures<ST>::set_num_features(int32_t num)
00646 {
00647     int32_t n=num_features;
00648     ASSERT(n<=num);
00649     num_features=num;
00650     return num_features;
00651 }
00652 
00653 template<class ST> EFeatureClass CSparseFeatures<ST>::get_feature_class()
00654 {
00655     return C_SPARSE;
00656 }
00657 
00658 template<class ST> void CSparseFeatures<ST>::free_feature_vector(SGSparseVector<ST> vec, int32_t num)
00659 {
00660     if (feature_cache)
00661         feature_cache->unlock_entry(subset_idx_conversion(num));
00662 
00663     vec.free_vector();
00664 }
00665 
00666 template<class ST> int64_t CSparseFeatures<ST>::get_num_nonzero_entries()
00667 {
00668     int64_t num=0;
00669     index_t num_vec=get_num_vectors();
00670     for (int32_t i=0; i<num_vec; i++)
00671         num+=sparse_feature_matrix[subset_idx_conversion(i)].num_feat_entries;
00672 
00673     return num;
00674 }
00675 
00676 template<class ST> float64_t* CSparseFeatures<ST>::compute_squared(float64_t* sq)
00677 {
00678     ASSERT(sq);
00679 
00680     index_t num_vec=get_num_vectors();
00681     for (int32_t i=0; i<num_vec; i++)
00682     {
00683         sq[i]=0;
00684         SGSparseVector<ST> vec=get_sparse_feature_vector(i);
00685 
00686         for (int32_t j=0; j<vec.num_feat_entries; j++)
00687             sq[i]+=vec.features[j].entry*vec.features[j].entry;
00688 
00689         free_feature_vector(vec, i);
00690     }
00691 
00692     return sq;
00693 }
00694 
00695 template<class ST> float64_t CSparseFeatures<ST>::compute_squared_norm(
00696         CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a,
00697         CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
00698 {
00699     int32_t i,j;
00700     ASSERT(lhs);
00701     ASSERT(rhs);
00702 
00703     SGSparseVector<float64_t> avec=lhs->get_sparse_feature_vector(idx_a);
00704     SGSparseVector<float64_t> bvec=rhs->get_sparse_feature_vector(idx_b);
00705     ASSERT(avec.features);
00706     ASSERT(bvec.features);
00707 
00708     float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
00709 
00710     if (avec.num_feat_entries<=bvec.num_feat_entries)
00711     {
00712         j=0;
00713         for (i=0; i<avec.num_feat_entries; i++)
00714         {
00715             int32_t a_feat_idx=avec.features[i].feat_index;
00716 
00717             while ((j<bvec.num_feat_entries)
00718                     &&(bvec.features[j].feat_index<a_feat_idx))
00719                 j++;
00720 
00721             if ((j<bvec.num_feat_entries)
00722                     &&(bvec.features[j].feat_index==a_feat_idx))
00723             {
00724                 result-=2*(avec.features[i].entry*bvec.features[j].entry);
00725                 j++;
00726             }
00727         }
00728     }
00729     else
00730     {
00731         j=0;
00732         for (i=0; i<bvec.num_feat_entries; i++)
00733         {
00734             int32_t b_feat_idx=bvec.features[i].feat_index;
00735 
00736             while ((j<avec.num_feat_entries)
00737                     &&(avec.features[j].feat_index<b_feat_idx))
00738                 j++;
00739 
00740             if ((j<avec.num_feat_entries)
00741                     &&(avec.features[j].feat_index==b_feat_idx))
00742             {
00743                 result-=2*(bvec.features[i].entry*avec.features[j].entry);
00744                 j++;
00745             }
00746         }
00747     }
00748 
00749     ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a);
00750     ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b);
00751 
00752     return CMath::abs(result);
00753 }
00754 
00755 template<class ST> CLabels* CSparseFeatures<ST>::load_svmlight_file(char* fname,
00756         bool do_sort_features)
00757 {
00758     remove_subset();
00759 
00760     CLabels* lab=NULL;
00761 
00762     size_t blocksize=1024*1024;
00763     size_t required_blocksize=blocksize;
00764     uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00765     FILE* f=fopen(fname, "ro");
00766 
00767     if (f)
00768     {
00769         free_sparse_feature_matrix();
00770         num_vectors=0;
00771         num_features=0;
00772 
00773         SG_INFO("counting line numbers in file %s\n", fname);
00774         size_t sz=blocksize;
00775         size_t block_offs=0;
00776         size_t old_block_offs=0;
00777         fseek(f, 0, SEEK_END);
00778         size_t fsize=ftell(f);
00779         rewind(f);
00780 
00781         while (sz == blocksize)
00782         {
00783             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00784             for (size_t i=0; i<sz; i++)
00785             {
00786                 block_offs++;
00787                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00788                 {
00789                     num_vectors++;
00790                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00791                     old_block_offs=block_offs;
00792                 }
00793             }
00794             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00795         }
00796 
00797         SG_INFO("found %d feature vectors\n", num_vectors);
00798         SG_FREE(dummy);
00799         blocksize=required_blocksize;
00800         dummy = SG_MALLOC(uint8_t, blocksize+1); //allow setting of '\0' at EOL
00801 
00802         lab=new CLabels(num_vectors);
00803         sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
00804 
00805         rewind(f);
00806         sz=blocksize;
00807         int32_t lines=0;
00808         while (sz == blocksize)
00809         {
00810             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00811 
00812             size_t old_sz=0;
00813             for (size_t i=0; i<sz; i++)
00814             {
00815                 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00816                 {
00817                     size_t len=i-old_sz+1;
00818                     uint8_t* data=&dummy[old_sz];
00819 
00820                     for (size_t j=0; j<len; j++)
00821                         dummy[j]=data[j];
00822 
00823                     sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
00824                     i=0;
00825                     old_sz=0;
00826                     sz+=len;
00827                 }
00828 
00829                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00830                 {
00831 
00832                     size_t len=i-old_sz;
00833                     uint8_t* data=&dummy[old_sz];
00834 
00835                     int32_t dims=0;
00836                     for (size_t j=0; j<len; j++)
00837                     {
00838                         if (data[j]==':')
00839                             dims++;
00840                     }
00841 
00842                     if (dims<=0)
00843                     {
00844                         SG_ERROR("Error in line %d - number of"
00845                                 " dimensions is %d line is %d characters"
00846                                 " long\n line_content:'%.*s'\n", lines,
00847                                 dims, len, len, (const char*) data);
00848                     }
00849 
00850                     SGSparseVectorEntry<ST>* feat=SG_MALLOC(SGSparseVectorEntry<ST>, dims);
00851                     size_t j=0;
00852                     for (; j<len; j++)
00853                     {
00854                         if (data[j]==' ')
00855                         {
00856                             data[j]='\0';
00857 
00858                             lab->set_label(lines, atof((const char*) data));
00859                             break;
00860                         }
00861                     }
00862 
00863                     int32_t d=0;
00864                     j++;
00865                     uint8_t* start=&data[j];
00866                     for (; j<len; j++)
00867                     {
00868                         if (data[j]==':')
00869                         {
00870                             data[j]='\0';
00871 
00872                             feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
00873                             num_features=CMath::max(num_features, feat[d].feat_index+1);
00874 
00875                             j++;
00876                             start=&data[j];
00877                             for (; j<len; j++)
00878                             {
00879                                 if (data[j]==' ' || data[j]=='\n')
00880                                 {
00881                                     data[j]='\0';
00882                                     feat[d].entry=(ST) atof((const char*) start);
00883                                     d++;
00884                                     break;
00885                                 }
00886                             }
00887 
00888                             if (j==len)
00889                             {
00890                                 data[j]='\0';
00891                                 feat[dims-1].entry=(ST) atof((const char*) start);
00892                             }
00893 
00894                             j++;
00895                             start=&data[j];
00896                         }
00897                     }
00898 
00899                     sparse_feature_matrix[lines].vec_index=lines;
00900                     sparse_feature_matrix[lines].num_feat_entries=dims;
00901                     sparse_feature_matrix[lines].features=feat;
00902 
00903                     old_sz=i+1;
00904                     lines++;
00905                     SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00906                 }
00907             }
00908         }
00909         SG_INFO("file successfully read\n");
00910         fclose(f);
00911     }
00912 
00913     SG_FREE(dummy);
00914 
00915     if (do_sort_features)
00916         sort_features();
00917 
00918     return lab;
00919 }
00920 
00921 template<class ST> void CSparseFeatures<ST>::sort_features()
00922 {
00923     if (m_subset)
00924         SG_ERROR("sort_features() not allowed with subset\n");
00925 
00926     ASSERT(get_num_preprocessors()==0);
00927 
00928     if (!sparse_feature_matrix)
00929         SG_ERROR("Requires sparse feature matrix to be available in-memory\n");
00930 
00931     for (int32_t i=0; i<num_vectors; i++)
00932     {
00933         int32_t len=sparse_feature_matrix[i].num_feat_entries;
00934 
00935         if (!len)
00936             continue;
00937 
00938         SGSparseVectorEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
00939         int32_t* feat_idx=SG_MALLOC(int32_t, len);
00940         int32_t* orig_idx=SG_MALLOC(int32_t, len);
00941 
00942         for (int j=0; j<len; j++)
00943         {
00944             feat_idx[j]=sf_orig[j].feat_index;
00945             orig_idx[j]=j;
00946         }
00947 
00948         CMath::qsort_index(feat_idx, orig_idx, len);
00949 
00950         SGSparseVectorEntry<ST>* sf_new= SG_MALLOC(SGSparseVectorEntry<ST>, len);
00951         for (int j=0; j<len; j++)
00952             sf_new[j]=sf_orig[orig_idx[j]];
00953 
00954         sparse_feature_matrix[i].features=sf_new;
00955 
00956         // sanity check
00957         for (int j=0; j<len-1; j++)
00958             ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
00959 
00960         SG_FREE(orig_idx);
00961         SG_FREE(feat_idx);
00962         SG_FREE(sf_orig);
00963     }
00964 }
00965 
00966 template<class ST> bool CSparseFeatures<ST>::write_svmlight_file(char* fname,
00967         CLabels* label)
00968 {
00969     if (m_subset)
00970         SG_ERROR("write_svmlight_file() not allowed with subset\n");
00971 
00972     ASSERT(label);
00973     int32_t num=label->get_num_labels();
00974     ASSERT(num>0);
00975     ASSERT(num==num_vectors);
00976 
00977     FILE* f=fopen(fname, "wb");
00978 
00979     if (f)
00980     {
00981         for (int32_t i=0; i<num; i++)
00982         {
00983             fprintf(f, "%d ", (int32_t) label->get_int_label(i));
00984 
00985             SGSparseVectorEntry<ST>* vec = sparse_feature_matrix[i].features;
00986             int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
00987 
00988             for (int32_t j=0; j<num_feat; j++)
00989             {
00990                 if (j<num_feat-1)
00991                     fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00992                 else
00993                     fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00994             }
00995         }
00996 
00997         fclose(f);
00998         return true;
00999     }
01000     return false;
01001 }
01002 
01003 template<class ST> int32_t CSparseFeatures<ST>::get_dim_feature_space() const
01004 {
01005     return num_features;
01006 }
01007 
01008 template<class ST> float64_t CSparseFeatures<ST>::dot(int32_t vec_idx1,
01009         CDotFeatures* df, int32_t vec_idx2)
01010 {
01011     ASSERT(df);
01012     ASSERT(df->get_feature_type() == get_feature_type());
01013     ASSERT(df->get_feature_class() == get_feature_class());
01014     CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df;
01015 
01016     SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1);
01017     SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2);
01018 
01019     float64_t result=sparse_dot(1, avec.features, avec.num_feat_entries,
01020         bvec.features, bvec.num_feat_entries);
01021 
01022     free_sparse_feature_vector(avec, vec_idx1);
01023     sf->free_sparse_feature_vector(bvec, vec_idx2);
01024 
01025     return result;
01026 }
01027 template<class ST> float64_t CSparseFeatures<ST>::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01028 {
01029     ASSERT(vec2);
01030     if (vec2_len!=num_features)
01031     {
01032         SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
01033                 vec2_len, num_features);
01034     }
01035     float64_t result=0;
01036 
01037     SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1);
01038 
01039     if (sv.features)
01040     {
01041         for (int32_t i=0; i<sv.num_feat_entries; i++)
01042             result+=vec2[sv.features[i].feat_index]*sv.features[i].entry;
01043     }
01044 
01045     free_sparse_feature_vector(sv, vec_idx1);
01046 
01047     return result;
01048 }
01049 
01050 template<class ST> void* CSparseFeatures<ST>::get_feature_iterator(int32_t vector_index)
01051 {
01052     if (vector_index>=get_num_vectors())
01053     {
01054         SG_ERROR("Index out of bounds (number of vectors %d, you "
01055                 "requested %d)\n", get_num_vectors(), vector_index);
01056     }
01057 
01058     if (!sparse_feature_matrix)
01059         SG_ERROR("Requires a in-memory feature matrix\n");
01060 
01061     sparse_feature_iterator* it=SG_MALLOC(sparse_feature_iterator, 1);
01062     it->sv=get_sparse_feature_vector(vector_index);
01063     it->index=0;
01064 
01065     return it;
01066 }
01067 
01068 template<class ST> bool CSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, void* iterator)
01069 {
01070     sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01071     if (!it || it->index>=it->sv.num_feat_entries)
01072         return false;
01073 
01074     int32_t i=it->index++;
01075 
01076     index=it->sv.features[i].feat_index;
01077     value=(float64_t) it->sv.features[i].entry;
01078 
01079     return true;
01080 }
01081 
01082 template<class ST> void CSparseFeatures<ST>::free_feature_iterator(void* iterator)
01083 {
01084     if (!iterator)
01085         return;
01086 
01087     sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01088     free_sparse_feature_vector(it->sv, it->sv.vec_index);
01089     SG_FREE(it);
01090 }
01091 
01092 template<class ST> CFeatures* CSparseFeatures<ST>::copy_subset(SGVector<index_t> indices)
01093 {
01094     SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(indices.vlen,
01095         get_dim_feature_space());
01096 
01097     for (index_t i=0; i<indices.vlen; ++i)
01098     {
01099         /* index to copy */
01100         index_t index=indices.vector[i];
01101 
01102         /* copy sparse vector TODO THINK ABOUT VECTOR INDEX (i or vec.index*/
01103         SGSparseVector<ST> current=get_sparse_feature_vector(index);
01104         matrix_copy.sparse_matrix[i]=SGSparseVector<ST>(
01105             current.num_feat_entries, current.vec_index);
01106 
01107         /* copy entries */
01108         memcpy(matrix_copy.sparse_matrix[i].features, current.features,
01109             sizeof(SGSparseVectorEntry<ST>)*current.num_feat_entries);
01110 
01111         free_sparse_feature_vector(current, index);
01112     }
01113 
01114     return new CSparseFeatures<ST>(matrix_copy);
01115 }
01116 
01117 template<class ST> SGSparseVectorEntry<ST>* CSparseFeatures<ST>::compute_sparse_feature_vector(int32_t num,
01118     int32_t& len, SGSparseVectorEntry<ST>* target)
01119 {
01120     SG_NOTIMPLEMENTED;
01121 
01122     len=0;
01123     return NULL;
01124 }
01125 
01126 template<class ST> void CSparseFeatures<ST>::init()
01127 {
01128     set_generic<ST>();
01129 
01130     m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
01131             "sparse_feature_matrix",
01132             "Array of sparse vectors.");
01133     m_parameters->add(&num_features, "num_features",
01134             "Total number of features.");
01135 }
01136 
01137 #define GET_FEATURE_TYPE(sg_type, f_type)                                   \
01138 template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type()    \
01139 {                                                                           \
01140     return f_type;                                                          \
01141 }
01142 GET_FEATURE_TYPE(bool, F_BOOL)
01143 GET_FEATURE_TYPE(char, F_CHAR)
01144 GET_FEATURE_TYPE(uint8_t, F_BYTE)
01145 GET_FEATURE_TYPE(int8_t, F_BYTE)
01146 GET_FEATURE_TYPE(int16_t, F_SHORT)
01147 GET_FEATURE_TYPE(uint16_t, F_WORD)
01148 GET_FEATURE_TYPE(int32_t, F_INT)
01149 GET_FEATURE_TYPE(uint32_t, F_UINT)
01150 GET_FEATURE_TYPE(int64_t, F_LONG)
01151 GET_FEATURE_TYPE(uint64_t, F_ULONG)
01152 GET_FEATURE_TYPE(float32_t, F_SHORTREAL)
01153 GET_FEATURE_TYPE(float64_t, F_DREAL)
01154 GET_FEATURE_TYPE(floatmax_t, F_LONGREAL)
01155 #undef GET_FEATURE_TYPE
01156 
01157 #define LOAD(fname, sg_type)                                            \
01158 template<> void CSparseFeatures<sg_type>::load(CFile* loader)   \
01159 {                                                                       \
01160     remove_subset();                                                    \
01161     SG_SET_LOCALE_C;                                                    \
01162     ASSERT(loader);                                                     \
01163     SGSparseVector<sg_type>* matrix=NULL;                                       \
01164     int32_t num_feat=0;                                                 \
01165     int32_t num_vec=0;                                                  \
01166     loader->fname(matrix, num_feat, num_vec);                           \
01167     set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec));              \
01168     SG_RESET_LOCALE;                                                    \
01169 }
01170 LOAD(get_sparse_matrix, bool)
01171 LOAD(get_sparse_matrix, char)
01172 LOAD(get_sparse_matrix, uint8_t)
01173 LOAD(get_int8_sparsematrix, int8_t)
01174 LOAD(get_sparse_matrix, int16_t)
01175 LOAD(get_sparse_matrix, uint16_t)
01176 LOAD(get_sparse_matrix, int32_t)
01177 LOAD(get_uint_sparsematrix, uint32_t)
01178 LOAD(get_long_sparsematrix, int64_t)
01179 LOAD(get_ulong_sparsematrix, uint64_t)
01180 LOAD(get_sparse_matrix, float32_t)
01181 LOAD(get_sparse_matrix, float64_t)
01182 LOAD(get_longreal_sparsematrix, floatmax_t)
01183 #undef LOAD
01184 
01185 #define WRITE(fname, sg_type)                                           \
01186 template<> void CSparseFeatures<sg_type>::save(CFile* writer)   \
01187 {                                                                       \
01188     if (m_subset)                                                       \
01189         SG_ERROR("save() not allowed with subset\n");                   \
01190     SG_SET_LOCALE_C;                                                    \
01191     ASSERT(writer);                                                     \
01192     writer->fname(sparse_feature_matrix, num_features, num_vectors);    \
01193     SG_RESET_LOCALE;                                                    \
01194 }
01195 WRITE(set_sparse_matrix, bool)
01196 WRITE(set_sparse_matrix, char)
01197 WRITE(set_sparse_matrix, uint8_t)
01198 WRITE(set_int8_sparsematrix, int8_t)
01199 WRITE(set_sparse_matrix, int16_t)
01200 WRITE(set_sparse_matrix, uint16_t)
01201 WRITE(set_sparse_matrix, int32_t)
01202 WRITE(set_uint_sparsematrix, uint32_t)
01203 WRITE(set_long_sparsematrix, int64_t)
01204 WRITE(set_ulong_sparsematrix, uint64_t)
01205 WRITE(set_sparse_matrix, float32_t)
01206 WRITE(set_sparse_matrix, float64_t)
01207 WRITE(set_longreal_sparsematrix, floatmax_t)
01208 #undef WRITE
01209 
01210 template class CSparseFeatures<bool>;
01211 template class CSparseFeatures<char>;
01212 template class CSparseFeatures<int8_t>;
01213 template class CSparseFeatures<uint8_t>;
01214 template class CSparseFeatures<int16_t>;
01215 template class CSparseFeatures<uint16_t>;
01216 template class CSparseFeatures<int32_t>;
01217 template class CSparseFeatures<uint32_t>;
01218 template class CSparseFeatures<int64_t>;
01219 template class CSparseFeatures<uint64_t>;
01220 template class CSparseFeatures<float32_t>;
01221 template class CSparseFeatures<float64_t>;
01222 template class CSparseFeatures<floatmax_t>;
01223 }