SparseFeatures.cpp

00001 #include <shogun/features/SparseFeatures.h>
00002 #include <shogun/preprocessor/SparsePreprocessor.h>
00003 #include <shogun/mathematics/Math.h>
00004 #include <shogun/lib/DataType.h>
00005 #include <shogun/labels/RegressionLabels.h>
00006 #include <shogun/io/SGIO.h>
00007 
00008 #include <string.h>
00009 #include <stdlib.h>
00010 
00011 namespace shogun
00012 {
00013 
00014 template<class ST> CSparseFeatures<ST>::CSparseFeatures(int32_t size)
00015 : CDotFeatures(size), num_vectors(0), num_features(0),
00016     sparse_feature_matrix(NULL), feature_cache(NULL)
00017 {
00018     init();
00019 }
00020 
00021 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseVector<ST>* src,
00022         int32_t num_feat, int32_t num_vec, bool copy)
00023 : CDotFeatures(0), num_vectors(num_vec), num_features(num_feat),
00024     sparse_feature_matrix(NULL), feature_cache(NULL)
00025 {
00026     init();
00027 
00028     sparse_feature_matrix = src;
00029     //SG_MALLOC(SGSparseVector<ST>, num_vec);
00030     //for (int32_t i=0; i< num_vec; i++)
00031     //{
00032     //  new (&sparse_feature_matrix[i]) SGSparseVector<ST>();
00033     //  sparse_feature_matrix[i] = src[i];
00034     //}
00035 }
00036 
00037 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseMatrix<ST> sparse)
00038 : CDotFeatures(0), num_vectors(0), num_features(0),
00039     sparse_feature_matrix(NULL), feature_cache(NULL)
00040 {
00041     init();
00042 
00043     set_sparse_feature_matrix(sparse);
00044 }
00045 
00046 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGMatrix<ST> dense)
00047 : CDotFeatures(0), num_vectors(0), num_features(0),
00048     sparse_feature_matrix(NULL), feature_cache(NULL)
00049 {
00050     init();
00051 
00052     set_full_feature_matrix(dense);
00053 }
00054 
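// --- Illustrative sketch, not part of the original SparseFeatures.cpp ---
// Typical use of the dense-matrix constructor above: every non-zero entry of
// the SGMatrix becomes one SGSparseVectorEntry. All values below are made up.
static void example_sparse_from_dense()
{
    SGMatrix<float64_t> dense(2, 3);                  // 2 features x 3 vectors
    dense.zero();
    dense.matrix[1*2+0]=1.5;                          // vector 1, feature 0 (column-major)
    dense.matrix[2*2+1]=-0.5;                         // vector 2, feature 1

    CSparseFeatures<float64_t>* feats=new CSparseFeatures<float64_t>(dense);
    SG_SPRINT("vectors: %d, non-zeros in vector 1: %d\n",
            feats->get_num_vectors(), feats->get_nnz_features_for_vector(1));
    SG_UNREF(feats);
}
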
00055 template<class ST> CSparseFeatures<ST>::CSparseFeatures(const CSparseFeatures & orig)
00056 : CDotFeatures(orig), num_vectors(orig.num_vectors),
00057     num_features(orig.num_features),
00058     sparse_feature_matrix(orig.sparse_feature_matrix),
00059     feature_cache(orig.feature_cache)
00060 {
00061     init();
00062 
00063     if (orig.sparse_feature_matrix)
00064     {
00065         free_sparse_feature_matrix();
00066         sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
00067         for (int32_t i=0; i< num_vectors; i++)
00068         {
00069             new (&sparse_feature_matrix[i]) SGSparseVector<ST>();
00070             sparse_feature_matrix[i] = orig.sparse_feature_matrix[i];
00071         }
00072     }
00073 
00074     m_subset_stack=orig.m_subset_stack;
00075     SG_REF(m_subset_stack);
00076 }
00077 template<class ST> CSparseFeatures<ST>::CSparseFeatures(CFile* loader)
00078 : CDotFeatures(loader), num_vectors(0), num_features(0),
00079     sparse_feature_matrix(NULL), feature_cache(NULL)
00080 {
00081     init();
00082 
00083     load(loader);
00084 }
00085 
00086 template<class ST> CSparseFeatures<ST>::~CSparseFeatures()
00087 {
00088     free_sparse_features();
00089 }
00090 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_matrix()
00091 {
00092     for (int32_t i=0; i<num_vectors; i++)
00093         (&sparse_feature_matrix[i])->~SGSparseVector();
00094 
00095     SG_FREE(sparse_feature_matrix);
00096     num_vectors=0;
00097     num_features=0;
00098     remove_all_subsets();
00099 }
00100 template<class ST> void CSparseFeatures<ST>::free_sparse_features()
00101 {
00102     free_sparse_feature_matrix();
00103     delete feature_cache;
00104     feature_cache = NULL;
00105 }
00106 template<class ST> CFeatures* CSparseFeatures<ST>::duplicate() const
00107 {
00108     return new CSparseFeatures<ST>(*this);
00109 }
00110 
00111 template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index)
00112 {
00113     ASSERT(index>=0 && index<num_features);
00114     ASSERT(num>=0 && num<get_num_vectors());
00115 
00116     int32_t i;
00117     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00118     ST ret=0;
00119 
00120     if (sv.features)
00121     {
00122         for (i=0; i<sv.num_feat_entries; i++)
00123             if (sv.features[i].feat_index==index)
00124                 ret+=sv.features[i].entry;
00125     }
00126 
00127     free_sparse_feature_vector(num);
00128 
00129     return ret;
00130 }
00131 
00132 template<class ST> ST* CSparseFeatures<ST>::get_full_feature_vector(int32_t num, int32_t& len)
00133 {
00134     int32_t i;
00135     len=0;
00136     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00137     ST* fv=NULL;
00138 
00139     if (sv.features)
00140     {
00141         len=num_features;
00142         fv=SG_MALLOC(ST, num_features);
00143 
00144         for (i=0; i<num_features; i++)
00145             fv[i]=0;
00146 
00147         for (i=0; i<sv.num_feat_entries; i++)
00148             fv[sv.features[i].feat_index]= sv.features[i].entry;
00149     }
00150 
00151     free_sparse_feature_vector(num);
00152 
00153     return fv;
00154 }
00155 
00156 template<class ST> SGVector<ST> CSparseFeatures<ST>::get_full_feature_vector(int32_t num)
00157 {
00158     if (num>=get_num_vectors())
00159     {
00160         SG_ERROR("Index out of bounds (number of vectors %d, you "
00161                 "requested %d)\n", get_num_vectors(), num);
00162     }
00163 
00164     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00165 
00166     SGVector<ST> dense;
00167 
00168     if (sv.features)
00169     {
00170         dense=SGVector<ST>(num_features);
00171         dense.zero();
00172 
00173         for (int32_t i=0; i<sv.num_feat_entries; i++)
00174             dense.vector[sv.features[i].feat_index]= sv.features[i].entry;
00175     }
00176 
00177     free_sparse_feature_vector(num);
00178 
00179     return dense;
00180 }
00181 
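// --- Illustrative sketch, not part of the original SparseFeatures.cpp ---
// Densifying a single vector via get_full_feature_vector() above; indices that
// are absent from the sparse representation come back as zeros. 'feats' is
// assumed to hold at least one vector.
static void example_densify_one_vector(CSparseFeatures<float64_t>* feats)
{
    SGVector<float64_t> dense=feats->get_full_feature_vector(0);
    for (int32_t i=0; i<dense.vlen; i++)
        SG_SPRINT("x[0][%d] = %f\n", i, dense.vector[i]);
}
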
00182 template<class ST> int32_t CSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num)
00183 {
00184     SGSparseVector<ST> sv = get_sparse_feature_vector(num);
00185     int32_t len=sv.num_feat_entries;
00186     free_sparse_feature_vector(num);
00187     return len;
00188 }
00189 
00190 template<class ST> SGSparseVector<ST> CSparseFeatures<ST>::get_sparse_feature_vector(int32_t num)
00191 {
00192     ASSERT(num<get_num_vectors());
00193 
00194     index_t real_num=m_subset_stack->subset_idx_conversion(num);
00195 
00196     SGSparseVector<ST> result;
00197 
00198     if (sparse_feature_matrix)
00199     {
00200         return sparse_feature_matrix[real_num];
00201     }
00202     else
00203     {
00204         if (feature_cache)
00205         {
00206             result.features=feature_cache->lock_entry(num);
00207 
00208             if (result.features)
00209                 return result;
00210             else
00211             {
00212                 result.features=feature_cache->set_entry(num);
00213             }
00214         }
00215 
00216         //if (!result.features)
00217         //  result.do_free=true;
00218 
00219         result.features=compute_sparse_feature_vector(num,
00220             result.num_feat_entries, result.features);
00221 
00222 
00223         if (get_num_preprocessors())
00224         {
00225             int32_t tmp_len=result.num_feat_entries;
00226             SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
00227             SGSparseVectorEntry<ST>* tmp_feat_after = NULL;
00228 
00229             for (int32_t i=0; i<get_num_preprocessors(); i++)
00230             {
00231                 //tmp_feat_after=((CSparsePreprocessor<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);
00232 
00233                 if (i!=0)   // delete feature vector, except for the first one, i.e., feat
00234                     SG_FREE(tmp_feat_before);
00235                 tmp_feat_before=tmp_feat_after;
00236             }
00237 
00238             memcpy(result.features, tmp_feat_after,
00239                     sizeof(SGSparseVectorEntry<ST>)*tmp_len);
00240 
00241             SG_FREE(tmp_feat_after);
00242             result.num_feat_entries=tmp_len;
00243             SG_DEBUG( "len: %d len2: %d\n", result.num_feat_entries, num_features);
00244         }
00245         return result;
00246     }
00247 }
00248 
00249 template<class ST> ST CSparseFeatures<ST>::dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00250 {
00251     ASSERT(vec);
00252     ASSERT(dim==num_features);
00253     ST result=b;
00254 
00255     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00256 
00257     if (sv.features)
00258     {
00259         for (int32_t i=0; i<sv.num_feat_entries; i++)
00260         {
00261             result+=alpha*vec[sv.features[i].feat_index]
00262                 *sv.features[i].entry;
00263         }
00264     }
00265 
00266     free_sparse_feature_vector(num);
00267     return result;
00268 }
00269 
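// --- Illustrative sketch, not part of the original SparseFeatures.cpp ---
// dense_dot() above returns b + alpha*<x_num, vec>, touching only the non-zero
// entries of the sparse vector x_num. 'feats' and 'num' are assumed valid.
static float64_t example_affine_dot(CSparseFeatures<float64_t>* feats, int32_t num)
{
    int32_t dim=feats->get_num_features();
    float64_t* w=SG_MALLOC(float64_t, dim);
    for (int32_t i=0; i<dim; i++)
        w[i]=1.0;                                     // all-ones weight vector

    float64_t result=feats->dense_dot(2.0, num, w, dim, 0.5);  // 2*<x_num, w> + 0.5
    SG_FREE(w);
    return result;
}
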
00270 template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val)
00271 {
00272     ASSERT(vec);
00273     if (dim!=num_features)
00274     {
00275         SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00276                 dim, num_features);
00277     }
00278 
00279     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00280 
00281     if (sv.features)
00282     {
00283         if (abs_val)
00284         {
00285             for (int32_t i=0; i<sv.num_feat_entries; i++)
00286             {
00287                 vec[sv.features[i].feat_index]+=alpha
00288                     *CMath::abs(sv.features[i].entry);
00289             }
00290         }
00291         else
00292         {
00293             for (int32_t i=0; i<sv.num_feat_entries; i++)
00294             {
00295                 vec[sv.features[i].feat_index]+=alpha
00296                         *sv.features[i].entry;
00297             }
00298         }
00299     }
00300 
00301     free_sparse_feature_vector(num);
00302 }
00303 
00304 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_vector(int32_t num)
00305 {
00306     if (feature_cache)
00307         feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
00308 
00309     //vec.free_vector();
00310 }
00311 
00312 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00313 {
00314     if (m_subset_stack->has_subsets())
00315         SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
00316 
00317     num_feat=num_features;
00318     num_vec=num_vectors;
00319 
00320     return sparse_feature_matrix;
00321 }
00322 
00323 template<class ST> SGSparseMatrix<ST> CSparseFeatures<ST>::get_sparse_feature_matrix()
00324 {
00325     if (m_subset_stack->has_subsets())
00326         SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
00327 
00328     SGSparseMatrix<ST> sm=SGSparseMatrix<ST>(NULL, 0, 0, false);
00329     sm.sparse_matrix=get_sparse_feature_matrix(sm.num_features, sm.num_vectors);
00330     return sm;
00331 }
00332 
00333 template<class ST> CSparseFeatures<ST>* CSparseFeatures<ST>::get_transposed()
00334 {
00335     int32_t num_feat;
00336     int32_t num_vec;
00337     SGSparseVector<ST>* s=get_transposed(num_feat, num_vec);
00338     //SG_PRINT("num_feat = %d , num_vec = %d \n", num_feat, num_vec);
00339     return new CSparseFeatures<ST>(s, num_feat, num_vec);
00340 }
00341 
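// --- Illustrative sketch, not part of the original SparseFeatures.cpp ---
// get_transposed() above swaps the roles of features and vectors, so entry
// (vector v, feature f) of the original becomes entry (vector f, feature v) of
// the result. 'feats' is assumed to hold at least one vector and one feature.
static void example_transpose(CSparseFeatures<float64_t>* feats)
{
    CSparseFeatures<float64_t>* tr=feats->get_transposed();
    SG_SPRINT("original: %d vectors, transposed: %d features\n",
            feats->get_num_vectors(), tr->get_num_features());
    SG_UNREF(tr);
}
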
00342 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
00343 {
00344     num_feat=get_num_vectors();
00345     num_vec=num_features;
00346     //SG_PRINT("get transposed num_feat = %d , num_vec = %d \n", num_feat, num_vec);
00347 
00348     int32_t* hist=SG_MALLOC(int32_t, num_features);
00349     memset(hist, 0, sizeof(int32_t)*num_features);
00350 
00351     // count the lengths of the future feature vectors
00352     for (int32_t v=0; v<num_feat; v++)
00353     {
00354         SGSparseVector<ST> sv=get_sparse_feature_vector(v);
00355 
00356         for (int32_t i=0; i<sv.num_feat_entries; i++)
00357             hist[sv.features[i].feat_index]++;
00358     }
00359 
00360     // allocate room for future feature vectors
00361     SGSparseVector<ST>* sfm=SG_MALLOC(SGSparseVector<ST>, num_vec);
00362     for (int32_t v=0; v<num_vec; v++)
00363         new (&sfm[v]) SGSparseVector<ST>(hist[v]);
00364 
00365     // fill future feature vectors with content
00366     memset(hist,0,sizeof(int32_t)*num_features);
00367     for (int32_t v=0; v<num_feat; v++)
00368     {
00369         SGSparseVector<ST> sv=get_sparse_feature_vector(v);
00370 
00371         for (int32_t i=0; i<sv.num_feat_entries; i++)
00372         {
00373             int32_t vidx=sv.features[i].feat_index;
00374             int32_t fidx=v;
00375             sfm[vidx].features[hist[vidx]].feat_index=fidx;
00376             sfm[vidx].features[hist[vidx]].entry=sv.features[i].entry;
00377             hist[vidx]++;
00378         }
00379 
00380     }
00381 
00382     SG_FREE(hist);
00383     return sfm;
00384 }
00385 
00386 template<class ST> void CSparseFeatures<ST>::set_sparse_feature_matrix(SGSparseMatrix<ST> sm)
00387 {
00388     if (m_subset_stack->has_subsets())
00389         SG_ERROR("set_sparse_feature_matrix() not allowed with subset\n");
00390 
00391     SGSparseVector<ST>* sparse_matrix = SG_MALLOC(SGSparseVector<ST>, sm.num_vectors);
00392     for (int32_t i=0; i<sm.num_vectors; i++)
00393     {
00394         new (&sparse_matrix[i]) SGSparseVector<ST>();
00395         sparse_matrix[i] = sm[i];
00396     }
00397 
00398     sparse_feature_matrix=sparse_matrix;
00399     num_features=sm.num_features;
00400     num_vectors=sm.num_vectors;
00401 }
00402 
00403 template<class ST> SGMatrix<ST> CSparseFeatures<ST>::get_full_feature_matrix()
00404 {
00405     SGMatrix<ST> full(num_features, get_num_vectors());
00406     full.zero();
00407 
00408     SG_INFO( "converting sparse features to full feature matrix of %d x %d entries\n", num_vectors, num_features);
00409 
00410     for (int32_t v=0; v<full.num_cols; v++)
00411     {
00412         int32_t idx=m_subset_stack->subset_idx_conversion(v);
00413         SGSparseVector<ST> current=sparse_feature_matrix[idx];
00414 
00415         for (int32_t f=0; f<current.num_feat_entries; f++)
00416         {
00417             int64_t offs=(idx*num_features)
00418                     +current.features[f].feat_index;
00419 
00420             full.matrix[offs]=current.features[f].entry;
00421         }
00422     }
00423 
00424     return full;
00425 }
00426 
00427 template<class ST> bool CSparseFeatures<ST>::set_full_feature_matrix(SGMatrix<ST> full)
00428 {
00429     remove_all_subsets();
00430 
00431     ST* src=full.matrix;
00432     int32_t num_feat=full.num_rows;
00433     int32_t num_vec=full.num_cols;
00434 
00435     free_sparse_feature_matrix();
00436     bool result=true;
00437     num_features=num_feat;
00438     num_vectors=num_vec;
00439 
00440     SG_INFO("converting dense feature matrix to sparse one\n");
00441     int32_t* num_feat_entries=SG_MALLOC(int, num_vectors);
00442 
00443     if (num_feat_entries)
00444     {
00445         int64_t num_total_entries=0;
00446 
00447         // count nr of non sparse features
00448         for (int32_t i=0; i< num_vec; i++)
00449         {
00450             num_feat_entries[i]=0;
00451             for (int32_t j=0; j< num_feat; j++)
00452             {
00453                 if (src[i*((int64_t) num_feat) + j] != 0)
00454                     num_feat_entries[i]++;
00455             }
00456         }
00457 
00458         if (num_vec>0)
00459         {
00460             sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vec);
00461 
00462             if (sparse_feature_matrix)
00463             {
00464                 for (int32_t i=0; i< num_vec; i++)
00465                 {
00466                     new(&sparse_feature_matrix[i]) SGSparseVector<ST>();
00467                     sparse_feature_matrix[i] = SGSparseVector<ST>(num_feat_entries[i]);
00468                     int32_t sparse_feat_idx=0;
00469 
00470                     for (int32_t j=0; j< num_feat; j++)
00471                     {
00472                         int64_t pos=i*((int64_t) num_feat) + j;
00473 
00474                         if (src[pos] != 0)
00475                         {
00476                             sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00477                             sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00478                             sparse_feat_idx++;
00479                             num_total_entries++;
00480                         }
00481                     }
00482                 }
00483             }
00484             else
00485             {
00486                 SG_ERROR( "allocation of sparse feature matrix failed\n");
00487                 result=false;
00488             }
00489 
00490             SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00491                     num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
00492         }
00493         else
00494         {
00495             SG_ERROR( "huh ? zero size matrix given ?\n");
00496             result=false;
00497         }
00498     }
00499     SG_FREE(num_feat_entries);
00500     return result;
00501 }
00502 
00503 template<class ST> bool CSparseFeatures<ST>::apply_preprocessor(bool force_preprocessing)
00504 {
00505     SG_INFO( "force: %d\n", force_preprocessing);
00506 
00507     if ( sparse_feature_matrix && get_num_preprocessors() )
00508     {
00509         for (int32_t i=0; i<get_num_preprocessors(); i++)
00510         {
00511             if ( (!is_preprocessed(i) || force_preprocessing) )
00512             {
00513                 set_preprocessed(i);
00514                 SG_INFO( "preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
00515                 if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL)
00516                     return false;
00517             }
00518             return true;
00519         }
00520         return true;
00521     }
00522     else
00523     {
00524         SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00525         return false;
00526     }
00527 }
00528 
00529 template<class ST> int32_t CSparseFeatures<ST>::get_size() const
00530 {
00531     return sizeof(ST);
00532 }
00533 
00534 template<class ST> bool CSparseFeatures<ST>::obtain_from_simple(CDenseFeatures<ST>* sf)
00535 {
00536     SGMatrix<ST> fm=sf->get_feature_matrix();
00537     ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0);
00538 
00539     return set_full_feature_matrix(fm);
00540 }
00541 
00542 template<class ST> int32_t  CSparseFeatures<ST>::get_num_vectors() const
00543 {
00544     return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
00545 }
00546 
00547 template<class ST> int32_t  CSparseFeatures<ST>::get_num_features()
00548 {
00549     return num_features;
00550 }
00551 
00552 template<class ST> int32_t CSparseFeatures<ST>::set_num_features(int32_t num)
00553 {
00554     int32_t n=num_features;
00555     ASSERT(n<=num);
00556     num_features=num;
00557     return num_features;
00558 }
00559 
00560 template<class ST> EFeatureClass CSparseFeatures<ST>::get_feature_class() const
00561 {
00562     return C_SPARSE;
00563 }
00564 
00565 template<class ST> void CSparseFeatures<ST>::free_feature_vector(int32_t num)
00566 {
00567     if (feature_cache)
00568         feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
00569 
00570     //vec.free_vector();
00571 }
00572 
00573 template<class ST> int64_t CSparseFeatures<ST>::get_num_nonzero_entries()
00574 {
00575     int64_t num=0;
00576     index_t num_vec=get_num_vectors();
00577     for (int32_t i=0; i<num_vec; i++)
00578         num+=sparse_feature_matrix[m_subset_stack->subset_idx_conversion(i)].num_feat_entries;
00579 
00580     return num;
00581 }
00582 
00583 template<class ST> float64_t* CSparseFeatures<ST>::compute_squared(float64_t* sq)
00584 {
00585     ASSERT(sq);
00586 
00587     index_t num_vec=get_num_vectors();
00588     for (int32_t i=0; i<num_vec; i++)
00589     {
00590         sq[i]=0;
00591         SGSparseVector<ST> vec=get_sparse_feature_vector(i);
00592 
00593         for (int32_t j=0; j<vec.num_feat_entries; j++)
00594             sq[i]+=vec.features[j].entry*vec.features[j].entry;
00595 
00596         free_feature_vector(i);
00597     }
00598 
00599     return sq;
00600 }
00601 
00602 template<class ST> float64_t CSparseFeatures<ST>::compute_squared_norm(
00603         CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a,
00604         CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
00605 {
00606     int32_t i,j;
00607     ASSERT(lhs);
00608     ASSERT(rhs);
00609 
00610     SGSparseVector<float64_t> avec=lhs->get_sparse_feature_vector(idx_a);
00611     SGSparseVector<float64_t> bvec=rhs->get_sparse_feature_vector(idx_b);
00612     ASSERT(avec.features);
00613     ASSERT(bvec.features);
00614 
00615     float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
00616 
00617     if (avec.num_feat_entries<=bvec.num_feat_entries)
00618     {
00619         j=0;
00620         for (i=0; i<avec.num_feat_entries; i++)
00621         {
00622             int32_t a_feat_idx=avec.features[i].feat_index;
00623 
00624             while ((j<bvec.num_feat_entries)
00625                     &&(bvec.features[j].feat_index<a_feat_idx))
00626                 j++;
00627 
00628             if ((j<bvec.num_feat_entries)
00629                     &&(bvec.features[j].feat_index==a_feat_idx))
00630             {
00631                 result-=2*(avec.features[i].entry*bvec.features[j].entry);
00632                 j++;
00633             }
00634         }
00635     }
00636     else
00637     {
00638         j=0;
00639         for (i=0; i<bvec.num_feat_entries; i++)
00640         {
00641             int32_t b_feat_idx=bvec.features[i].feat_index;
00642 
00643             while ((j<avec.num_feat_entries)
00644                     &&(avec.features[j].feat_index<b_feat_idx))
00645                 j++;
00646 
00647             if ((j<avec.num_feat_entries)
00648                     &&(avec.features[j].feat_index==b_feat_idx))
00649             {
00650                 result-=2*(bvec.features[i].entry*avec.features[j].entry);
00651                 j++;
00652             }
00653         }
00654     }
00655 
00656     ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(idx_a);
00657     ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(idx_b);
00658 
00659     return CMath::abs(result);
00660 }
00661 
00662 template<class ST> CRegressionLabels* CSparseFeatures<ST>::load_svmlight_file(char* fname,
00663         bool do_sort_features)
00664 {
00665     remove_all_subsets();
00666 
00667     CRegressionLabels* lab=NULL;
00668 
00669     size_t blocksize=1024*1024;
00670     size_t required_blocksize=blocksize;
00671     uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00672     FILE* f=fopen(fname, "r");
00673 
00674     if (f)
00675     {
00676         free_sparse_feature_matrix();
00677         num_vectors=0;
00678         num_features=0;
00679 
00680         SG_INFO("counting line numbers in file %s\n", fname);
00681         size_t sz=blocksize;
00682         size_t block_offs=0;
00683         size_t old_block_offs=0;
00684         fseek(f, 0, SEEK_END);
00685         size_t fsize=ftell(f);
00686         rewind(f);
00687 
00688         while (sz == blocksize)
00689         {
00690             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00691             for (size_t i=0; i<sz; i++)
00692             {
00693                 block_offs++;
00694                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00695                 {
00696                     num_vectors++;
00697                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00698                     old_block_offs=block_offs;
00699                 }
00700             }
00701             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00702         }
00703 
00704         SG_INFO("found %d feature vectors\n", num_vectors);
00705         SG_FREE(dummy);
00706         blocksize=required_blocksize;
00707         dummy = SG_MALLOC(uint8_t, blocksize+1); //allow setting of '\0' at EOL
00708 
00709         lab=new CRegressionLabels(num_vectors);
00710         sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
00711         for (int32_t i=0; i<num_vectors; i++)
00712             new (&sparse_feature_matrix[i]) SGSparseVector<ST>();
00713         rewind(f);
00714         sz=blocksize;
00715         int32_t lines=0;
00716         while (sz == blocksize)
00717         {
00718             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00719 
00720             size_t old_sz=0;
00721             for (size_t i=0; i<sz; i++)
00722             {
00723                 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00724                 {
00725                     size_t len=i-old_sz+1;
00726                     uint8_t* data=&dummy[old_sz];
00727 
00728                     for (size_t j=0; j<len; j++)
00729                         dummy[j]=data[j];
00730 
00731                     sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
00732                     i=0;
00733                     old_sz=0;
00734                     sz+=len;
00735                 }
00736 
00737                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00738                 {
00739 
00740                     size_t len=i-old_sz;
00741                     uint8_t* data=&dummy[old_sz];
00742 
00743                     int32_t dims=0;
00744                     for (size_t j=0; j<len; j++)
00745                     {
00746                         if (data[j]==':')
00747                             dims++;
00748                     }
00749 
00750                     if (dims<=0)
00751                     {
00752                         SG_ERROR("Error in line %d - number of"
00753                                 " dimensions is %d line is %d characters"
00754                                 " long\n line_content:'%.*s'\n", lines,
00755                                 dims, len, len, (const char*) data);
00756                     }
00757 
00758                     SGSparseVectorEntry<ST>* feat=SG_MALLOC(SGSparseVectorEntry<ST>, dims);
00759                     size_t j=0;
00760                     for (; j<len; j++)
00761                     {
00762                         if (data[j]==' ')
00763                         {
00764                             data[j]='\0';
00765 
00766                             lab->set_label(lines, atof((const char*) data));
00767                             break;
00768                         }
00769                     }
00770 
00771                     int32_t d=0;
00772                     j++;
00773                     uint8_t* start=&data[j];
00774                     for (; j<len; j++)
00775                     {
00776                         if (data[j]==':')
00777                         {
00778                             data[j]='\0';
00779 
00780                             feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
00781                             num_features=CMath::max(num_features, feat[d].feat_index+1);
00782 
00783                             j++;
00784                             start=&data[j];
00785                             for (; j<len; j++)
00786                             {
00787                                 if (data[j]==' ' || data[j]=='\n')
00788                                 {
00789                                     data[j]='\0';
00790                                     feat[d].entry=(ST) atof((const char*) start);
00791                                     d++;
00792                                     break;
00793                                 }
00794                             }
00795 
00796                             if (j==len)
00797                             {
00798                                 data[j]='\0';
00799                                 feat[dims-1].entry=(ST) atof((const char*) start);
00800                             }
00801 
00802                             j++;
00803                             start=&data[j];
00804                         }
00805                     }
00806 
00807                     sparse_feature_matrix[lines].num_feat_entries=dims;
00808                     sparse_feature_matrix[lines].features=feat;
00809 
00810                     old_sz=i+1;
00811                     lines++;
00812                     SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00813                 }
00814             }
00815         }
00816         SG_INFO("file successfully read\n");
00817         fclose(f);
00818     }
00819 
00820     SG_FREE(dummy);
00821 
00822     if (do_sort_features)
00823         sort_features();
00824 
00825     return lab;
00826 }
00827 
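// --- Illustrative sketch, not part of the original SparseFeatures.cpp ---
// load_svmlight_file() above expects one example per line in SVMlight format:
// a numeric label followed by 1-based index:value pairs, e.g.
//
//   -1 3:0.25 7:1.5 12:2
//   +1 1:0.5 3:1
//
// A made-up loading call ("train.svml" is a hypothetical file name):
static CSparseFeatures<float64_t>* example_load_svmlight(CRegressionLabels*& labels)
{
    CSparseFeatures<float64_t>* feats=new CSparseFeatures<float64_t>(0);
    labels=feats->load_svmlight_file((char*) "train.svml", true);
    return feats;
}
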
00828 template<class ST> void CSparseFeatures<ST>::sort_features()
00829 {
00830     if (m_subset_stack->has_subsets())
00831         SG_ERROR("sort_features() not allowed with subset\n");
00832 
00833     ASSERT(get_num_preprocessors()==0);
00834 
00835     if (!sparse_feature_matrix)
00836         SG_ERROR("Requires sparse feature matrix to be available in-memory\n");
00837 
00838     for (int32_t i=0; i<num_vectors; i++)
00839     {
00840         int32_t len=sparse_feature_matrix[i].num_feat_entries;
00841 
00842         if (!len)
00843             continue;
00844 
00845         SGSparseVectorEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
00846         int32_t* feat_idx=SG_MALLOC(int32_t, len);
00847         int32_t* orig_idx=SG_MALLOC(int32_t, len);
00848 
00849         for (int j=0; j<len; j++)
00850         {
00851             feat_idx[j]=sf_orig[j].feat_index;
00852             orig_idx[j]=j;
00853         }
00854 
00855         CMath::qsort_index(feat_idx, orig_idx, len);
00856 
00857         SGSparseVectorEntry<ST>* sf_new= SG_MALLOC(SGSparseVectorEntry<ST>, len);
00858         for (int j=0; j<len; j++)
00859             sf_new[j]=sf_orig[orig_idx[j]];
00860 
00861         sparse_feature_matrix[i].features=sf_new;
00862 
00863         // sanity check
00864         for (int j=0; j<len-1; j++)
00865             ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
00866 
00867         SG_FREE(orig_idx);
00868         SG_FREE(feat_idx);
00869         SG_FREE(sf_orig);
00870     }
00871 }
00872 
00873 template<class ST> bool CSparseFeatures<ST>::write_svmlight_file(char* fname,
00874         CRegressionLabels* label)
00875 {
00876     if (m_subset_stack->has_subsets())
00877         SG_ERROR("write_svmlight_file() not allowed with subset\n");
00878 
00879     ASSERT(label);
00880     int32_t num=label->get_num_labels();
00881     ASSERT(num>0);
00882     ASSERT(num==num_vectors);
00883 
00884     FILE* f=fopen(fname, "wb");
00885 
00886     if (f)
00887     {
00888         for (int32_t i=0; i<num; i++)
00889         {
00890             fprintf(f, "%d ", (int32_t) label->get_int_label(i));
00891 
00892             SGSparseVectorEntry<ST>* vec = sparse_feature_matrix[i].features;
00893             int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
00894 
00895             for (int32_t j=0; j<num_feat; j++)
00896             {
00897                 if (j<num_feat-1)
00898                     fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00899                 else
00900                     fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00901             }
00902         }
00903 
00904         fclose(f);
00905         return true;
00906     }
00907     return false;
00908 }
00909 
00910 template<class ST> int32_t CSparseFeatures<ST>::get_dim_feature_space() const
00911 {
00912     return num_features;
00913 }
00914 
00915 template<class ST> float64_t CSparseFeatures<ST>::dot(int32_t vec_idx1,
00916         CDotFeatures* df, int32_t vec_idx2)
00917 {
00918     ASSERT(df);
00919     ASSERT(df->get_feature_type() == get_feature_type());
00920     ASSERT(df->get_feature_class() == get_feature_class());
00921     CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df;
00922 
00923     SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1);
00924     SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2);
00925 
00926     float64_t result = SGSparseVector<ST>::sparse_dot(avec, bvec);
00927     free_sparse_feature_vector(vec_idx1);
00928     sf->free_sparse_feature_vector(vec_idx2);
00929 
00930     return result;
00931 }
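
// --- Illustrative sketch, not part of the original SparseFeatures.cpp ---
// dot() above computes the sparse-sparse inner product between vector vec_idx1
// of this object and vector vec_idx2 of another CSparseFeatures of the same
// type, as used e.g. by linear kernels. Indices are assumed valid.
static float64_t example_pairwise_dot(CSparseFeatures<float64_t>* lhs,
        CSparseFeatures<float64_t>* rhs)
{
    return lhs->dot(0, rhs, 0);                       // <x_0 of lhs, x_0 of rhs>
}
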
00932 template<class ST> float64_t CSparseFeatures<ST>::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00933 {
00934     ASSERT(vec2);
00935     if (vec2_len!=num_features)
00936     {
00937         SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00938                 vec2_len, num_features);
00939     }
00940     float64_t result=0;
00941 
00942     SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1);
00943 
00944     if (sv.features)
00945     {
00946         for (int32_t i=0; i<sv.num_feat_entries; i++)
00947             result+=vec2[sv.features[i].feat_index]*sv.features[i].entry;
00948     }
00949 
00950     free_sparse_feature_vector(vec_idx1);
00951 
00952     return result;
00953 }
00954 
00955 template<class ST> void* CSparseFeatures<ST>::get_feature_iterator(int32_t vector_index)
00956 {
00957     if (vector_index>=get_num_vectors())
00958     {
00959         SG_ERROR("Index out of bounds (number of vectors %d, you "
00960                 "requested %d)\n", get_num_vectors(), vector_index);
00961     }
00962 
00963     if (!sparse_feature_matrix)
00964         SG_ERROR("Requires an in-memory feature matrix\n");
00965 
00966     sparse_feature_iterator* it=SG_MALLOC(sparse_feature_iterator, 1);
00967     it->sv=get_sparse_feature_vector(vector_index);
00968     it->index=0;
00969     it->vector_index=vector_index;
00970 
00971     return it;
00972 }
00973 
00974 template<class ST> bool CSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00975 {
00976     sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
00977     if (!it || it->index>=it->sv.num_feat_entries)
00978         return false;
00979 
00980     int32_t i=it->index++;
00981 
00982     index=it->sv.features[i].feat_index;
00983     value=(float64_t) it->sv.features[i].entry;
00984 
00985     return true;
00986 }
00987 
00988 template<class ST> void CSparseFeatures<ST>::free_feature_iterator(void* iterator)
00989 {
00990     if (!iterator)
00991         return;
00992 
00993     sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
00994     free_sparse_feature_vector(it->vector_index);
00995     SG_FREE(it);
00996 }
00997 
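// --- Illustrative sketch, not part of the original SparseFeatures.cpp ---
// Typical use of the iterator API above: walk the non-zero entries of one
// vector without densifying it. 'feats' is assumed to hold at least one vector.
static void example_iterate_nonzeros(CSparseFeatures<float64_t>* feats)
{
    void* it=feats->get_feature_iterator(0);
    int32_t index;
    float64_t value;
    while (feats->get_next_feature(index, value, it))
        SG_SPRINT("x[0][%d] = %f\n", index, value);
    feats->free_feature_iterator(it);
}
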
00998 template<class ST> CFeatures* CSparseFeatures<ST>::copy_subset(SGVector<index_t> indices)
00999 {
01000     SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(indices.vlen,
01001         get_dim_feature_space());
01002 
01003     for (index_t i=0; i<indices.vlen; ++i)
01004     {
01005         /* index to copy */
01006         index_t index=indices.vector[i];
01007         index_t real_index=m_subset_stack->subset_idx_conversion(index);
01008 
01009         /* copy sparse vector */
01010         SGSparseVector<ST> current=get_sparse_feature_vector(real_index);
01011         matrix_copy.sparse_matrix[i]=current;
01012 
01013         free_sparse_feature_vector(index);
01014     }
01015 
01016     CFeatures* result=new CSparseFeatures<ST>(matrix_copy);
01017     SG_REF(result);
01018     return result;
01019 }
01020 
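// --- Illustrative sketch, not part of the original SparseFeatures.cpp ---
// copy_subset() above materializes the selected vectors into a new
// CSparseFeatures object; here the first two vectors are copied (indices are
// made up and assumed valid). The returned object is already SG_REF'ed.
static CFeatures* example_copy_first_two(CSparseFeatures<float64_t>* feats)
{
    SGVector<index_t> idx(2);
    idx.vector[0]=0;
    idx.vector[1]=1;
    return feats->copy_subset(idx);
}
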
01021 template<class ST> SGSparseVectorEntry<ST>* CSparseFeatures<ST>::compute_sparse_feature_vector(int32_t num,
01022     int32_t& len, SGSparseVectorEntry<ST>* target)
01023 {
01024     SG_NOTIMPLEMENTED;
01025 
01026     len=0;
01027     return NULL;
01028 }
01029 
01030 template<class ST> void CSparseFeatures<ST>::init()
01031 {
01032     set_generic<ST>();
01033 
01034     m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
01035             "sparse_feature_matrix",
01036             "Array of sparse vectors.");
01037     m_parameters->add(&num_features, "num_features",
01038             "Total number of features.");
01039 }
01040 
01041 #define GET_FEATURE_TYPE(sg_type, f_type)                                   \
01042 template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() const  \
01043 {                                                                           \
01044     return f_type;                                                          \
01045 }
01046 GET_FEATURE_TYPE(bool, F_BOOL)
01047 GET_FEATURE_TYPE(char, F_CHAR)
01048 GET_FEATURE_TYPE(uint8_t, F_BYTE)
01049 GET_FEATURE_TYPE(int8_t, F_BYTE)
01050 GET_FEATURE_TYPE(int16_t, F_SHORT)
01051 GET_FEATURE_TYPE(uint16_t, F_WORD)
01052 GET_FEATURE_TYPE(int32_t, F_INT)
01053 GET_FEATURE_TYPE(uint32_t, F_UINT)
01054 GET_FEATURE_TYPE(int64_t, F_LONG)
01055 GET_FEATURE_TYPE(uint64_t, F_ULONG)
01056 GET_FEATURE_TYPE(float32_t, F_SHORTREAL)
01057 GET_FEATURE_TYPE(float64_t, F_DREAL)
01058 GET_FEATURE_TYPE(floatmax_t, F_LONGREAL)
01059 #undef GET_FEATURE_TYPE
01060 
01061 #define LOAD(fname, sg_type)                                            \
01062 template<> void CSparseFeatures<sg_type>::load(CFile* loader)   \
01063 {                                                                       \
01064     remove_all_subsets();                                                   \
01065     SG_SET_LOCALE_C;                                                    \
01066     ASSERT(loader);                                                     \
01067     SGSparseVector<sg_type>* matrix=NULL;                                       \
01068     int32_t num_feat=0;                                                 \
01069     int32_t num_vec=0;                                                  \
01070     loader->fname(matrix, num_feat, num_vec);                           \
01071     set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec));              \
01072     SG_RESET_LOCALE;                                                    \
01073 }
01074 LOAD(get_sparse_matrix, bool)
01075 LOAD(get_sparse_matrix, char)
01076 LOAD(get_sparse_matrix, uint8_t)
01077 LOAD(get_int8_sparsematrix, int8_t)
01078 LOAD(get_sparse_matrix, int16_t)
01079 LOAD(get_sparse_matrix, uint16_t)
01080 LOAD(get_sparse_matrix, int32_t)
01081 LOAD(get_uint_sparsematrix, uint32_t)
01082 LOAD(get_long_sparsematrix, int64_t)
01083 LOAD(get_ulong_sparsematrix, uint64_t)
01084 LOAD(get_sparse_matrix, float32_t)
01085 LOAD(get_sparse_matrix, float64_t)
01086 LOAD(get_longreal_sparsematrix, floatmax_t)
01087 #undef LOAD
01088 
01089 #define WRITE(fname, sg_type)                                           \
01090 template<> void CSparseFeatures<sg_type>::save(CFile* writer)   \
01091 {                                                                       \
01092     if (m_subset_stack->has_subsets())                                                      \
01093         SG_ERROR("save() not allowed with subset\n");                   \
01094     SG_SET_LOCALE_C;                                                    \
01095     ASSERT(writer);                                                     \
01096     writer->fname(sparse_feature_matrix, num_features, num_vectors);    \
01097     SG_RESET_LOCALE;                                                    \
01098 }
01099 WRITE(set_sparse_matrix, bool)
01100 WRITE(set_sparse_matrix, char)
01101 WRITE(set_sparse_matrix, uint8_t)
01102 WRITE(set_int8_sparsematrix, int8_t)
01103 WRITE(set_sparse_matrix, int16_t)
01104 WRITE(set_sparse_matrix, uint16_t)
01105 WRITE(set_sparse_matrix, int32_t)
01106 WRITE(set_uint_sparsematrix, uint32_t)
01107 WRITE(set_long_sparsematrix, int64_t)
01108 WRITE(set_ulong_sparsematrix, uint64_t)
01109 WRITE(set_sparse_matrix, float32_t)
01110 WRITE(set_sparse_matrix, float64_t)
01111 WRITE(set_longreal_sparsematrix, floatmax_t)
01112 #undef WRITE
01113 
01114 template class CSparseFeatures<bool>;
01115 template class CSparseFeatures<char>;
01116 template class CSparseFeatures<int8_t>;
01117 template class CSparseFeatures<uint8_t>;
01118 template class CSparseFeatures<int16_t>;
01119 template class CSparseFeatures<uint16_t>;
01120 template class CSparseFeatures<int32_t>;
01121 template class CSparseFeatures<uint32_t>;
01122 template class CSparseFeatures<int64_t>;
01123 template class CSparseFeatures<uint64_t>;
01124 template class CSparseFeatures<float32_t>;
01125 template class CSparseFeatures<float64_t>;
01126 template class CSparseFeatures<floatmax_t>;
01127 }