SHOGUN: StreamingSparseFeatures.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2011 Shashwat Lal Das
00008  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
00009  */
00010 #include <shogun/features/streaming/StreamingSparseFeatures.h>
00011 namespace shogun
00012 {
00013 
00014 template <class T>
00015 CStreamingSparseFeatures<T>::CStreamingSparseFeatures() : CStreamingDotFeatures()
00016 {
00017     set_read_functions();
00018     init();
00019 }
00020 
00021 template <class T>
00022 CStreamingSparseFeatures<T>::CStreamingSparseFeatures(CStreamingFile* file,
00023              bool is_labelled,
00024              int32_t size)
00025     : CStreamingDotFeatures()
00026 {
00027     set_read_functions();
00028     init(file, is_labelled, size);
00029 }
00030 
00031 template <class T>
00032 CStreamingSparseFeatures<T>::~CStreamingSparseFeatures()
00033 {
00034     parser.end_parser();
00035 }
00036 
00037 template <class T>
00038 T CStreamingSparseFeatures<T>::get_feature(int32_t index)
00039 {
00040     ASSERT(index>=0 && index<current_num_features);
00041 
00042     T ret=0;
00043 
00044     if (current_vector)
00045     {
00046         for (int32_t i=0; i<current_length; i++)
00047             if (current_vector[i].feat_index==index)
00048                 ret += current_vector[i].entry;
00049     }
00050 
00051     return ret;
00052 }
00053 
00054 template <class T>
00055 void CStreamingSparseFeatures<T>::reset_stream()
00056 {
00057 }
00058 
00059 template <class T>
00060 int32_t CStreamingSparseFeatures<T>::set_num_features(int32_t num)
00061 {
00062     int32_t n=current_num_features;
00063     ASSERT(n<=num);
00064     current_num_features=num;
00065     return n;
00066 }
00067 
00068 template <class T>
00069 void CStreamingSparseFeatures<T>::expand_if_required(float32_t*& vec, int32_t &len)
00070 {
00071     int32_t dim = get_dim_feature_space();
00072     if (dim > len)
00073     {
00074         vec = SG_REALLOC(float32_t, vec, dim);
00075         memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
00076         len = dim;
00077     }
00078 }
00079 
00080 template <class T>
00081 void CStreamingSparseFeatures<T>::expand_if_required(float64_t*& vec, int32_t &len)
00082 {
00083     int32_t dim = get_dim_feature_space();
00084     if (dim > len)
00085     {
00086         vec = SG_REALLOC(float64_t, vec, dim);
00087         memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
00088         len = dim;
00089     }
00090 }
00091 
00092 template <class T>
00093 T CStreamingSparseFeatures<T>::sparse_dot(T alpha, SGSparseVectorEntry<T>* avec, int32_t alen, SGSparseVectorEntry<T>* bvec, int32_t blen)
00094 {
00095     T result=0;
00096 
00097     //result remains zero when one of the vectors is non existent
00098     if (avec && bvec)
00099     {
00100         if (alen<=blen)
00101         {
00102             int32_t j=0;
00103             for (int32_t i=0; i<alen; i++)
00104             {
00105                 int32_t a_feat_idx=avec[i].feat_index;
00106 
00107                 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00108                     j++;
00109 
00110                 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00111                 {
00112                     result+= avec[i].entry * bvec[j].entry;
00113                     j++;
00114                 }
00115             }
00116         }
00117         else
00118         {
00119             int32_t j=0;
00120             for (int32_t i=0; i<blen; i++)
00121             {
00122                 int32_t b_feat_idx=bvec[i].feat_index;
00123 
00124                 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00125                     j++;
00126 
00127                 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00128                 {
00129                     result+= bvec[i].entry * avec[j].entry;
00130                     j++;
00131                 }
00132             }
00133         }
00134 
00135         result*=alpha;
00136     }
00137 
00138     return result;
00139 }
00140 
00141 template <class T>
00142 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
00143 {
00144     ASSERT(vec);
00145     ASSERT(dim>=current_num_features);
00146     T result=b;
00147 
00148     int32_t num_feat=current_length;
00149     SGSparseVectorEntry<T>* sv=current_vector;
00150 
00151     if (sv)
00152     {
00153         for (int32_t i=0; i<num_feat; i++)
00154             result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00155     }
00156 
00157     return result;
00158 }
00159 
00160 template <class T>
00161 float64_t CStreamingSparseFeatures<T>::dense_dot(const float64_t* vec2, int32_t vec2_len)
00162 {
00163     ASSERT(vec2);
00164     if (vec2_len < current_num_features)
00165     {
00166         SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00167              vec2_len, current_num_features);
00168     }
00169 
00170     float64_t result=0;
00171     if (current_vector)
00172     {
00173         for (int32_t i=0; i<current_length; i++)
00174             result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00175     }
00176 
00177     return result;
00178 }
00179 
00180 template <class T>
00181 float32_t CStreamingSparseFeatures<T>::dense_dot(const float32_t* vec2, int32_t vec2_len)
00182 {
00183     ASSERT(vec2);
00184     if (vec2_len < current_num_features)
00185     {
00186         SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00187              vec2_len, current_num_features);
00188     }
00189 
00190     float32_t result=0;
00191     if (current_vector)
00192     {
00193         for (int32_t i=0; i<current_length; i++)
00194             result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00195     }
00196 
00197     return result;
00198 }
00199 
00200 template <class T>
00201 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val)
00202 {
00203     ASSERT(vec2);
00204     if (vec2_len < current_num_features)
00205     {
00206         SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00207              vec2_len, current_num_features);
00208     }
00209 
00210     SGSparseVectorEntry<T>* sv=current_vector;
00211     int32_t num_feat=current_length;
00212 
00213     if (sv)
00214     {
00215         if (abs_val)
00216         {
00217             for (int32_t i=0; i<num_feat; i++)
00218                 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00219         }
00220         else
00221         {
00222             for (int32_t i=0; i<num_feat; i++)
00223                 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00224         }
00225     }
00226 }
00227 
00228 template <class T>
00229 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
00230 {
00231     ASSERT(vec2);
00232     if (vec2_len < current_num_features)
00233     {
00234         SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00235              vec2_len, current_num_features);
00236     }
00237 
00238     SGSparseVectorEntry<T>* sv=current_vector;
00239     int32_t num_feat=current_length;
00240 
00241     if (sv)
00242     {
00243         if (abs_val)
00244         {
00245             for (int32_t i=0; i<num_feat; i++)
00246                 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00247         }
00248         else
00249         {
00250             for (int32_t i=0; i<num_feat; i++)
00251                 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00252         }
00253     }
00254 }
00255 
00256 template <class T>
00257 int64_t CStreamingSparseFeatures<T>::get_num_nonzero_entries()
00258 {
00259     return current_length;
00260 }
00261 
00262 template <class T>
00263 float32_t CStreamingSparseFeatures<T>::compute_squared()
00264 {
00265     ASSERT(current_vector);
00266 
00267     float32_t sq=0;
00268 
00269     for (int32_t i=0; i<current_length; i++)
00270         sq += current_vector[i].entry * current_vector[i].entry;
00271 
00272     return sq;
00273 }
00274 
00275 template <class T>
00276 void CStreamingSparseFeatures<T>::sort_features()
00277 {
00278     ASSERT(current_vector);
00279 
00280     SGSparseVectorEntry<T>* sf_orig=current_vector;
00281     int32_t len=current_length;
00282 
00283     int32_t* feat_idx=SG_MALLOC(int32_t, len);
00284     int32_t* orig_idx=SG_MALLOC(int32_t, len);
00285 
00286     for (int32_t i=0; i<len; i++)
00287     {
00288         feat_idx[i]=sf_orig[i].feat_index;
00289         orig_idx[i]=i;
00290     }
00291 
00292     CMath::qsort_index(feat_idx, orig_idx, len);
00293 
00294     SGSparseVectorEntry<T>* sf_new=SG_MALLOC(SGSparseVectorEntry<T>, len);
00295 
00296     for (int32_t i=0; i<len; i++)
00297         sf_new[i]=sf_orig[orig_idx[i]];
00298 
00299     // sanity check
00300     for (int32_t i=0; i<len-1; i++)
00301         ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index);
00302 
00303     // Copy new vector back to original
00304     for (int32_t i=0; i<len; i++)
00305         sf_orig[i]=sf_new[i];
00306 
00307     SG_FREE(orig_idx);
00308     SG_FREE(feat_idx);
00309     SG_FREE(sf_new);
00310 }
00311 
00312 template <class T>
00313 CFeatures* CStreamingSparseFeatures<T>::duplicate() const
00314 {
00315     return new CStreamingSparseFeatures<T>(*this);
00316 }
00317 
00318 template <class T>
00319 int32_t CStreamingSparseFeatures<T>::get_num_vectors() const
00320 {
00321     if (current_vector)
00322         return 1;
00323     return 0;
00324 }
00325 
00326 template <class T>
00327 int32_t CStreamingSparseFeatures<T>::get_size() const
00328 {
00329     return sizeof(T);
00330 }
00331 
00332 template <class T> void CStreamingSparseFeatures<T>::set_vector_reader()
00333 {
00334     parser.set_read_vector(&CStreamingFile::get_sparse_vector);
00335 }
00336 
00337 template <class T> void CStreamingSparseFeatures<T>::set_vector_and_label_reader()
00338 {
00339     parser.set_read_vector_and_label
00340         (&CStreamingFile::get_sparse_vector_and_label);
00341 }
00342 
00343 #define GET_FEATURE_TYPE(f_type, sg_type)               \
00344 template<> EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() const \
00345 {                                   \
00346     return f_type;                          \
00347 }
00348 
00349 GET_FEATURE_TYPE(F_BOOL, bool)
00350 GET_FEATURE_TYPE(F_CHAR, char)
00351 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00352 GET_FEATURE_TYPE(F_BYTE, int8_t)
00353 GET_FEATURE_TYPE(F_SHORT, int16_t)
00354 GET_FEATURE_TYPE(F_WORD, uint16_t)
00355 GET_FEATURE_TYPE(F_INT, int32_t)
00356 GET_FEATURE_TYPE(F_UINT, uint32_t)
00357 GET_FEATURE_TYPE(F_LONG, int64_t)
00358 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00359 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00360 GET_FEATURE_TYPE(F_DREAL, float64_t)
00361 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00362 #undef GET_FEATURE_TYPE
00363 
00364 
00365 template <class T>
00366 void CStreamingSparseFeatures<T>::init()
00367 {
00368     working_file=NULL;
00369     current_vector=NULL;
00370     current_length=-1;
00371     current_vec_index=0;
00372     current_num_features=-1;
00373 }
00374 
00375 template <class T>
00376 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
00377                     bool is_labelled,
00378                     int32_t size)
00379 {
00380     init();
00381     has_labels = is_labelled;
00382     working_file = file;
00383     SG_REF(working_file);
00384     parser.init(file, is_labelled, size);
00385 }
00386 
00387 template <class T>
00388 void CStreamingSparseFeatures<T>::start_parser()
00389 {
00390     if (!parser.is_running())
00391         parser.start_parser();
00392 }
00393 
00394 template <class T>
00395 void CStreamingSparseFeatures<T>::end_parser()
00396 {
00397     parser.end_parser();
00398 }
00399 
00400 template <class T>
00401 bool CStreamingSparseFeatures<T>::get_next_example()
00402 {
00403     bool ret_value;
00404     ret_value = (bool) parser.get_next_example(current_vector,
00405                            current_length,
00406                            current_label);
00407 
00408     if (!ret_value)
00409         return false;
00410 
00411     // Update number of features based on highest index
00412     for (int32_t i=0; i<current_length; i++)
00413     {
00414         if (current_vector[i].feat_index > current_num_features)
00415             current_num_features = current_vector[i].feat_index+1;
00416     }
00417     current_vec_index++;
00418 
00419     return true;
00420 }
00421 
00422 template <class T>
00423 SGSparseVector<T> CStreamingSparseFeatures<T>::get_vector()
00424 {
00425     current_sgvector.features=current_vector;
00426     current_sgvector.num_feat_entries=current_length;
00427 
00428     return current_sgvector;
00429 }
00430 
00431 template <class T>
00432 float64_t CStreamingSparseFeatures<T>::get_label()
00433 {
00434     ASSERT(has_labels);
00435 
00436     return current_label;
00437 }
00438 
00439 template <class T>
00440 void CStreamingSparseFeatures<T>::release_example()
00441 {
00442     parser.finalize_example();
00443 }
00444 
00445 template <class T>
00446 int32_t CStreamingSparseFeatures<T>::get_dim_feature_space() const
00447 {
00448     return current_num_features;
00449 }
00450 
00451 template <class T>
00452     float32_t CStreamingSparseFeatures<T>::dot(CStreamingDotFeatures* df)
00453 {
00454     SG_NOTIMPLEMENTED;
00455     return -1;
00456 }
00457 
00458 template <class T>
00459 int32_t CStreamingSparseFeatures<T>::get_num_features()
00460 {
00461     return current_num_features;
00462 }
00463 
00464 template <class T>
00465 int32_t CStreamingSparseFeatures<T>::get_nnz_features_for_vector()
00466 {
00467     return current_length;
00468 }
00469 
00470 template <class T>
00471 EFeatureClass CStreamingSparseFeatures<T>::get_feature_class() const
00472 {
00473     return C_STREAMING_SPARSE;
00474 }
00475 
00476 template class CStreamingSparseFeatures<bool>;
00477 template class CStreamingSparseFeatures<char>;
00478 template class CStreamingSparseFeatures<int8_t>;
00479 template class CStreamingSparseFeatures<uint8_t>;
00480 template class CStreamingSparseFeatures<int16_t>;
00481 template class CStreamingSparseFeatures<uint16_t>;
00482 template class CStreamingSparseFeatures<int32_t>;
00483 template class CStreamingSparseFeatures<uint32_t>;
00484 template class CStreamingSparseFeatures<int64_t>;
00485 template class CStreamingSparseFeatures<uint64_t>;
00486 template class CStreamingSparseFeatures<float32_t>;
00487 template class CStreamingSparseFeatures<float64_t>;
00488 template class CStreamingSparseFeatures<floatmax_t>;
00489 }