SHOGUN: StreamingSparseFeatures.cpp Source File

Go to the documentation of this file.
00001 #include <shogun/features/StreamingSparseFeatures.h>
00002 namespace shogun
00003 {
00004 
00005 template <class T>
00006 CStreamingSparseFeatures<T>::CStreamingSparseFeatures() : CStreamingDotFeatures()
00007 {
00008     set_read_functions();
00009     init();
00010 }
00011 
00012 template <class T>
00013 CStreamingSparseFeatures<T>::CStreamingSparseFeatures(CStreamingFile* file,
00014              bool is_labelled,
00015              int32_t size)
00016     : CStreamingDotFeatures()
00017 {
00018     set_read_functions();
00019     init(file, is_labelled, size);
00020 }
00021 
00022 template <class T>
00023 CStreamingSparseFeatures<T>::~CStreamingSparseFeatures()
00024 {
00025     parser.end_parser();
00026 }
00027 
00028 template <class T>
00029 T CStreamingSparseFeatures<T>::get_feature(int32_t index)
00030 {
00031     ASSERT(index>=0 && index<current_num_features);
00032 
00033     T ret=0;
00034 
00035     if (current_vector)
00036     {
00037         for (int32_t i=0; i<current_length; i++)
00038             if (current_vector[i].feat_index==index)
00039                 ret += current_vector[i].entry;
00040     }
00041 
00042     return ret;
00043 }
00044 
00045 template <class T>
00046 void CStreamingSparseFeatures<T>::reset_stream()
00047 {
00048 }
00049 
00050 template <class T>
00051 int32_t CStreamingSparseFeatures<T>::set_num_features(int32_t num)
00052 {
00053     int32_t n=current_num_features;
00054     ASSERT(n<=num);
00055     current_num_features=num;
00056     return n;
00057 }
00058 
00059 template <class T>
00060 void CStreamingSparseFeatures<T>::expand_if_required(float32_t*& vec, int32_t &len)
00061 {
00062     int32_t dim = get_dim_feature_space();
00063     if (dim > len)
00064     {
00065         vec = SG_REALLOC(float32_t, vec, dim);
00066         memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
00067         len = dim;
00068     }
00069 }
00070 
00071 template <class T>
00072 void CStreamingSparseFeatures<T>::expand_if_required(float64_t*& vec, int32_t &len)
00073 {
00074     int32_t dim = get_dim_feature_space();
00075     if (dim > len)
00076     {
00077         vec = SG_REALLOC(float64_t, vec, dim);
00078         memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
00079         len = dim;
00080     }
00081 }
00082 
00083 template <class T>
00084 T CStreamingSparseFeatures<T>::sparse_dot(T alpha, SGSparseVectorEntry<T>* avec, int32_t alen, SGSparseVectorEntry<T>* bvec, int32_t blen)
00085 {
00086     T result=0;
00087 
00088     //result remains zero when one of the vectors is non existent
00089     if (avec && bvec)
00090     {
00091         if (alen<=blen)
00092         {
00093             int32_t j=0;
00094             for (int32_t i=0; i<alen; i++)
00095             {
00096                 int32_t a_feat_idx=avec[i].feat_index;
00097 
00098                 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00099                     j++;
00100 
00101                 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00102                 {
00103                     result+= avec[i].entry * bvec[j].entry;
00104                     j++;
00105                 }
00106             }
00107         }
00108         else
00109         {
00110             int32_t j=0;
00111             for (int32_t i=0; i<blen; i++)
00112             {
00113                 int32_t b_feat_idx=bvec[i].feat_index;
00114 
00115                 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00116                     j++;
00117 
00118                 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00119                 {
00120                     result+= bvec[i].entry * avec[j].entry;
00121                     j++;
00122                 }
00123             }
00124         }
00125 
00126         result*=alpha;
00127     }
00128 
00129     return result;
00130 }
00131 
00132 template <class T>
00133 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
00134 {
00135     ASSERT(vec);
00136     ASSERT(dim>=current_num_features);
00137     T result=b;
00138 
00139     int32_t num_feat=current_length;
00140     SGSparseVectorEntry<T>* sv=current_vector;
00141 
00142     if (sv)
00143     {
00144         for (int32_t i=0; i<num_feat; i++)
00145             result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00146     }
00147 
00148     return result;
00149 }
00150 
00151 template <class T>
00152 float64_t CStreamingSparseFeatures<T>::dense_dot(const float64_t* vec2, int32_t vec2_len)
00153 {
00154     ASSERT(vec2);
00155     if (vec2_len < current_num_features)
00156     {
00157         SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00158              vec2_len, current_num_features);
00159     }
00160 
00161     float64_t result=0;
00162     if (current_vector)
00163     {
00164         for (int32_t i=0; i<current_length; i++)
00165             result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00166     }
00167 
00168     return result;
00169 }
00170 
00171 template <class T>
00172 float32_t CStreamingSparseFeatures<T>::dense_dot(const float32_t* vec2, int32_t vec2_len)
00173 {
00174     ASSERT(vec2);
00175     if (vec2_len < current_num_features)
00176     {
00177         SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00178              vec2_len, current_num_features);
00179     }
00180 
00181     float32_t result=0;
00182     if (current_vector)
00183     {
00184         for (int32_t i=0; i<current_length; i++)
00185             result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00186     }
00187 
00188     return result;
00189 }
00190 
00191 template <class T>
00192 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val)
00193 {
00194     ASSERT(vec2);
00195     if (vec2_len < current_num_features)
00196     {
00197         SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00198              vec2_len, current_num_features);
00199     }
00200 
00201     SGSparseVectorEntry<T>* sv=current_vector;
00202     int32_t num_feat=current_length;
00203 
00204     if (sv)
00205     {
00206         if (abs_val)
00207         {
00208             for (int32_t i=0; i<num_feat; i++)
00209                 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00210         }
00211         else
00212         {
00213             for (int32_t i=0; i<num_feat; i++)
00214                 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00215         }
00216     }
00217 }
00218 
00219 template <class T>
00220 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
00221 {
00222     ASSERT(vec2);
00223     if (vec2_len < current_num_features)
00224     {
00225         SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00226              vec2_len, current_num_features);
00227     }
00228 
00229     SGSparseVectorEntry<T>* sv=current_vector;
00230     int32_t num_feat=current_length;
00231 
00232     if (sv)
00233     {
00234         if (abs_val)
00235         {
00236             for (int32_t i=0; i<num_feat; i++)
00237                 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00238         }
00239         else
00240         {
00241             for (int32_t i=0; i<num_feat; i++)
00242                 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00243         }
00244     }
00245 }
00246 
00247 template <class T>
00248 int64_t CStreamingSparseFeatures<T>::get_num_nonzero_entries()
00249 {
00250     return current_length;
00251 }
00252 
00253 template <class T>
00254 float32_t CStreamingSparseFeatures<T>::compute_squared()
00255 {
00256     ASSERT(current_vector);
00257 
00258     float32_t sq=0;
00259 
00260     for (int32_t i=0; i<current_length; i++)
00261         sq += current_vector[i].entry * current_vector[i].entry;
00262 
00263     return sq;
00264 }
00265 
00266 template <class T>
00267 void CStreamingSparseFeatures<T>::sort_features()
00268 {
00269     ASSERT(current_vector);
00270 
00271     SGSparseVectorEntry<T>* sf_orig=current_vector;
00272     int32_t len=current_length;
00273 
00274     int32_t* feat_idx=SG_MALLOC(int32_t, len);
00275     int32_t* orig_idx=SG_MALLOC(int32_t, len);
00276 
00277     for (int32_t i=0; i<len; i++)
00278     {
00279         feat_idx[i]=sf_orig[i].feat_index;
00280         orig_idx[i]=i;
00281     }
00282 
00283     CMath::qsort_index(feat_idx, orig_idx, len);
00284 
00285     SGSparseVectorEntry<T>* sf_new=SG_MALLOC(SGSparseVectorEntry<T>, len);
00286 
00287     for (int32_t i=0; i<len; i++)
00288         sf_new[i]=sf_orig[orig_idx[i]];
00289 
00290     // sanity check
00291     for (int32_t i=0; i<len-1; i++)
00292         ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index);
00293 
00294     // Copy new vector back to original
00295     for (int32_t i=0; i<len; i++)
00296         sf_orig[i]=sf_new[i];
00297 
00298     SG_FREE(orig_idx);
00299     SG_FREE(feat_idx);
00300     SG_FREE(sf_new);
00301 }
00302 
00303 template <class T>
00304 CFeatures* CStreamingSparseFeatures<T>::duplicate() const
00305 {
00306     return new CStreamingSparseFeatures<T>(*this);
00307 }
00308 
00309 template <class T>
00310 int32_t CStreamingSparseFeatures<T>::get_num_vectors() const
00311 {
00312     if (current_vector)
00313         return 1;
00314     return 0;
00315 }
00316 
00317 template <class T>
00318 int32_t CStreamingSparseFeatures<T>::get_size()
00319 {
00320     return sizeof(T);
00321 }
00322 
00323 template <class T> void CStreamingSparseFeatures<T>::set_vector_reader()
00324 {
00325     parser.set_read_vector(&CStreamingFile::get_sparse_vector);
00326 }
00327 
00328 template <class T> void CStreamingSparseFeatures<T>::set_vector_and_label_reader()
00329 {
00330     parser.set_read_vector_and_label
00331         (&CStreamingFile::get_sparse_vector_and_label);
00332 }
00333 
00334 #define GET_FEATURE_TYPE(f_type, sg_type)               \
00335 template<> EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() \
00336 {                                   \
00337     return f_type;                          \
00338 }
00339 
00340 GET_FEATURE_TYPE(F_BOOL, bool)
00341 GET_FEATURE_TYPE(F_CHAR, char)
00342 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00343 GET_FEATURE_TYPE(F_BYTE, int8_t)
00344 GET_FEATURE_TYPE(F_SHORT, int16_t)
00345 GET_FEATURE_TYPE(F_WORD, uint16_t)
00346 GET_FEATURE_TYPE(F_INT, int32_t)
00347 GET_FEATURE_TYPE(F_UINT, uint32_t)
00348 GET_FEATURE_TYPE(F_LONG, int64_t)
00349 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00350 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00351 GET_FEATURE_TYPE(F_DREAL, float64_t)
00352 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00353 #undef GET_FEATURE_TYPE
00354 
00355 
00356 template <class T>
00357 void CStreamingSparseFeatures<T>::init()
00358 {
00359     working_file=NULL;
00360     current_vector=NULL;
00361     current_length=-1;
00362     current_vec_index=0;
00363     current_num_features=-1;
00364 }
00365 
00366 template <class T>
00367 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
00368                     bool is_labelled,
00369                     int32_t size)
00370 {
00371     init();
00372     has_labels = is_labelled;
00373     working_file = file;
00374     parser.init(file, is_labelled, size);
00375 }
00376 
00377 template <class T>
00378 void CStreamingSparseFeatures<T>::start_parser()
00379 {
00380     if (!parser.is_running())
00381         parser.start_parser();
00382 }
00383 
00384 template <class T>
00385 void CStreamingSparseFeatures<T>::end_parser()
00386 {
00387     parser.end_parser();
00388 }
00389 
00390 template <class T>
00391 bool CStreamingSparseFeatures<T>::get_next_example()
00392 {
00393     bool ret_value;
00394     ret_value = (bool) parser.get_next_example(current_vector,
00395                            current_length,
00396                            current_label);
00397 
00398     if (!ret_value)
00399         return false;
00400 
00401     // Update number of features based on highest index
00402     for (int32_t i=0; i<current_length; i++)
00403     {
00404         if (current_vector[i].feat_index > current_num_features)
00405             current_num_features = current_vector[i].feat_index+1;
00406     }
00407     current_vec_index++;
00408 
00409     return true;
00410 }
00411 
00412 template <class T>
00413 SGSparseVector<T> CStreamingSparseFeatures<T>::get_vector()
00414 {
00415     current_sgvector.features=current_vector;
00416     current_sgvector.num_feat_entries=current_length;
00417     current_sgvector.vec_index=current_vec_index;
00418 
00419     return current_sgvector;
00420 }
00421 
00422 template <class T>
00423 float64_t CStreamingSparseFeatures<T>::get_label()
00424 {
00425     ASSERT(has_labels);
00426 
00427     return current_label;
00428 }
00429 
00430 template <class T>
00431 void CStreamingSparseFeatures<T>::release_example()
00432 {
00433     parser.finalize_example();
00434 }
00435 
00436 template <class T>
00437 int32_t CStreamingSparseFeatures<T>::get_dim_feature_space() const
00438 {
00439     return current_num_features;
00440 }
00441 
00442 template <class T>
00443     float32_t CStreamingSparseFeatures<T>::dot(CStreamingDotFeatures* df)
00444 {
00445     SG_NOTIMPLEMENTED;
00446     return -1;
00447 }
00448 
00449 template <class T>
00450 int32_t CStreamingSparseFeatures<T>::get_num_features()
00451 {
00452     return current_num_features;
00453 }
00454 
00455 template <class T>
00456 int32_t CStreamingSparseFeatures<T>::get_nnz_features_for_vector()
00457 {
00458     return current_length;
00459 }
00460 
00461 template <class T>
00462 EFeatureClass CStreamingSparseFeatures<T>::get_feature_class()
00463 {
00464     return C_STREAMING_SPARSE;
00465 }
00466 
00467 template class CStreamingSparseFeatures<bool>;
00468 template class CStreamingSparseFeatures<char>;
00469 template class CStreamingSparseFeatures<int8_t>;
00470 template class CStreamingSparseFeatures<uint8_t>;
00471 template class CStreamingSparseFeatures<int16_t>;
00472 template class CStreamingSparseFeatures<uint16_t>;
00473 template class CStreamingSparseFeatures<int32_t>;
00474 template class CStreamingSparseFeatures<uint32_t>;
00475 template class CStreamingSparseFeatures<int64_t>;
00476 template class CStreamingSparseFeatures<uint64_t>;
00477 template class CStreamingSparseFeatures<float32_t>;
00478 template class CStreamingSparseFeatures<float64_t>;
00479 template class CStreamingSparseFeatures<floatmax_t>;
00480 }