SHOGUN: StreamingSparseFeatures.h Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2011 Shashwat Lal Das
00008  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
00009  */
00010 #ifndef _STREAMING_SPARSEFEATURES__H__
00011 #define _STREAMING_SPARSEFEATURES__H__
00012 
00013 #include <shogun/lib/common.h>
00014 #include <shogun/mathematics/Math.h>
00015 #include <shogun/features/StreamingDotFeatures.h>
00016 #include <shogun/lib/DataType.h>
00017 #include <shogun/io/InputParser.h>
00018 
00019 namespace shogun
00020 {
00043 template <class T> class CStreamingSparseFeatures : public CStreamingDotFeatures
00044 {
00045 public:
00046 
00054     CStreamingSparseFeatures()
00055         : CStreamingDotFeatures()
00056     {
00057         set_read_functions();
00058         init();
00059     }
00060 
00069     CStreamingSparseFeatures(CStreamingFile* file,
00070                  bool is_labelled,
00071                  int32_t size)
00072         : CStreamingDotFeatures()
00073     {
00074         set_read_functions();
00075         init(file, is_labelled, size);
00076     }
00077 
00083     ~CStreamingSparseFeatures()
00084     {
00085         parser.end_parser();
00086     }
00087 
00097     virtual void set_vector_reader();
00098 
00108     virtual void set_vector_and_label_reader();
00109 
00115     virtual void start_parser();
00116 
00122     virtual void end_parser();
00123 
00132     virtual bool get_next_example();
00133 
00140     T get_feature(int32_t index)
00141     {
00142         ASSERT(index>=0 && index<current_num_features);
00143 
00144         T ret=0;
00145 
00146         if (current_vector)
00147         {
00148             for (int32_t i=0; i<current_length; i++)
00149                 if (current_vector[i].feat_index==index)
00150                     ret += current_vector[i].entry;
00151         }
00152 
00153         return ret;
00154     }
00155 
00161     SGSparseVector<T> get_vector();
00162 
00170     virtual float64_t get_label();
00171 
00178     virtual void release_example();
00179 
00184     virtual void reset_stream()
00185     {
00186         return;
00187     }
00188 
00200     inline int32_t set_num_features(int32_t num)
00201     {
00202         int32_t n=current_num_features;
00203         ASSERT(n<=num);
00204         current_num_features=num;
00205         return n;
00206     }
00207 
00215     virtual int32_t get_dim_feature_space() const;
00216 
00225     inline virtual void expand_if_required(float32_t*& vec, int32_t &len)
00226     {
00227         int32_t dim = get_dim_feature_space();
00228         if (dim > len)
00229         {
00230             vec = SG_REALLOC(float32_t, vec, dim);
00231             memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
00232             len = dim;
00233         }
00234     }
00235 
00244     inline virtual void expand_if_required(float64_t*& vec, int32_t &len)
00245     {
00246         int32_t dim = get_dim_feature_space();
00247         if (dim > len)
00248         {
00249             vec = SG_REALLOC(float64_t, vec, dim);
00250             memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
00251             len = dim;
00252         }
00253     }
00254 
00265     virtual float32_t dot(CStreamingDotFeatures *df);
00266 
00277     static T sparse_dot(T alpha, SGSparseVectorEntry<T>* avec, int32_t alen, SGSparseVectorEntry<T>* bvec, int32_t blen)
00278     {
00279         T result=0;
00280 
00281         //result remains zero when one of the vectors is non existent
00282         if (avec && bvec)
00283         {
00284             if (alen<=blen)
00285             {
00286                 int32_t j=0;
00287                 for (int32_t i=0; i<alen; i++)
00288                 {
00289                     int32_t a_feat_idx=avec[i].feat_index;
00290 
00291                     while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00292                         j++;
00293 
00294                     if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00295                     {
00296                         result+= avec[i].entry * bvec[j].entry;
00297                         j++;
00298                     }
00299                 }
00300             }
00301             else
00302             {
00303                 int32_t j=0;
00304                 for (int32_t i=0; i<blen; i++)
00305                 {
00306                     int32_t b_feat_idx=bvec[i].feat_index;
00307 
00308                     while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00309                         j++;
00310 
00311                     if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00312                     {
00313                         result+= bvec[i].entry * avec[j].entry;
00314                         j++;
00315                     }
00316                 }
00317             }
00318 
00319             result*=alpha;
00320         }
00321 
00322         return result;
00323     }
00324 
00334     T dense_dot(T alpha, T* vec, int32_t dim, T b)
00335     {
00336         ASSERT(vec);
00337         ASSERT(dim>=current_num_features);
00338         T result=b;
00339 
00340         int32_t num_feat=current_length;
00341         SGSparseVectorEntry<T>* sv=current_vector;
00342 
00343         if (sv)
00344         {
00345             for (int32_t i=0; i<num_feat; i++)
00346                 result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00347         }
00348 
00349         return result;
00350     }
00351 
00360     virtual float64_t dense_dot(const float64_t* vec2, int32_t vec2_len)
00361     {
00362         ASSERT(vec2);
00363         if (vec2_len < current_num_features)
00364         {
00365             SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00366                  vec2_len, current_num_features);
00367         }
00368 
00369         float64_t result=0;
00370         if (current_vector)
00371         {
00372             for (int32_t i=0; i<current_length; i++)
00373                 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00374         }
00375 
00376         return result;
00377     }
00378 
00387     virtual float32_t dense_dot(const float32_t* vec2, int32_t vec2_len)
00388     {
00389         ASSERT(vec2);
00390         if (vec2_len < current_num_features)
00391         {
00392             SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00393                  vec2_len, current_num_features);
00394         }
00395 
00396         float32_t result=0;
00397         if (current_vector)
00398         {
00399             for (int32_t i=0; i<current_length; i++)
00400                 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00401         }
00402 
00403         return result;
00404     }
00405 
00415     virtual void add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val=false)
00416     {
00417         ASSERT(vec2);
00418         if (vec2_len < current_num_features)
00419         {
00420             SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00421                  vec2_len, current_num_features);
00422         }
00423 
00424         SGSparseVectorEntry<T>* sv=current_vector;
00425         int32_t num_feat=current_length;
00426 
00427         if (sv)
00428         {
00429             if (abs_val)
00430             {
00431                 for (int32_t i=0; i<num_feat; i++)
00432                     vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00433             }
00434             else
00435             {
00436                 for (int32_t i=0; i<num_feat; i++)
00437                     vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00438             }
00439         }
00440     }
00441 
00451     virtual void add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val=false)
00452     {
00453         ASSERT(vec2);
00454         if (vec2_len < current_num_features)
00455         {
00456             SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00457                  vec2_len, current_num_features);
00458         }
00459 
00460         SGSparseVectorEntry<T>* sv=current_vector;
00461         int32_t num_feat=current_length;
00462 
00463         if (sv)
00464         {
00465             if (abs_val)
00466             {
00467                 for (int32_t i=0; i<num_feat; i++)
00468                     vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00469             }
00470             else
00471             {
00472                 for (int32_t i=0; i<num_feat; i++)
00473                     vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00474             }
00475         }
00476     }
00477 
00483     int64_t get_num_nonzero_entries()
00484     {
00485         return current_length;
00486     }
00487 
00493     float32_t compute_squared()
00494     {
00495         ASSERT(current_vector);
00496 
00497         float32_t sq=0;
00498 
00499         for (int32_t i=0; i<current_length; i++)
00500             sq += current_vector[i].entry * current_vector[i].entry;
00501 
00502         return sq;
00503     }
00504 
00510     void sort_features()
00511     {
00512         ASSERT(current_vector);
00513 
00514         SGSparseVectorEntry<T>* sf_orig=current_vector;
00515         int32_t len=current_length;
00516 
00517         int32_t* feat_idx=SG_MALLOC(int32_t, len);
00518         int32_t* orig_idx=SG_MALLOC(int32_t, len);
00519 
00520         for (int32_t i=0; i<len; i++)
00521         {
00522             feat_idx[i]=sf_orig[i].feat_index;
00523             orig_idx[i]=i;
00524         }
00525 
00526         CMath::qsort_index(feat_idx, orig_idx, len);
00527 
00528         SGSparseVectorEntry<T>* sf_new=SG_MALLOC(SGSparseVectorEntry<T>, len);
00529 
00530         for (int32_t i=0; i<len; i++)
00531             sf_new[i]=sf_orig[orig_idx[i]];
00532 
00533         // sanity check
00534         for (int32_t i=0; i<len-1; i++)
00535             ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index);
00536 
00537         // Copy new vector back to original
00538         for (int32_t i=0; i<len; i++)
00539             sf_orig[i]=sf_new[i];
00540 
00541         SG_FREE(orig_idx);
00542         SG_FREE(feat_idx);
00543         SG_FREE(sf_new);
00544     }
00545 
00551     virtual int32_t get_num_features();
00552 
00558     virtual int32_t get_nnz_features_for_vector();
00559 
00565     virtual inline EFeatureType get_feature_type();
00566 
00572     virtual EFeatureClass get_feature_class();
00573 
00579     virtual CFeatures* duplicate() const
00580     {
00581         return new CStreamingSparseFeatures<T>(*this);
00582     }
00583 
00589     inline virtual const char* get_name() const { return "StreamingSparseFeatures"; }
00590 
00596     inline virtual int32_t get_num_vectors() const
00597     {
00598         if (current_vector)
00599             return 1;
00600         return 0;
00601     }
00602 
00608     virtual int32_t get_size() { return sizeof(T); }
00609 
00610 private:
00615     virtual void init();
00616 
00624     virtual void init(CStreamingFile *file, bool is_labelled, int32_t size);
00625 
00626 protected:
00628     CInputParser< SGSparseVectorEntry<T> > parser;
00629 
00631     CStreamingFile* working_file;
00632 
00634     SGSparseVector<T> current_sgvector;
00635 
00637     SGSparseVectorEntry<T>* current_vector;
00638 
00640     index_t current_vec_index;
00641 
00643     float64_t current_label;
00644 
00646     int32_t current_length;
00647 
00649     int32_t current_num_features;
00650 };
00651 
00652 template <class T> void CStreamingSparseFeatures<T>::set_vector_reader()
00653 {
00654     parser.set_read_vector(&CStreamingFile::get_sparse_vector);
00655 }
00656 
00657 template <class T> void CStreamingSparseFeatures<T>::set_vector_and_label_reader()
00658 {
00659     parser.set_read_vector_and_label
00660         (&CStreamingFile::get_sparse_vector_and_label);
00661 }
00662 
00663 #define GET_FEATURE_TYPE(f_type, sg_type)               \
00664 template<> inline EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() \
00665 {                                   \
00666     return f_type;                          \
00667 }
00668 
00669 GET_FEATURE_TYPE(F_BOOL, bool)
00670 GET_FEATURE_TYPE(F_CHAR, char)
00671 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00672 GET_FEATURE_TYPE(F_BYTE, int8_t)
00673 GET_FEATURE_TYPE(F_SHORT, int16_t)
00674 GET_FEATURE_TYPE(F_WORD, uint16_t)
00675 GET_FEATURE_TYPE(F_INT, int32_t)
00676 GET_FEATURE_TYPE(F_UINT, uint32_t)
00677 GET_FEATURE_TYPE(F_LONG, int64_t)
00678 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00679 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00680 GET_FEATURE_TYPE(F_DREAL, float64_t)
00681 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00682 #undef GET_FEATURE_TYPE
00683 
00684 
00685 template <class T>
00686 void CStreamingSparseFeatures<T>::init()
00687 {
00688     working_file=NULL;
00689     current_vector=NULL;
00690     current_length=-1;
00691     current_vec_index=0;
00692     current_num_features=-1;
00693 }
00694 
00695 template <class T>
00696 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
00697                     bool is_labelled,
00698                     int32_t size)
00699 {
00700     init();
00701     has_labels = is_labelled;
00702     working_file = file;
00703     parser.init(file, is_labelled, size);
00704 }
00705 
00706 template <class T>
00707 void CStreamingSparseFeatures<T>::start_parser()
00708 {
00709     if (!parser.is_running())
00710         parser.start_parser();
00711 }
00712 
00713 template <class T>
00714 void CStreamingSparseFeatures<T>::end_parser()
00715 {
00716     parser.end_parser();
00717 }
00718 
00719 template <class T>
00720 bool CStreamingSparseFeatures<T>::get_next_example()
00721 {
00722     bool ret_value;
00723     ret_value = (bool) parser.get_next_example(current_vector,
00724                            current_length,
00725                            current_label);
00726 
00727     if (!ret_value)
00728         return false;
00729 
00730     // Update number of features based on highest index
00731     for (int32_t i=0; i<current_length; i++)
00732     {
00733         if (current_vector[i].feat_index > current_num_features)
00734             current_num_features = current_vector[i].feat_index+1;
00735     }
00736     current_vec_index++;
00737 
00738     return true;
00739 }
00740 
00741 template <class T>
00742 SGSparseVector<T> CStreamingSparseFeatures<T>::get_vector()
00743 {
00744     current_sgvector.features=current_vector;
00745     current_sgvector.num_feat_entries=current_length;
00746     current_sgvector.vec_index=current_vec_index;
00747 
00748     return current_sgvector;
00749 }
00750 
00751 template <class T>
00752 float64_t CStreamingSparseFeatures<T>::get_label()
00753 {
00754     ASSERT(has_labels);
00755 
00756     return current_label;
00757 }
00758 
00759 template <class T>
00760 void CStreamingSparseFeatures<T>::release_example()
00761 {
00762     parser.finalize_example();
00763 }
00764 
00765 template <class T>
00766 int32_t CStreamingSparseFeatures<T>::get_dim_feature_space() const
00767 {
00768     return current_num_features;
00769 }
00770 
00771 template <class T>
00772     float32_t CStreamingSparseFeatures<T>::dot(CStreamingDotFeatures* df)
00773 {
00774     SG_NOTIMPLEMENTED;
00775     return -1;
00776 }
00777 
00778 template <class T>
00779 int32_t CStreamingSparseFeatures<T>::get_num_features()
00780 {
00781     return current_num_features;
00782 }
00783 
00784 template <class T>
00785 int32_t CStreamingSparseFeatures<T>::get_nnz_features_for_vector()
00786 {
00787     return current_length;
00788 }
00789 
00790 template <class T>
00791 EFeatureClass CStreamingSparseFeatures<T>::get_feature_class()
00792 {
00793     return C_STREAMING_SPARSE;
00794 }
00795 
00796 }
00797 #endif // _STREAMING_SPARSEFEATURES__H__