00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #ifndef _STREAMING_SPARSEFEATURES__H__
00011 #define _STREAMING_SPARSEFEATURES__H__
00012
00013 #include <shogun/lib/common.h>
00014 #include <shogun/mathematics/Math.h>
00015 #include <shogun/features/StreamingDotFeatures.h>
00016 #include <shogun/lib/DataType.h>
00017 #include <shogun/io/InputParser.h>
00018
00019 namespace shogun
00020 {
00043 template <class T> class CStreamingSparseFeatures : public CStreamingDotFeatures
00044 {
00045 public:
00046
00054 CStreamingSparseFeatures()
00055 : CStreamingDotFeatures()
00056 {
00057 set_read_functions();
00058 init();
00059 }
00060
00069 CStreamingSparseFeatures(CStreamingFile* file,
00070 bool is_labelled,
00071 int32_t size)
00072 : CStreamingDotFeatures()
00073 {
00074 set_read_functions();
00075 init(file, is_labelled, size);
00076 }
00077
00083 ~CStreamingSparseFeatures()
00084 {
00085 parser.end_parser();
00086 }
00087
00097 virtual void set_vector_reader();
00098
00108 virtual void set_vector_and_label_reader();
00109
00115 virtual void start_parser();
00116
00122 virtual void end_parser();
00123
00132 virtual bool get_next_example();
00133
00140 T get_feature(int32_t index)
00141 {
00142 ASSERT(index>=0 && index<current_num_features);
00143
00144 T ret=0;
00145
00146 if (current_vector)
00147 {
00148 for (int32_t i=0; i<current_length; i++)
00149 if (current_vector[i].feat_index==index)
00150 ret += current_vector[i].entry;
00151 }
00152
00153 return ret;
00154 }
00155
00161 SGSparseVector<T> get_vector();
00162
00170 virtual float64_t get_label();
00171
00178 virtual void release_example();
00179
00184 virtual void reset_stream()
00185 {
00186 return;
00187 }
00188
00200 inline int32_t set_num_features(int32_t num)
00201 {
00202 int32_t n=current_num_features;
00203 ASSERT(n<=num);
00204 current_num_features=num;
00205 return n;
00206 }
00207
00215 virtual int32_t get_dim_feature_space() const;
00216
00225 inline virtual void expand_if_required(float32_t*& vec, int32_t &len)
00226 {
00227 int32_t dim = get_dim_feature_space();
00228 if (dim > len)
00229 {
00230 vec = SG_REALLOC(float32_t, vec, dim);
00231 memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
00232 len = dim;
00233 }
00234 }
00235
00244 inline virtual void expand_if_required(float64_t*& vec, int32_t &len)
00245 {
00246 int32_t dim = get_dim_feature_space();
00247 if (dim > len)
00248 {
00249 vec = SG_REALLOC(float64_t, vec, dim);
00250 memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
00251 len = dim;
00252 }
00253 }
00254
00265 virtual float32_t dot(CStreamingDotFeatures *df);
00266
00277 static T sparse_dot(T alpha, SGSparseVectorEntry<T>* avec, int32_t alen, SGSparseVectorEntry<T>* bvec, int32_t blen)
00278 {
00279 T result=0;
00280
00281
00282 if (avec && bvec)
00283 {
00284 if (alen<=blen)
00285 {
00286 int32_t j=0;
00287 for (int32_t i=0; i<alen; i++)
00288 {
00289 int32_t a_feat_idx=avec[i].feat_index;
00290
00291 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00292 j++;
00293
00294 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00295 {
00296 result+= avec[i].entry * bvec[j].entry;
00297 j++;
00298 }
00299 }
00300 }
00301 else
00302 {
00303 int32_t j=0;
00304 for (int32_t i=0; i<blen; i++)
00305 {
00306 int32_t b_feat_idx=bvec[i].feat_index;
00307
00308 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00309 j++;
00310
00311 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00312 {
00313 result+= bvec[i].entry * avec[j].entry;
00314 j++;
00315 }
00316 }
00317 }
00318
00319 result*=alpha;
00320 }
00321
00322 return result;
00323 }
00324
00334 T dense_dot(T alpha, T* vec, int32_t dim, T b)
00335 {
00336 ASSERT(vec);
00337 ASSERT(dim>=current_num_features);
00338 T result=b;
00339
00340 int32_t num_feat=current_length;
00341 SGSparseVectorEntry<T>* sv=current_vector;
00342
00343 if (sv)
00344 {
00345 for (int32_t i=0; i<num_feat; i++)
00346 result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00347 }
00348
00349 return result;
00350 }
00351
00360 virtual float64_t dense_dot(const float64_t* vec2, int32_t vec2_len)
00361 {
00362 ASSERT(vec2);
00363 if (vec2_len < current_num_features)
00364 {
00365 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00366 vec2_len, current_num_features);
00367 }
00368
00369 float64_t result=0;
00370 if (current_vector)
00371 {
00372 for (int32_t i=0; i<current_length; i++)
00373 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00374 }
00375
00376 return result;
00377 }
00378
00387 virtual float32_t dense_dot(const float32_t* vec2, int32_t vec2_len)
00388 {
00389 ASSERT(vec2);
00390 if (vec2_len < current_num_features)
00391 {
00392 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00393 vec2_len, current_num_features);
00394 }
00395
00396 float32_t result=0;
00397 if (current_vector)
00398 {
00399 for (int32_t i=0; i<current_length; i++)
00400 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00401 }
00402
00403 return result;
00404 }
00405
00415 virtual void add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val=false)
00416 {
00417 ASSERT(vec2);
00418 if (vec2_len < current_num_features)
00419 {
00420 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00421 vec2_len, current_num_features);
00422 }
00423
00424 SGSparseVectorEntry<T>* sv=current_vector;
00425 int32_t num_feat=current_length;
00426
00427 if (sv)
00428 {
00429 if (abs_val)
00430 {
00431 for (int32_t i=0; i<num_feat; i++)
00432 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00433 }
00434 else
00435 {
00436 for (int32_t i=0; i<num_feat; i++)
00437 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00438 }
00439 }
00440 }
00441
00451 virtual void add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val=false)
00452 {
00453 ASSERT(vec2);
00454 if (vec2_len < current_num_features)
00455 {
00456 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00457 vec2_len, current_num_features);
00458 }
00459
00460 SGSparseVectorEntry<T>* sv=current_vector;
00461 int32_t num_feat=current_length;
00462
00463 if (sv)
00464 {
00465 if (abs_val)
00466 {
00467 for (int32_t i=0; i<num_feat; i++)
00468 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00469 }
00470 else
00471 {
00472 for (int32_t i=0; i<num_feat; i++)
00473 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00474 }
00475 }
00476 }
00477
00483 int64_t get_num_nonzero_entries()
00484 {
00485 return current_length;
00486 }
00487
00493 float32_t compute_squared()
00494 {
00495 ASSERT(current_vector);
00496
00497 float32_t sq=0;
00498
00499 for (int32_t i=0; i<current_length; i++)
00500 sq += current_vector[i].entry * current_vector[i].entry;
00501
00502 return sq;
00503 }
00504
00510 void sort_features()
00511 {
00512 ASSERT(current_vector);
00513
00514 SGSparseVectorEntry<T>* sf_orig=current_vector;
00515 int32_t len=current_length;
00516
00517 int32_t* feat_idx=SG_MALLOC(int32_t, len);
00518 int32_t* orig_idx=SG_MALLOC(int32_t, len);
00519
00520 for (int32_t i=0; i<len; i++)
00521 {
00522 feat_idx[i]=sf_orig[i].feat_index;
00523 orig_idx[i]=i;
00524 }
00525
00526 CMath::qsort_index(feat_idx, orig_idx, len);
00527
00528 SGSparseVectorEntry<T>* sf_new=SG_MALLOC(SGSparseVectorEntry<T>, len);
00529
00530 for (int32_t i=0; i<len; i++)
00531 sf_new[i]=sf_orig[orig_idx[i]];
00532
00533
00534 for (int32_t i=0; i<len-1; i++)
00535 ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index);
00536
00537
00538 for (int32_t i=0; i<len; i++)
00539 sf_orig[i]=sf_new[i];
00540
00541 SG_FREE(orig_idx);
00542 SG_FREE(feat_idx);
00543 SG_FREE(sf_new);
00544 }
00545
00551 virtual int32_t get_num_features();
00552
00558 virtual int32_t get_nnz_features_for_vector();
00559
00565 virtual inline EFeatureType get_feature_type();
00566
00572 virtual EFeatureClass get_feature_class();
00573
00579 virtual CFeatures* duplicate() const
00580 {
00581 return new CStreamingSparseFeatures<T>(*this);
00582 }
00583
00589 inline virtual const char* get_name() const { return "StreamingSparseFeatures"; }
00590
00596 inline virtual int32_t get_num_vectors() const
00597 {
00598 if (current_vector)
00599 return 1;
00600 return 0;
00601 }
00602
00608 virtual int32_t get_size() { return sizeof(T); }
00609
00610 private:
00615 virtual void init();
00616
00624 virtual void init(CStreamingFile *file, bool is_labelled, int32_t size);
00625
00626 protected:
00628 CInputParser< SGSparseVectorEntry<T> > parser;
00629
00631 CStreamingFile* working_file;
00632
00634 SGSparseVector<T> current_sgvector;
00635
00637 SGSparseVectorEntry<T>* current_vector;
00638
00640 index_t current_vec_index;
00641
00643 float64_t current_label;
00644
00646 int32_t current_length;
00647
00649 int32_t current_num_features;
00650 };
00651
00652 template <class T> void CStreamingSparseFeatures<T>::set_vector_reader()
00653 {
00654 parser.set_read_vector(&CStreamingFile::get_sparse_vector);
00655 }
00656
00657 template <class T> void CStreamingSparseFeatures<T>::set_vector_and_label_reader()
00658 {
00659 parser.set_read_vector_and_label
00660 (&CStreamingFile::get_sparse_vector_and_label);
00661 }
00662
00663 #define GET_FEATURE_TYPE(f_type, sg_type) \
00664 template<> inline EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() \
00665 { \
00666 return f_type; \
00667 }
00668
00669 GET_FEATURE_TYPE(F_BOOL, bool)
00670 GET_FEATURE_TYPE(F_CHAR, char)
00671 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00672 GET_FEATURE_TYPE(F_BYTE, int8_t)
00673 GET_FEATURE_TYPE(F_SHORT, int16_t)
00674 GET_FEATURE_TYPE(F_WORD, uint16_t)
00675 GET_FEATURE_TYPE(F_INT, int32_t)
00676 GET_FEATURE_TYPE(F_UINT, uint32_t)
00677 GET_FEATURE_TYPE(F_LONG, int64_t)
00678 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00679 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00680 GET_FEATURE_TYPE(F_DREAL, float64_t)
00681 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00682 #undef GET_FEATURE_TYPE
00683
00684
00685 template <class T>
00686 void CStreamingSparseFeatures<T>::init()
00687 {
00688 working_file=NULL;
00689 current_vector=NULL;
00690 current_length=-1;
00691 current_vec_index=0;
00692 current_num_features=-1;
00693 }
00694
00695 template <class T>
00696 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
00697 bool is_labelled,
00698 int32_t size)
00699 {
00700 init();
00701 has_labels = is_labelled;
00702 working_file = file;
00703 parser.init(file, is_labelled, size);
00704 }
00705
00706 template <class T>
00707 void CStreamingSparseFeatures<T>::start_parser()
00708 {
00709 if (!parser.is_running())
00710 parser.start_parser();
00711 }
00712
00713 template <class T>
00714 void CStreamingSparseFeatures<T>::end_parser()
00715 {
00716 parser.end_parser();
00717 }
00718
00719 template <class T>
00720 bool CStreamingSparseFeatures<T>::get_next_example()
00721 {
00722 bool ret_value;
00723 ret_value = (bool) parser.get_next_example(current_vector,
00724 current_length,
00725 current_label);
00726
00727 if (!ret_value)
00728 return false;
00729
00730
00731 for (int32_t i=0; i<current_length; i++)
00732 {
00733 if (current_vector[i].feat_index > current_num_features)
00734 current_num_features = current_vector[i].feat_index+1;
00735 }
00736 current_vec_index++;
00737
00738 return true;
00739 }
00740
00741 template <class T>
00742 SGSparseVector<T> CStreamingSparseFeatures<T>::get_vector()
00743 {
00744 current_sgvector.features=current_vector;
00745 current_sgvector.num_feat_entries=current_length;
00746 current_sgvector.vec_index=current_vec_index;
00747
00748 return current_sgvector;
00749 }
00750
00751 template <class T>
00752 float64_t CStreamingSparseFeatures<T>::get_label()
00753 {
00754 ASSERT(has_labels);
00755
00756 return current_label;
00757 }
00758
00759 template <class T>
00760 void CStreamingSparseFeatures<T>::release_example()
00761 {
00762 parser.finalize_example();
00763 }
00764
00765 template <class T>
00766 int32_t CStreamingSparseFeatures<T>::get_dim_feature_space() const
00767 {
00768 return current_num_features;
00769 }
00770
00771 template <class T>
00772 float32_t CStreamingSparseFeatures<T>::dot(CStreamingDotFeatures* df)
00773 {
00774 SG_NOTIMPLEMENTED;
00775 return -1;
00776 }
00777
00778 template <class T>
00779 int32_t CStreamingSparseFeatures<T>::get_num_features()
00780 {
00781 return current_num_features;
00782 }
00783
00784 template <class T>
00785 int32_t CStreamingSparseFeatures<T>::get_nnz_features_for_vector()
00786 {
00787 return current_length;
00788 }
00789
00790 template <class T>
00791 EFeatureClass CStreamingSparseFeatures<T>::get_feature_class()
00792 {
00793 return C_STREAMING_SPARSE;
00794 }
00795
00796 }
00797 #endif // _STREAMING_SPARSEFEATURES__H__