SHOGUN: StreamingSimpleFeatures.h Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2011 Shashwat Lal Das
00008  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
00009  */
00010 #ifndef _STREAMING_SIMPLEFEATURES__H__
00011 #define _STREAMING_SIMPLEFEATURES__H__
00012 
00013 #include <shogun/lib/common.h>
00014 #include <shogun/mathematics/Math.h>
00015 #include <shogun/features/StreamingDotFeatures.h>
00016 #include <shogun/lib/DataType.h>
00017 #include <shogun/io/StreamingFileFromSimpleFeatures.h>
00018 #include <shogun/io/InputParser.h>
00019 
00020 namespace shogun
00021 {
00027 template <class T> class CStreamingSimpleFeatures : public CStreamingDotFeatures
00028 {
00029 public:
00030 
00038     CStreamingSimpleFeatures()
00039         : CStreamingDotFeatures()
00040     {
00041         set_read_functions();
00042         init();
00043         parser.set_free_vector_after_release(false);
00044     }
00045 
00054     CStreamingSimpleFeatures(CStreamingFile* file,
00055                  bool is_labelled,
00056                  int32_t size)
00057         : CStreamingDotFeatures()
00058     {
00059         init(file, is_labelled, size);
00060         set_read_functions();
00061         parser.set_free_vector_after_release(false);
00062     }
00063 
00071     CStreamingSimpleFeatures(CSimpleFeatures<T>* simple_features,
00072                  float64_t* lab=NULL)
00073         : CStreamingDotFeatures()
00074     {
00075         CStreamingFileFromSimpleFeatures<T>* file;
00076         bool is_labelled;
00077         int32_t size = 1024;
00078 
00079         if (lab)
00080         {
00081             is_labelled = true;
00082             file = new CStreamingFileFromSimpleFeatures<T>(simple_features, lab);
00083         }
00084         else
00085         {
00086             is_labelled = false;
00087             file = new CStreamingFileFromSimpleFeatures<T>(simple_features);
00088         }
00089 
00090         SG_REF(file);
00091 
00092         init(file, is_labelled, size);
00093         set_read_functions();
00094         parser.set_free_vector_after_release(false);
00095         parser.set_free_vectors_on_destruct(false);
00096         seekable=true;
00097     }
00098 
00104     ~CStreamingSimpleFeatures()
00105     {
00106         parser.end_parser();
00107     }
00108 
00118     virtual void set_vector_reader();
00119 
00129     virtual void set_vector_and_label_reader();
00130 
00136     virtual void start_parser();
00137 
00143     virtual void end_parser();
00144 
00149     virtual void reset_stream()
00150     {
00151         if (seekable)
00152         {
00153             ((CStreamingFileFromSimpleFeatures<T>*) working_file)->reset_stream();
00154             parser.exit_parser();
00155             parser.init(working_file, has_labels, 1);
00156             parser.set_free_vector_after_release(false);
00157             parser.start_parser();
00158         }
00159     }
00160 
00169     virtual bool get_next_example();
00170 
00176     SGVector<T> get_vector();
00177 
00185     virtual float64_t get_label();
00186 
00193     virtual void release_example();
00194 
00202     virtual int32_t get_dim_feature_space() const;
00203 
00211     virtual float32_t dot(SGVector<T> vec);
00212 
00223     virtual float32_t dot(CStreamingDotFeatures *df);
00224 
00232     virtual float32_t dense_dot(const float32_t* vec2, int32_t vec2_len)
00233     {
00234         ASSERT(vec2_len==current_length);
00235         float32_t result=0;
00236 
00237         for (int32_t i=0; i<current_length; i++)
00238             result+=current_vector[i]*vec2[i];
00239 
00240         return result;
00241     }
00242 
00250     virtual float64_t dense_dot(const float64_t* vec2, int32_t vec2_len)
00251     {
00252         ASSERT(vec2_len==current_length);
00253         float64_t result=0;
00254 
00255         for (int32_t i=0; i<current_length; i++)
00256             result+=current_vector[i]*vec2[i];
00257 
00258         return result;
00259     }
00260 
00270     virtual void add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len , bool abs_val=false)
00271     {
00272         ASSERT(vec2_len==current_length);
00273 
00274         if (abs_val)
00275         {
00276             for (int32_t i=0; i<current_length; i++)
00277                 vec2[i]+=alpha*CMath::abs(current_vector[i]);
00278         }
00279         else
00280         {
00281             for (int32_t i=0; i<current_length; i++)
00282                 vec2[i]+=alpha*current_vector[i];
00283         }
00284     }
00285 
00295     virtual void add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len , bool abs_val=false)
00296     {
00297         ASSERT(vec2_len==current_length);
00298 
00299         if (abs_val)
00300         {
00301             for (int32_t i=0; i<current_length; i++)
00302                 vec2[i]+=alpha*CMath::abs(current_vector[i]);
00303         }
00304         else
00305         {
00306             for (int32_t i=0; i<current_length; i++)
00307                 vec2[i]+=alpha*current_vector[i];
00308         }
00309     }
00310 
00315     virtual inline int32_t get_nnz_features_for_vector()
00316     {
00317         return current_length;
00318     }
00319 
00325     int32_t get_num_features();
00326 
00332     virtual inline EFeatureType get_feature_type();
00333 
00339     virtual EFeatureClass get_feature_class();
00340 
00346     virtual CFeatures* duplicate() const
00347     {
00348         return new CStreamingSimpleFeatures<T>(*this);
00349     }
00350 
00356     inline virtual const char* get_name() const { return "StreamingSimpleFeatures"; }
00357 
00363     inline virtual int32_t get_num_vectors() const
00364     {
00365         if (current_vector)
00366             return 1;
00367         return 0;
00368     }
00369 
00375     virtual int32_t get_size() { return sizeof(T); }
00376 
00377 private:
00382     void init();
00383 
00391     void init(CStreamingFile *file, bool is_labelled, int32_t size);
00392 
00393 protected:
00394 
00396     float32_t combined_weight;
00397 
00399     CInputParser<T> parser;
00400 
00402     SGVector<T> current_sgvector;
00403 
00405     T* current_vector;
00406 
00408     float64_t current_label;
00409 
00411     int32_t current_length;
00412 };
00413 
00414 template <class T> void CStreamingSimpleFeatures<T>::set_vector_reader()
00415 {
00416     parser.set_read_vector(&CStreamingFile::get_vector);
00417 }
00418 
00419 template <class T> void CStreamingSimpleFeatures<T>::set_vector_and_label_reader()
00420 {
00421     parser.set_read_vector_and_label(&CStreamingFile::get_vector_and_label);
00422 }
00423 
00424 #define GET_FEATURE_TYPE(f_type, sg_type)               \
00425 template<> inline EFeatureType CStreamingSimpleFeatures<sg_type>::get_feature_type() \
00426 {                                   \
00427     return f_type;                          \
00428 }
00429 
00430 GET_FEATURE_TYPE(F_BOOL, bool)
00431 GET_FEATURE_TYPE(F_CHAR, char)
00432 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00433 GET_FEATURE_TYPE(F_BYTE, int8_t)
00434 GET_FEATURE_TYPE(F_SHORT, int16_t)
00435 GET_FEATURE_TYPE(F_WORD, uint16_t)
00436 GET_FEATURE_TYPE(F_INT, int32_t)
00437 GET_FEATURE_TYPE(F_UINT, uint32_t)
00438 GET_FEATURE_TYPE(F_LONG, int64_t)
00439 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00440 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00441 GET_FEATURE_TYPE(F_DREAL, float64_t)
00442 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00443 #undef GET_FEATURE_TYPE
00444 
00445 
00446 template <class T>
00447 void CStreamingSimpleFeatures<T>::init()
00448 {
00449     working_file=NULL;
00450     current_vector=NULL;
00451     seekable=false;
00452     current_length=-1;
00453 }
00454 
00455 template <class T>
00456 void CStreamingSimpleFeatures<T>::init(CStreamingFile* file,
00457                     bool is_labelled,
00458                     int32_t size)
00459 {
00460     init();
00461     has_labels = is_labelled;
00462     working_file = file;
00463     parser.init(file, is_labelled, size);
00464     seekable=false;
00465 }
00466 
00467 template <class T>
00468 void CStreamingSimpleFeatures<T>::start_parser()
00469 {
00470     if (!parser.is_running())
00471         parser.start_parser();
00472 }
00473 
00474 template <class T>
00475 void CStreamingSimpleFeatures<T>::end_parser()
00476 {
00477     parser.end_parser();
00478 }
00479 
00480 template <class T>
00481 bool CStreamingSimpleFeatures<T>::get_next_example()
00482 {
00483     bool ret_value;
00484     ret_value = (bool) parser.get_next_example(current_vector,
00485                            current_length,
00486                            current_label);
00487 
00488     return ret_value;
00489 }
00490 
00491 template <class T>
00492 SGVector<T> CStreamingSimpleFeatures<T>::get_vector()
00493 {
00494     current_sgvector.vector=current_vector;
00495     current_sgvector.vlen=current_length;
00496 
00497     return current_sgvector;
00498 }
00499 
00500 template <class T>
00501 float64_t CStreamingSimpleFeatures<T>::get_label()
00502 {
00503     ASSERT(has_labels);
00504 
00505     return current_label;
00506 }
00507 
00508 template <class T>
00509 void CStreamingSimpleFeatures<T>::release_example()
00510 {
00511     parser.finalize_example();
00512 }
00513 
00514 template <class T>
00515 int32_t CStreamingSimpleFeatures<T>::get_dim_feature_space() const
00516 {
00517     return current_length;
00518 }
00519 
00520 template <class T>
00521     float32_t CStreamingSimpleFeatures<T>::dot(CStreamingDotFeatures* df)
00522 {
00523     ASSERT(df);
00524     ASSERT(df->get_feature_type() == get_feature_type());
00525     ASSERT(df->get_feature_class() == get_feature_class());
00526     CStreamingSimpleFeatures<T>* sf = (CStreamingSimpleFeatures<T>*) df;
00527 
00528     SGVector<T> other_vector=sf->get_vector();
00529 
00530     float32_t result = CMath::dot(current_vector, other_vector.vector, current_length);
00531 
00532     return result;
00533 }
00534 
00535 template <class T>
00536 float32_t CStreamingSimpleFeatures<T>::dot(SGVector<T> sgvec1)
00537 {
00538     int32_t len1;
00539     len1=sgvec1.vlen;
00540 
00541     if (len1 != current_length)
00542         SG_ERROR("Lengths %d and %d not equal while computing dot product!\n", len1, current_length);
00543 
00544     float32_t result=CMath::dot(current_vector, sgvec1.vector, len1);
00545     return result;
00546 }
00547 
00548 template <class T>
00549 int32_t CStreamingSimpleFeatures<T>::get_num_features()
00550 {
00551     return current_length;
00552 }
00553 
00554 template <class T>
00555 EFeatureClass CStreamingSimpleFeatures<T>::get_feature_class()
00556 {
00557     return C_STREAMING_SIMPLE;
00558 }
00559 
00560 }
00561 #endif // _STREAMING_SIMPLEFEATURES__H__