StreamingStringFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2011 Shashwat Lal Das
00008  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
00009  */
00010 #ifndef _STREAMING_STRINGFEATURES__H__
00011 #define _STREAMING_STRINGFEATURES__H__
00012 
00013 #include <shogun/lib/common.h>
00014 #include <shogun/mathematics/Math.h>
00015 #include <shogun/base/Parameter.h>
00016 #include <shogun/lib/DataType.h>
00017 #include <shogun/io/InputParser.h>
00018 
00019 #include <shogun/features/StreamingFeatures.h>
00020 #include <shogun/features/Alphabet.h>
00021 
00022 namespace shogun
00023 {
00027 template <class T> class CStreamingStringFeatures : public CStreamingFeatures
00028 {
00029 public:
00030 
00038     CStreamingStringFeatures()
00039         : CStreamingFeatures()
00040     {
00041         init();
00042         set_read_functions();
00043         remap_to_bin=false;
00044     }
00045 
00054     CStreamingStringFeatures(CStreamingFile* file,
00055                  bool is_labelled,
00056                  int32_t size)
00057         : CStreamingFeatures()
00058     {
00059         init(file, is_labelled, size);
00060         set_read_functions();
00061         remap_to_bin=false;
00062     }
00063 
00069     virtual ~CStreamingStringFeatures()
00070     {
00071         parser.end_parser();
00072         SG_UNREF(alphabet);
00073     }
00074 
00084     virtual void set_vector_reader();
00085 
00095     virtual void set_vector_and_label_reader();
00096 
00103     void use_alphabet(EAlphabet alpha)
00104     {
00105         SG_UNREF(alphabet);
00106 
00107         alphabet=new CAlphabet(alpha);
00108         SG_REF(alphabet);
00109         num_symbols=alphabet->get_num_symbols();
00110     }
00111 
00118     void use_alphabet(CAlphabet* alpha)
00119     {
00120         SG_UNREF(alphabet);
00121 
00122         alphabet=new CAlphabet(alpha);
00123         SG_REF(alphabet);
00124         num_symbols=alphabet->get_num_symbols();
00125     }
00126 
00134     void set_remap(CAlphabet* ascii_alphabet, CAlphabet* binary_alphabet)
00135     {
00136         remap_to_bin=true;
00137         alpha_ascii=new CAlphabet(ascii_alphabet);
00138         alpha_bin=new CAlphabet(binary_alphabet);
00139     }
00140 
00148     void set_remap(EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
00149     {
00150         remap_to_bin=true;
00151         alpha_ascii=new CAlphabet(ascii_alphabet);
00152         alpha_bin=new CAlphabet(binary_alphabet);
00153     }
00154 
00159     CAlphabet* get_alphabet()
00160     {
00161         SG_REF(alphabet);
00162         return alphabet;
00163     }
00164     
00171     floatmax_t get_num_symbols()
00172     {
00173         return num_symbols;
00174     }
00175 
00181     virtual void start_parser();
00182 
00188     virtual void end_parser();
00189 
00198     virtual bool get_next_example();
00199 
00205     SGString<T> get_vector();
00206 
00214     virtual float64_t get_label();
00215 
00222     virtual void release_example();
00223 
00229     virtual int32_t get_vector_length();
00230 
00236     virtual inline EFeatureType get_feature_type();
00237 
00243     virtual EFeatureClass get_feature_class();
00244 
00250     virtual CFeatures* duplicate() const
00251     {
00252         return new CStreamingStringFeatures<T>(*this);
00253     }
00254 
00260     inline virtual const char* get_name() const { return "StreamingStringFeatures"; }
00261 
00267     inline virtual int32_t get_num_vectors() const
00268     {
00269         if (current_string)
00270             return 1;
00271         return 0;
00272     }
00273 
00279     virtual int32_t get_size() { return sizeof(T); }
00280 
00286     virtual int32_t get_num_features() { return current_length; }
00287 
00288 private:
00289 
00294     void init();
00295     
00303     void init(CStreamingFile *file, bool is_labelled, int32_t size);
00304 
00305 protected:
00306 
00308     CInputParser<T> parser;
00309 
00311     CAlphabet* alphabet;
00312 
00314     CAlphabet* alpha_ascii;
00315 
00317     CAlphabet* alpha_bin;
00318 
00320     CStreamingFile* working_file;
00321 
00323     SGString<T> current_sgstring;
00324 
00326     T* current_string;
00327 
00329     int32_t current_length;
00330 
00332     float64_t current_label;
00333 
00335     bool has_labels;
00336 
00338     bool remap_to_bin;
00339 
00341     int32_t num_symbols;
00342 };
00343 
00344 template <class T> void CStreamingStringFeatures<T>::set_vector_reader()
00345 {
00346     parser.set_read_vector(&CStreamingFile::get_string);
00347 }
00348 
00349 template <class T> void CStreamingStringFeatures<T>::set_vector_and_label_reader()
00350 {
00351     parser.set_read_vector_and_label
00352         (&CStreamingFile::get_string_and_label);
00353 }
00354 
00355 #define GET_FEATURE_TYPE(f_type, sg_type)               \
00356 template<> inline EFeatureType CStreamingStringFeatures<sg_type>::get_feature_type() \
00357 {                                   \
00358     return f_type;                          \
00359 }
00360 
00361 GET_FEATURE_TYPE(F_BOOL, bool)
00362 GET_FEATURE_TYPE(F_CHAR, char)
00363 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00364 GET_FEATURE_TYPE(F_BYTE, int8_t)
00365 GET_FEATURE_TYPE(F_SHORT, int16_t)
00366 GET_FEATURE_TYPE(F_WORD, uint16_t)
00367 GET_FEATURE_TYPE(F_INT, int32_t)
00368 GET_FEATURE_TYPE(F_UINT, uint32_t)
00369 GET_FEATURE_TYPE(F_LONG, int64_t)
00370 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00371 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00372 GET_FEATURE_TYPE(F_DREAL, float64_t)
00373 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00374 #undef GET_FEATURE_TYPE
00375 
00376     
00377 template <class T>
00378 void CStreamingStringFeatures<T>::init()
00379 {
00380     working_file=NULL;
00381     alphabet=new CAlphabet();
00382 
00383     current_string=NULL;
00384     current_length=-1;
00385     current_sgstring.string=current_string;
00386     current_sgstring.slen=current_length;
00387 }
00388 
00389 template <class T>
00390 void CStreamingStringFeatures<T>::init(CStreamingFile* file,
00391                        bool is_labelled,
00392                        int32_t size)
00393 {
00394     init();
00395     has_labels=is_labelled;
00396     working_file=file;
00397     parser.init(file, is_labelled, size);
00398     parser.set_free_vector_after_release(false);
00399     parser.set_free_vectors_on_destruct(false);
00400 }
00401     
00402 template <class T>
00403 void CStreamingStringFeatures<T>::start_parser()
00404 {
00405     if (!remap_to_bin)
00406         alpha_ascii=alphabet;
00407     
00408     if (!parser.is_running())
00409         parser.start_parser();
00410 }
00411 
00412 template <class T>
00413 void CStreamingStringFeatures<T>::end_parser()
00414 {
00415     parser.end_parser();
00416 }
00417 
00418 template <class T>
00419 bool CStreamingStringFeatures<T>::get_next_example()
00420 {
00421     bool ret_value;
00422     
00423     ret_value = (bool) parser.get_next_example(current_string,
00424                            current_length,
00425                            current_label);
00426 
00427     if (!ret_value)
00428         return false;
00429     
00430     int32_t i;
00431     if (remap_to_bin)
00432     {
00433         alpha_ascii->add_string_to_histogram(current_string, current_length);
00434 
00435         for (i=0; i<current_length; i++)
00436             current_string[i]=alpha_ascii->remap_to_bin(current_string[i]);
00437         alpha_bin->add_string_to_histogram(current_string, current_length);
00438     }
00439     else
00440     {
00441         alpha_ascii->add_string_to_histogram(current_string, current_length);
00442     }
00443 
00444     /* Check the input using src alphabet, alpha_ascii */
00445     if ( !(alpha_ascii->check_alphabet_size() && alpha_ascii->check_alphabet()) )
00446     {
00447         SG_ERROR("StreamingStringFeatures: The given input was found to be incompatible with the alphabet!\n");
00448         return 0;
00449     }
00450 
00451     //SG_UNREF(alphabet);
00452 
00453     if (remap_to_bin)
00454         alphabet=alpha_bin;
00455     else
00456         alphabet=alpha_ascii;
00457     
00458     //SG_REF(alphabet);
00459     num_symbols=alphabet->get_num_symbols();
00460             
00461     return ret_value;
00462 }
00463 
00464 template <class T>
00465 SGString<T> CStreamingStringFeatures<T>::get_vector()
00466 {
00467     current_sgstring.string=current_string;
00468     current_sgstring.slen=current_length;
00469 
00470     return current_sgstring;
00471 }
00472 
00473 template <class T>
00474 float64_t CStreamingStringFeatures<T>::get_label()
00475 {
00476     ASSERT(has_labels);
00477 
00478     return current_label;
00479 }
00480     
00481 template <class T>
00482 void CStreamingStringFeatures<T>::release_example()
00483 {
00484     parser.finalize_example();
00485 }
00486 
00487 template <class T>
00488 int32_t CStreamingStringFeatures<T>::get_vector_length()
00489 {
00490     return current_length;
00491 }
00492 
00493 template <class T>
00494 EFeatureClass CStreamingStringFeatures<T>::get_feature_class()
00495 {
00496     return C_STREAMING_STRING;
00497 }
00498 
00499 }
00500 #endif // _STREAMING_STRINGFEATURES__H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation