Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #ifndef _STREAMING_STRINGFEATURES__H__
00011 #define _STREAMING_STRINGFEATURES__H__
00012
00013 #include <shogun/lib/common.h>
00014 #include <shogun/mathematics/Math.h>
00015 #include <shogun/base/Parameter.h>
00016 #include <shogun/lib/DataType.h>
00017 #include <shogun/io/InputParser.h>
00018
00019 #include <shogun/features/StreamingFeatures.h>
00020 #include <shogun/features/Alphabet.h>
00021
00022 namespace shogun
00023 {
00027 template <class T> class CStreamingStringFeatures : public CStreamingFeatures
00028 {
00029 public:
00030
00038 CStreamingStringFeatures()
00039 : CStreamingFeatures()
00040 {
00041 init();
00042 set_read_functions();
00043 remap_to_bin=false;
00044 }
00045
00054 CStreamingStringFeatures(CStreamingFile* file,
00055 bool is_labelled,
00056 int32_t size)
00057 : CStreamingFeatures()
00058 {
00059 init(file, is_labelled, size);
00060 set_read_functions();
00061 remap_to_bin=false;
00062 }
00063
00069 virtual ~CStreamingStringFeatures()
00070 {
00071 parser.end_parser();
00072 SG_UNREF(alphabet);
00073 }
00074
00084 virtual void set_vector_reader();
00085
00095 virtual void set_vector_and_label_reader();
00096
00103 void use_alphabet(EAlphabet alpha)
00104 {
00105 SG_UNREF(alphabet);
00106
00107 alphabet=new CAlphabet(alpha);
00108 SG_REF(alphabet);
00109 num_symbols=alphabet->get_num_symbols();
00110 }
00111
00118 void use_alphabet(CAlphabet* alpha)
00119 {
00120 SG_UNREF(alphabet);
00121
00122 alphabet=new CAlphabet(alpha);
00123 SG_REF(alphabet);
00124 num_symbols=alphabet->get_num_symbols();
00125 }
00126
00134 void set_remap(CAlphabet* ascii_alphabet, CAlphabet* binary_alphabet)
00135 {
00136 remap_to_bin=true;
00137 alpha_ascii=new CAlphabet(ascii_alphabet);
00138 alpha_bin=new CAlphabet(binary_alphabet);
00139 }
00140
00148 void set_remap(EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
00149 {
00150 remap_to_bin=true;
00151 alpha_ascii=new CAlphabet(ascii_alphabet);
00152 alpha_bin=new CAlphabet(binary_alphabet);
00153 }
00154
00159 CAlphabet* get_alphabet()
00160 {
00161 SG_REF(alphabet);
00162 return alphabet;
00163 }
00164
00171 floatmax_t get_num_symbols()
00172 {
00173 return num_symbols;
00174 }
00175
00181 virtual void start_parser();
00182
00188 virtual void end_parser();
00189
00198 virtual bool get_next_example();
00199
00205 SGString<T> get_vector();
00206
00214 virtual float64_t get_label();
00215
00222 virtual void release_example();
00223
00229 virtual int32_t get_vector_length();
00230
00236 virtual inline EFeatureType get_feature_type();
00237
00243 virtual EFeatureClass get_feature_class();
00244
00250 virtual CFeatures* duplicate() const
00251 {
00252 return new CStreamingStringFeatures<T>(*this);
00253 }
00254
00260 inline virtual const char* get_name() const { return "StreamingStringFeatures"; }
00261
00267 inline virtual int32_t get_num_vectors() const
00268 {
00269 if (current_string)
00270 return 1;
00271 return 0;
00272 }
00273
00279 virtual int32_t get_size() { return sizeof(T); }
00280
00286 virtual int32_t get_num_features() { return current_length; }
00287
00288 private:
00289
00294 void init();
00295
00303 void init(CStreamingFile *file, bool is_labelled, int32_t size);
00304
00305 protected:
00306
00308 CInputParser<T> parser;
00309
00311 CAlphabet* alphabet;
00312
00314 CAlphabet* alpha_ascii;
00315
00317 CAlphabet* alpha_bin;
00318
00320 CStreamingFile* working_file;
00321
00323 SGString<T> current_sgstring;
00324
00326 T* current_string;
00327
00329 int32_t current_length;
00330
00332 float64_t current_label;
00333
00335 bool has_labels;
00336
00338 bool remap_to_bin;
00339
00341 int32_t num_symbols;
00342 };
00343
00344 template <class T> void CStreamingStringFeatures<T>::set_vector_reader()
00345 {
00346 parser.set_read_vector(&CStreamingFile::get_string);
00347 }
00348
00349 template <class T> void CStreamingStringFeatures<T>::set_vector_and_label_reader()
00350 {
00351 parser.set_read_vector_and_label
00352 (&CStreamingFile::get_string_and_label);
00353 }
00354
00355 #define GET_FEATURE_TYPE(f_type, sg_type) \
00356 template<> inline EFeatureType CStreamingStringFeatures<sg_type>::get_feature_type() \
00357 { \
00358 return f_type; \
00359 }
00360
00361 GET_FEATURE_TYPE(F_BOOL, bool)
00362 GET_FEATURE_TYPE(F_CHAR, char)
00363 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00364 GET_FEATURE_TYPE(F_BYTE, int8_t)
00365 GET_FEATURE_TYPE(F_SHORT, int16_t)
00366 GET_FEATURE_TYPE(F_WORD, uint16_t)
00367 GET_FEATURE_TYPE(F_INT, int32_t)
00368 GET_FEATURE_TYPE(F_UINT, uint32_t)
00369 GET_FEATURE_TYPE(F_LONG, int64_t)
00370 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00371 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00372 GET_FEATURE_TYPE(F_DREAL, float64_t)
00373 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00374 #undef GET_FEATURE_TYPE
00375
00376
00377 template <class T>
00378 void CStreamingStringFeatures<T>::init()
00379 {
00380 working_file=NULL;
00381 alphabet=new CAlphabet();
00382
00383 current_string=NULL;
00384 current_length=-1;
00385 current_sgstring.string=current_string;
00386 current_sgstring.slen=current_length;
00387 }
00388
00389 template <class T>
00390 void CStreamingStringFeatures<T>::init(CStreamingFile* file,
00391 bool is_labelled,
00392 int32_t size)
00393 {
00394 init();
00395 has_labels=is_labelled;
00396 working_file=file;
00397 parser.init(file, is_labelled, size);
00398 parser.set_free_vector_after_release(false);
00399 parser.set_free_vectors_on_destruct(false);
00400 }
00401
00402 template <class T>
00403 void CStreamingStringFeatures<T>::start_parser()
00404 {
00405 if (!remap_to_bin)
00406 alpha_ascii=alphabet;
00407
00408 if (!parser.is_running())
00409 parser.start_parser();
00410 }
00411
00412 template <class T>
00413 void CStreamingStringFeatures<T>::end_parser()
00414 {
00415 parser.end_parser();
00416 }
00417
00418 template <class T>
00419 bool CStreamingStringFeatures<T>::get_next_example()
00420 {
00421 bool ret_value;
00422
00423 ret_value = (bool) parser.get_next_example(current_string,
00424 current_length,
00425 current_label);
00426
00427 if (!ret_value)
00428 return false;
00429
00430 int32_t i;
00431 if (remap_to_bin)
00432 {
00433 alpha_ascii->add_string_to_histogram(current_string, current_length);
00434
00435 for (i=0; i<current_length; i++)
00436 current_string[i]=alpha_ascii->remap_to_bin(current_string[i]);
00437 alpha_bin->add_string_to_histogram(current_string, current_length);
00438 }
00439 else
00440 {
00441 alpha_ascii->add_string_to_histogram(current_string, current_length);
00442 }
00443
00444
00445 if ( !(alpha_ascii->check_alphabet_size() && alpha_ascii->check_alphabet()) )
00446 {
00447 SG_ERROR("StreamingStringFeatures: The given input was found to be incompatible with the alphabet!\n");
00448 return 0;
00449 }
00450
00451
00452
00453 if (remap_to_bin)
00454 alphabet=alpha_bin;
00455 else
00456 alphabet=alpha_ascii;
00457
00458
00459 num_symbols=alphabet->get_num_symbols();
00460
00461 return ret_value;
00462 }
00463
00464 template <class T>
00465 SGString<T> CStreamingStringFeatures<T>::get_vector()
00466 {
00467 current_sgstring.string=current_string;
00468 current_sgstring.slen=current_length;
00469
00470 return current_sgstring;
00471 }
00472
00473 template <class T>
00474 float64_t CStreamingStringFeatures<T>::get_label()
00475 {
00476 ASSERT(has_labels);
00477
00478 return current_label;
00479 }
00480
00481 template <class T>
00482 void CStreamingStringFeatures<T>::release_example()
00483 {
00484 parser.finalize_example();
00485 }
00486
00487 template <class T>
00488 int32_t CStreamingStringFeatures<T>::get_vector_length()
00489 {
00490 return current_length;
00491 }
00492
00493 template <class T>
00494 EFeatureClass CStreamingStringFeatures<T>::get_feature_class()
00495 {
00496 return C_STREAMING_STRING;
00497 }
00498
00499 }
00500 #endif // _STREAMING_STRINGFEATURES__H__