00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2011 Shashwat Lal Das 00008 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society 00009 */ 00010 #ifndef _STREAMING_STRINGFEATURES__H__ 00011 #define _STREAMING_STRINGFEATURES__H__ 00012 00013 #include <shogun/lib/common.h> 00014 #include <shogun/mathematics/Math.h> 00015 #include <shogun/base/Parameter.h> 00016 #include <shogun/lib/DataType.h> 00017 #include <shogun/io/streaming/InputParser.h> 00018 00019 #include <shogun/features/streaming/StreamingFeatures.h> 00020 #include <shogun/features/Alphabet.h> 00021 00022 namespace shogun 00023 { 00027 template <class T> class CStreamingStringFeatures : public CStreamingFeatures 00028 { 00029 public: 00030 00038 CStreamingStringFeatures(); 00039 00048 CStreamingStringFeatures(CStreamingFile* file, 00049 bool is_labelled, 00050 int32_t size); 00051 00057 virtual ~CStreamingStringFeatures(); 00058 00068 virtual void set_vector_reader(); 00069 00079 virtual void set_vector_and_label_reader(); 00080 00087 void use_alphabet(EAlphabet alpha); 00088 00095 void use_alphabet(CAlphabet* alpha); 00096 00104 void set_remap(CAlphabet* ascii_alphabet, CAlphabet* binary_alphabet); 00105 00113 void set_remap(EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA); 00114 00119 CAlphabet* get_alphabet(); 00120 00127 floatmax_t get_num_symbols(); 00128 00134 virtual void start_parser(); 00135 00141 virtual void end_parser(); 00142 00151 virtual bool get_next_example(); 00152 00158 SGString<T> get_vector(); 00159 00167 virtual float64_t get_label(); 00168 00175 virtual void release_example(); 00176 00182 virtual int32_t get_vector_length(); 00183 00189 virtual EFeatureType get_feature_type() const; 00190 00196 virtual EFeatureClass get_feature_class() const; 00197 00203 virtual CFeatures* duplicate() const; 00204 00210 virtual const char* get_name() const { return "StreamingStringFeatures"; } 00211 00217 virtual int32_t get_num_vectors() const; 00218 00224 virtual int32_t get_size() const; 00225 00231 virtual int32_t get_num_features(); 00232 00233 private: 00234 00239 void init(); 00240 00248 void init(CStreamingFile *file, bool is_labelled, int32_t size); 00249 00250 protected: 00251 00253 CInputParser<T> parser; 00254 00256 CAlphabet* alphabet; 00257 00259 CAlphabet* alpha_ascii; 00260 00262 CAlphabet* alpha_bin; 00263 00265 CStreamingFile* working_file; 00266 00268 SGString<T> current_sgstring; 00269 00271 T* current_string; 00272 00274 int32_t current_length; 00275 00277 float64_t current_label; 00278 00280 bool has_labels; 00281 00283 bool remap_to_bin; 00284 00286 int32_t num_symbols; 00287 }; 00288 00289 } 00290 #endif // _STREAMING_STRINGFEATURES__H__