#include <shogun/features/streaming/StreamingStringFeatures.h>
00002
00003 namespace shogun
00004 {
00005
00006
00007 template <class T>
00008 CStreamingStringFeatures<T>::CStreamingStringFeatures() : CStreamingFeatures()
00009 {
00010 init();
00011 set_read_functions();
00012 remap_to_bin=false;
00013 }
00014
00015 template <class T>
00016 CStreamingStringFeatures<T>::CStreamingStringFeatures(CStreamingFile* file,
00017 bool is_labelled,
00018 int32_t size)
00019 : CStreamingFeatures()
00020 {
00021 init(file, is_labelled, size);
00022 set_read_functions();
00023 remap_to_bin=false;
00024 }
00025
00026 template <class T>
00027 CStreamingStringFeatures<T>::~CStreamingStringFeatures()
00028 {
00029 parser.end_parser();
00030 SG_UNREF(alphabet);
00031 }
00032
00033 template <class T>
00034 void CStreamingStringFeatures<T>::use_alphabet(EAlphabet alpha)
00035 {
00036 SG_UNREF(alphabet);
00037
00038 alphabet=new CAlphabet(alpha);
00039 SG_REF(alphabet);
00040 num_symbols=alphabet->get_num_symbols();
00041 }
00042
00043 template <class T>
00044 void CStreamingStringFeatures<T>::use_alphabet(CAlphabet* alpha)
00045 {
00046 SG_UNREF(alphabet);
00047
00048 alphabet=new CAlphabet(alpha);
00049 SG_REF(alphabet);
00050 num_symbols=alphabet->get_num_symbols();
00051 }
00052
00053 template <class T>
00054 void CStreamingStringFeatures<T>::set_remap(CAlphabet* ascii_alphabet, CAlphabet* binary_alphabet)
00055 {
00056 remap_to_bin=true;
00057 alpha_ascii=new CAlphabet(ascii_alphabet);
00058 alpha_bin=new CAlphabet(binary_alphabet);
00059 }
00060
00061 template <class T>
00062 void CStreamingStringFeatures<T>::set_remap(EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
00063 {
00064 remap_to_bin=true;
00065 alpha_ascii=new CAlphabet(ascii_alphabet);
00066 alpha_bin=new CAlphabet(binary_alphabet);
00067 }
00068
00069 template <class T>
00070 CAlphabet* CStreamingStringFeatures<T>::get_alphabet()
00071 {
00072 SG_REF(alphabet);
00073 return alphabet;
00074 }
00075
00076 template <class T>
00077 floatmax_t CStreamingStringFeatures<T>::get_num_symbols()
00078 {
00079 return num_symbols;
00080 }
00081
00082 template <class T>
00083 CFeatures* CStreamingStringFeatures<T>::duplicate() const
00084 {
00085 return new CStreamingStringFeatures<T>(*this);
00086 }
00087
00088 template <class T>
00089 int32_t CStreamingStringFeatures<T>::get_num_vectors() const
00090 {
00091 if (current_string)
00092 return 1;
00093 return 0;
00094 }
00095
00096 template <class T>
00097 int32_t CStreamingStringFeatures<T>::get_size() const
00098 {
00099 return sizeof(T);
00100 }
00101
00102 template <class T>
00103 int32_t CStreamingStringFeatures<T>::get_num_features()
00104 {
00105 return current_length;
00106 }
00107
00108 template <class T> void CStreamingStringFeatures<T>::set_vector_reader()
00109 {
00110 parser.set_read_vector(&CStreamingFile::get_string);
00111 }
00112
00113 template <class T> void CStreamingStringFeatures<T>::set_vector_and_label_reader()
00114 {
00115 parser.set_read_vector_and_label
00116 (&CStreamingFile::get_string_and_label);
00117 }
00118
00119 #define GET_FEATURE_TYPE(f_type, sg_type) \
00120 template<> EFeatureType CStreamingStringFeatures<sg_type>::get_feature_type() const \
00121 { \
00122 return f_type; \
00123 }
00124
00125 GET_FEATURE_TYPE(F_BOOL, bool)
00126 GET_FEATURE_TYPE(F_CHAR, char)
00127 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00128 GET_FEATURE_TYPE(F_BYTE, int8_t)
00129 GET_FEATURE_TYPE(F_SHORT, int16_t)
00130 GET_FEATURE_TYPE(F_WORD, uint16_t)
00131 GET_FEATURE_TYPE(F_INT, int32_t)
00132 GET_FEATURE_TYPE(F_UINT, uint32_t)
00133 GET_FEATURE_TYPE(F_LONG, int64_t)
00134 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00135 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00136 GET_FEATURE_TYPE(F_DREAL, float64_t)
00137 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00138 #undef GET_FEATURE_TYPE
00139
00140
00141 template <class T>
00142 void CStreamingStringFeatures<T>::init()
00143 {
00144 working_file=NULL;
00145 alphabet=new CAlphabet();
00146
00147 current_string=NULL;
00148 current_length=-1;
00149 current_sgstring.string=current_string;
00150 current_sgstring.slen=current_length;
00151 }
00152
00153 template <class T>
00154 void CStreamingStringFeatures<T>::init(CStreamingFile* file,
00155 bool is_labelled,
00156 int32_t size)
00157 {
00158 init();
00159 has_labels=is_labelled;
00160 working_file=file;
00161 parser.init(file, is_labelled, size);
00162 parser.set_free_vector_after_release(false);
00163 parser.set_free_vectors_on_destruct(false);
00164 }
00165
00166 template <class T>
00167 void CStreamingStringFeatures<T>::start_parser()
00168 {
00169 if (!remap_to_bin)
00170 alpha_ascii=alphabet;
00171
00172 if (!parser.is_running())
00173 parser.start_parser();
00174 }
00175
00176 template <class T>
00177 void CStreamingStringFeatures<T>::end_parser()
00178 {
00179 parser.end_parser();
00180 }
00181
00182 template <class T>
00183 bool CStreamingStringFeatures<T>::get_next_example()
00184 {
00185 bool ret_value;
00186
00187 ret_value = (bool) parser.get_next_example(current_string,
00188 current_length,
00189 current_label);
00190
00191 if (!ret_value)
00192 return false;
00193
00194 int32_t i;
00195 if (remap_to_bin)
00196 {
00197 alpha_ascii->add_string_to_histogram(current_string, current_length);
00198
00199 for (i=0; i<current_length; i++)
00200 current_string[i]=alpha_ascii->remap_to_bin(current_string[i]);
00201 alpha_bin->add_string_to_histogram(current_string, current_length);
00202 }
00203 else
00204 {
00205 alpha_ascii->add_string_to_histogram(current_string, current_length);
00206 }
00207
00208
00209 if ( !(alpha_ascii->check_alphabet_size() && alpha_ascii->check_alphabet()) )
00210 {
00211 SG_ERROR("StreamingStringFeatures: The given input was found to be incompatible with the alphabet!\n");
00212 return 0;
00213 }
00214
00215
00216
00217 if (remap_to_bin)
00218 alphabet=alpha_bin;
00219 else
00220 alphabet=alpha_ascii;
00221
00222
00223 num_symbols=alphabet->get_num_symbols();
00224
00225 return ret_value;
00226 }
00227
00228 template <class T>
00229 SGString<T> CStreamingStringFeatures<T>::get_vector()
00230 {
00231 current_sgstring.string=current_string;
00232 current_sgstring.slen=current_length;
00233
00234 return current_sgstring;
00235 }
00236
00237 template <class T>
00238 float64_t CStreamingStringFeatures<T>::get_label()
00239 {
00240 ASSERT(has_labels);
00241
00242 return current_label;
00243 }
00244
00245 template <class T>
00246 void CStreamingStringFeatures<T>::release_example()
00247 {
00248 parser.finalize_example();
00249 }
00250
00251 template <class T>
00252 int32_t CStreamingStringFeatures<T>::get_vector_length()
00253 {
00254 return current_length;
00255 }
00256
00257 template <class T>
00258 EFeatureClass CStreamingStringFeatures<T>::get_feature_class() const
00259 {
00260 return C_STREAMING_STRING;
00261 }
00262
00263 template class CStreamingStringFeatures<bool>;
00264 template class CStreamingStringFeatures<char>;
00265 template class CStreamingStringFeatures<int8_t>;
00266 template class CStreamingStringFeatures<uint8_t>;
00267 template class CStreamingStringFeatures<int16_t>;
00268 template class CStreamingStringFeatures<uint16_t>;
00269 template class CStreamingStringFeatures<int32_t>;
00270 template class CStreamingStringFeatures<uint32_t>;
00271 template class CStreamingStringFeatures<int64_t>;
00272 template class CStreamingStringFeatures<uint64_t>;
00273 template class CStreamingStringFeatures<float32_t>;
00274 template class CStreamingStringFeatures<float64_t>;
00275 template class CStreamingStringFeatures<floatmax_t>;
00276
00277 }