StreamingStringFeatures.cpp

Go to the documentation of this file.
00001 #include <shogun/features/streaming/StreamingStringFeatures.h>
00002 
00003 namespace shogun
00004 {
00005 
00006 
00007 template <class T>
00008 CStreamingStringFeatures<T>::CStreamingStringFeatures() : CStreamingFeatures()
00009 {
00010     init();
00011     set_read_functions();
00012     remap_to_bin=false;
00013 }
00014 
00015 template <class T>
00016 CStreamingStringFeatures<T>::CStreamingStringFeatures(CStreamingFile* file,
00017              bool is_labelled,
00018              int32_t size)
00019     : CStreamingFeatures()
00020 {
00021     init(file, is_labelled, size);
00022     set_read_functions();
00023     remap_to_bin=false;
00024 }
00025 
00026 template <class T>
00027 CStreamingStringFeatures<T>::~CStreamingStringFeatures()
00028 {
00029     parser.end_parser();
00030     SG_UNREF(alphabet);
00031 }
00032 
00033 template <class T>
00034 void CStreamingStringFeatures<T>::use_alphabet(EAlphabet alpha)
00035 {
00036     SG_UNREF(alphabet);
00037 
00038     alphabet=new CAlphabet(alpha);
00039     SG_REF(alphabet);
00040     num_symbols=alphabet->get_num_symbols();
00041 }
00042 
00043 template <class T>
00044 void CStreamingStringFeatures<T>::use_alphabet(CAlphabet* alpha)
00045 {
00046     SG_UNREF(alphabet);
00047 
00048     alphabet=new CAlphabet(alpha);
00049     SG_REF(alphabet);
00050     num_symbols=alphabet->get_num_symbols();
00051 }
00052 
00053 template <class T>
00054 void CStreamingStringFeatures<T>::set_remap(CAlphabet* ascii_alphabet, CAlphabet* binary_alphabet)
00055 {
00056     remap_to_bin=true;
00057     alpha_ascii=new CAlphabet(ascii_alphabet);
00058     alpha_bin=new CAlphabet(binary_alphabet);
00059 }
00060 
00061 template <class T>
00062 void CStreamingStringFeatures<T>::set_remap(EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
00063 {
00064     remap_to_bin=true;
00065     alpha_ascii=new CAlphabet(ascii_alphabet);
00066     alpha_bin=new CAlphabet(binary_alphabet);
00067 }
00068 
00069 template <class T>
00070 CAlphabet* CStreamingStringFeatures<T>::get_alphabet()
00071 {
00072     SG_REF(alphabet);
00073     return alphabet;
00074 }
00075 
00076 template <class T>
00077 floatmax_t CStreamingStringFeatures<T>::get_num_symbols()
00078 {
00079     return num_symbols;
00080 }
00081 
00082 template <class T>
00083 CFeatures* CStreamingStringFeatures<T>::duplicate() const
00084 {
00085     return new CStreamingStringFeatures<T>(*this);
00086 }
00087 
00088 template <class T>
00089 int32_t CStreamingStringFeatures<T>::get_num_vectors() const
00090 {
00091     if (current_string)
00092         return 1;
00093     return 0;
00094 }
00095 
00096 template <class T>
00097 int32_t CStreamingStringFeatures<T>::get_size() const
00098 {
00099     return sizeof(T);
00100 }
00101 
00102 template <class T>
00103 int32_t CStreamingStringFeatures<T>::get_num_features()
00104 {
00105     return current_length;
00106 }
00107 
00108 template <class T> void CStreamingStringFeatures<T>::set_vector_reader()
00109 {
00110     parser.set_read_vector(&CStreamingFile::get_string);
00111 }
00112 
00113 template <class T> void CStreamingStringFeatures<T>::set_vector_and_label_reader()
00114 {
00115     parser.set_read_vector_and_label
00116         (&CStreamingFile::get_string_and_label);
00117 }
00118 
00119 #define GET_FEATURE_TYPE(f_type, sg_type)               \
00120 template<> EFeatureType CStreamingStringFeatures<sg_type>::get_feature_type() const \
00121 {                                   \
00122     return f_type;                          \
00123 }
00124 
00125 GET_FEATURE_TYPE(F_BOOL, bool)
00126 GET_FEATURE_TYPE(F_CHAR, char)
00127 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00128 GET_FEATURE_TYPE(F_BYTE, int8_t)
00129 GET_FEATURE_TYPE(F_SHORT, int16_t)
00130 GET_FEATURE_TYPE(F_WORD, uint16_t)
00131 GET_FEATURE_TYPE(F_INT, int32_t)
00132 GET_FEATURE_TYPE(F_UINT, uint32_t)
00133 GET_FEATURE_TYPE(F_LONG, int64_t)
00134 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00135 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00136 GET_FEATURE_TYPE(F_DREAL, float64_t)
00137 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00138 #undef GET_FEATURE_TYPE
00139 
00140 
00141 template <class T>
00142 void CStreamingStringFeatures<T>::init()
00143 {
00144     working_file=NULL;
00145     alphabet=new CAlphabet();
00146 
00147     current_string=NULL;
00148     current_length=-1;
00149     current_sgstring.string=current_string;
00150     current_sgstring.slen=current_length;
00151 }
00152 
00153 template <class T>
00154 void CStreamingStringFeatures<T>::init(CStreamingFile* file,
00155                        bool is_labelled,
00156                        int32_t size)
00157 {
00158     init();
00159     has_labels=is_labelled;
00160     working_file=file;
00161     parser.init(file, is_labelled, size);
00162     parser.set_free_vector_after_release(false);
00163     parser.set_free_vectors_on_destruct(false);
00164 }
00165 
00166 template <class T>
00167 void CStreamingStringFeatures<T>::start_parser()
00168 {
00169     if (!remap_to_bin)
00170         alpha_ascii=alphabet;
00171 
00172     if (!parser.is_running())
00173         parser.start_parser();
00174 }
00175 
00176 template <class T>
00177 void CStreamingStringFeatures<T>::end_parser()
00178 {
00179     parser.end_parser();
00180 }
00181 
00182 template <class T>
00183 bool CStreamingStringFeatures<T>::get_next_example()
00184 {
00185     bool ret_value;
00186 
00187     ret_value = (bool) parser.get_next_example(current_string,
00188                            current_length,
00189                            current_label);
00190 
00191     if (!ret_value)
00192         return false;
00193 
00194     int32_t i;
00195     if (remap_to_bin)
00196     {
00197         alpha_ascii->add_string_to_histogram(current_string, current_length);
00198 
00199         for (i=0; i<current_length; i++)
00200             current_string[i]=alpha_ascii->remap_to_bin(current_string[i]);
00201         alpha_bin->add_string_to_histogram(current_string, current_length);
00202     }
00203     else
00204     {
00205         alpha_ascii->add_string_to_histogram(current_string, current_length);
00206     }
00207 
00208     /* Check the input using src alphabet, alpha_ascii */
00209     if ( !(alpha_ascii->check_alphabet_size() && alpha_ascii->check_alphabet()) )
00210     {
00211         SG_ERROR("StreamingStringFeatures: The given input was found to be incompatible with the alphabet!\n");
00212         return 0;
00213     }
00214 
00215     //SG_UNREF(alphabet);
00216 
00217     if (remap_to_bin)
00218         alphabet=alpha_bin;
00219     else
00220         alphabet=alpha_ascii;
00221 
00222     //SG_REF(alphabet);
00223     num_symbols=alphabet->get_num_symbols();
00224 
00225     return ret_value;
00226 }
00227 
00228 template <class T>
00229 SGString<T> CStreamingStringFeatures<T>::get_vector()
00230 {
00231     current_sgstring.string=current_string;
00232     current_sgstring.slen=current_length;
00233 
00234     return current_sgstring;
00235 }
00236 
00237 template <class T>
00238 float64_t CStreamingStringFeatures<T>::get_label()
00239 {
00240     ASSERT(has_labels);
00241 
00242     return current_label;
00243 }
00244 
00245 template <class T>
00246 void CStreamingStringFeatures<T>::release_example()
00247 {
00248     parser.finalize_example();
00249 }
00250 
00251 template <class T>
00252 int32_t CStreamingStringFeatures<T>::get_vector_length()
00253 {
00254     return current_length;
00255 }
00256 
00257 template <class T>
00258 EFeatureClass CStreamingStringFeatures<T>::get_feature_class() const
00259 {
00260     return C_STREAMING_STRING;
00261 }
00262 
00263 template class CStreamingStringFeatures<bool>;
00264 template class CStreamingStringFeatures<char>;
00265 template class CStreamingStringFeatures<int8_t>;
00266 template class CStreamingStringFeatures<uint8_t>;
00267 template class CStreamingStringFeatures<int16_t>;
00268 template class CStreamingStringFeatures<uint16_t>;
00269 template class CStreamingStringFeatures<int32_t>;
00270 template class CStreamingStringFeatures<uint32_t>;
00271 template class CStreamingStringFeatures<int64_t>;
00272 template class CStreamingStringFeatures<uint64_t>;
00273 template class CStreamingStringFeatures<float32_t>;
00274 template class CStreamingStringFeatures<float64_t>;
00275 template class CStreamingStringFeatures<floatmax_t>;
00276 
00277 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation