Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/lib/common.h>
00013 #include <shogun/io/SGIO.h>
00014
00015 #include <shogun/base/Parameter.h>
00016
00017 #include <shogun/distributions/LinearHMM.h>
00018 #include <shogun/features/StringFeatures.h>
00019
00020 using namespace shogun;
00021
00022 CLinearHMM::CLinearHMM() : CDistribution()
00023 {
00024 init();
00025 }
00026
00027 CLinearHMM::CLinearHMM(CStringFeatures<uint16_t>* f)
00028 : CDistribution()
00029 {
00030 init();
00031
00032 set_features(f);
00033 sequence_length = f->get_vector_length(0);
00034 num_symbols = (int32_t) f->get_num_symbols();
00035 num_params = sequence_length*num_symbols;
00036 }
00037
00038 CLinearHMM::CLinearHMM(int32_t p_num_features, int32_t p_num_symbols)
00039 : CDistribution()
00040 {
00041 init();
00042
00043 sequence_length = p_num_features;
00044 num_symbols = p_num_symbols;
00045 num_params = sequence_length*num_symbols;
00046 }
00047
00048 CLinearHMM::~CLinearHMM()
00049 {
00050 SG_FREE(transition_probs);
00051 SG_FREE(log_transition_probs);
00052 }
00053
00054 bool CLinearHMM::train(CFeatures* data)
00055 {
00056 if (data)
00057 {
00058 if (data->get_feature_class() != C_STRING ||
00059 data->get_feature_type() != F_WORD)
00060 {
00061 SG_ERROR("Expected features of class string type word!\n");
00062 }
00063 set_features(data);
00064 }
00065 SG_FREE(transition_probs);
00066 SG_FREE(log_transition_probs);
00067 int32_t* int_transition_probs=SG_MALLOC(int32_t, num_params);
00068
00069 int32_t vec;
00070 int32_t i;
00071
00072 for (i=0; i< num_params; i++)
00073 int_transition_probs[i]=0;
00074
00075 for (vec=0; vec<features->get_num_vectors(); vec++)
00076 {
00077 int32_t len;
00078 bool free_vec;
00079
00080 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00081 get_feature_vector(vec, len, free_vec);
00082
00083
00084 for (int32_t feat=0; feat<len ; feat++)
00085 int_transition_probs[feat*num_symbols+vector[feat]]++;
00086
00087 ((CStringFeatures<uint16_t>*) features)->
00088 free_feature_vector(vector, vec, free_vec);
00089 }
00090
00091
00092 transition_probs=SG_MALLOC(float64_t, num_params);
00093 log_transition_probs=SG_MALLOC(float64_t, num_params);
00094
00095 for (i=0;i<sequence_length;i++)
00096 {
00097 for (int32_t j=0; j<num_symbols; j++)
00098 {
00099 float64_t sum=0;
00100 int32_t offs=i*num_symbols+
00101 ((CStringFeatures<uint16_t> *) features)->
00102 get_masked_symbols((uint16_t)j,(uint8_t) 254);
00103 int32_t original_num_symbols=(int32_t)
00104 ((CStringFeatures<uint16_t> *) features)->
00105 get_original_num_symbols();
00106
00107 for (int32_t k=0; k<original_num_symbols; k++)
00108 sum+=int_transition_probs[offs+k];
00109
00110 transition_probs[i*num_symbols+j]=
00111 (int_transition_probs[i*num_symbols+j]+pseudo_count)/
00112 (sum+((CStringFeatures<uint16_t> *) features)->
00113 get_original_num_symbols()*pseudo_count);
00114 log_transition_probs[i*num_symbols+j]=
00115 log(transition_probs[i*num_symbols+j]);
00116 }
00117 }
00118
00119 SG_FREE(int_transition_probs);
00120 return true;
00121 }
00122
00123 bool CLinearHMM::train(
00124 const int32_t* indizes, int32_t num_indizes, float64_t pseudo)
00125 {
00126 SG_FREE(transition_probs);
00127 SG_FREE(log_transition_probs);
00128 int32_t* int_transition_probs=SG_MALLOC(int32_t, num_params);
00129 int32_t vec;
00130 int32_t i;
00131
00132 for (i=0; i< num_params; i++)
00133 int_transition_probs[i]=0;
00134
00135 for (vec=0; vec<num_indizes; vec++)
00136 {
00137 int32_t len;
00138 bool free_vec;
00139
00140 ASSERT(indizes[vec]>=0 &&
00141 indizes[vec]<((CStringFeatures<uint16_t>*) features)->
00142 get_num_vectors());
00143 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00144 get_feature_vector(indizes[vec], len, free_vec);
00145 ((CStringFeatures<uint16_t>*) features)->
00146 free_feature_vector(vector, indizes[vec], free_vec);
00147
00148
00149
00150 for (int32_t feat=0; feat<len ; feat++)
00151 int_transition_probs[feat*num_symbols+vector[feat]]++;
00152 }
00153
00154
00155 transition_probs=SG_MALLOC(float64_t, num_params);
00156 log_transition_probs=SG_MALLOC(float64_t, num_params);
00157
00158 for (i=0;i<sequence_length;i++)
00159 {
00160 for (int32_t j=0; j<num_symbols; j++)
00161 {
00162 float64_t sum=0;
00163 int32_t original_num_symbols=(int32_t)
00164 ((CStringFeatures<uint16_t> *) features)->
00165 get_original_num_symbols();
00166 for (int32_t k=0; k<original_num_symbols; k++)
00167 {
00168 sum+=int_transition_probs[i*num_symbols+
00169 ((CStringFeatures<uint16_t>*) features)->
00170 get_masked_symbols((uint16_t)j,(uint8_t) 254)+k];
00171 }
00172
00173 transition_probs[i*num_symbols+j]=
00174 (int_transition_probs[i*num_symbols+j]+pseudo)/
00175 (sum+((CStringFeatures<uint16_t>*) features)->
00176 get_original_num_symbols()*pseudo);
00177 log_transition_probs[i*num_symbols+j]=
00178 log(transition_probs[i*num_symbols+j]);
00179 }
00180 }
00181
00182 SG_FREE(int_transition_probs);
00183 return true;
00184 }
00185
00186 float64_t CLinearHMM::get_log_likelihood_example(uint16_t* vector, int32_t len)
00187 {
00188 float64_t result=log_transition_probs[vector[0]];
00189
00190 for (int32_t i=1; i<len; i++)
00191 result+=log_transition_probs[i*num_symbols+vector[i]];
00192
00193 return result;
00194 }
00195
00196 float64_t CLinearHMM::get_log_likelihood_example(int32_t num_example)
00197 {
00198 int32_t len;
00199 bool free_vec;
00200 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00201 get_feature_vector(num_example, len, free_vec);
00202 float64_t result=log_transition_probs[vector[0]];
00203
00204 for (int32_t i=1; i<len; i++)
00205 result+=log_transition_probs[i*num_symbols+vector[i]];
00206
00207 ((CStringFeatures<uint16_t>*) features)->
00208 free_feature_vector(vector, num_example, free_vec);
00209
00210 return result;
00211 }
00212
00213 float64_t CLinearHMM::get_likelihood_example(uint16_t* vector, int32_t len)
00214 {
00215 float64_t result=transition_probs[vector[0]];
00216
00217 for (int32_t i=1; i<len; i++)
00218 result*=transition_probs[i*num_symbols+vector[i]];
00219
00220 return result;
00221 }
00222
00223 float64_t CLinearHMM::get_log_derivative(int32_t num_param, int32_t num_example)
00224 {
00225 int32_t len;
00226 bool free_vec;
00227 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00228 get_feature_vector(num_example, len, free_vec);
00229 float64_t result=0;
00230 int32_t position=num_param/num_symbols;
00231 ASSERT(position>=0 && position<len);
00232 uint16_t sym=(uint16_t) (num_param-position*num_symbols);
00233
00234 if (vector[position]==sym && transition_probs[num_param]!=0)
00235 result=1.0/transition_probs[num_param];
00236 ((CStringFeatures<uint16_t>*) features)->
00237 free_feature_vector(vector, num_example, free_vec);
00238
00239 return result;
00240 }
00241
00242 SGVector<float64_t> CLinearHMM::get_transition_probs()
00243 {
00244 return SGVector<float64_t>(transition_probs, num_params);
00245 }
00246
00247 bool CLinearHMM::set_transition_probs(SGVector<float64_t> probs)
00248 {
00249 ASSERT(probs.vlen == num_params);
00250
00251 if (!log_transition_probs)
00252 log_transition_probs=SG_MALLOC(float64_t, num_params);
00253
00254 if (!transition_probs)
00255 transition_probs=SG_MALLOC(float64_t, num_params);
00256
00257 for (int32_t i=0; i<num_params; i++)
00258 {
00259 transition_probs[i]=probs.vector[i];
00260 log_transition_probs[i]=log(transition_probs[i]);
00261 }
00262
00263 return true;
00264 }
00265
00266 SGVector<float64_t> CLinearHMM::get_log_transition_probs()
00267 {
00268 return SGVector<float64_t>(log_transition_probs, num_params);
00269 }
00270
00271 bool CLinearHMM::set_log_transition_probs(SGVector<float64_t> probs)
00272 {
00273 ASSERT(probs.vlen == num_params);
00274
00275 if (!log_transition_probs)
00276 log_transition_probs=SG_MALLOC(float64_t, num_params);
00277
00278 if (!transition_probs)
00279 transition_probs=SG_MALLOC(float64_t, num_params);
00280
00281 for (int32_t i=0; i<num_params; i++)
00282 {
00283 log_transition_probs[i]=probs.vector[i];
00284 transition_probs[i]=exp(log_transition_probs[i]);
00285 }
00286
00287 return true;
00288 }
00289
00290 void CLinearHMM::load_serializable_post() throw (ShogunException)
00291 {
00292 CSGObject::load_serializable_post();
00293
00294 num_params = sequence_length*num_symbols;
00295 }
00296
00297 void CLinearHMM::init()
00298 {
00299 sequence_length = 0;
00300 num_symbols = 0;
00301 num_params = 0;
00302 transition_probs = NULL;
00303 log_transition_probs = NULL;
00304
00305 m_parameters->add_matrix(&transition_probs, &num_symbols, &sequence_length,
00306 "transition_probs", "Transition probabilities.");
00307 m_parameters->add_matrix(&log_transition_probs, &num_symbols, &sequence_length,
00308 "log_transition_probs", "Transition probabilities (logspace).");
00309 }