Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014
00015 #include "base/Parameter.h"
00016
00017 #include "distributions/LinearHMM.h"
00018 #include "features/StringFeatures.h"
00019
00020 using namespace shogun;
00021
00022 CLinearHMM::CLinearHMM() : CDistribution()
00023 {
00024 init();
00025 }
00026
00027 CLinearHMM::CLinearHMM(CStringFeatures<uint16_t>* f)
00028 : CDistribution()
00029 {
00030 init();
00031
00032 set_features(f);
00033 sequence_length = f->get_vector_length(0);
00034 num_symbols = (int32_t) f->get_num_symbols();
00035 num_params = sequence_length*num_symbols;
00036 }
00037
00038 CLinearHMM::CLinearHMM(int32_t p_num_features, int32_t p_num_symbols)
00039 : CDistribution()
00040 {
00041 init();
00042
00043 sequence_length = p_num_features;
00044 num_symbols = p_num_symbols;
00045 num_params = sequence_length*num_symbols;
00046 }
00047
00048 CLinearHMM::~CLinearHMM()
00049 {
00050 delete[] transition_probs;
00051 delete[] log_transition_probs;
00052 }
00053
00054 bool CLinearHMM::train(CFeatures* data)
00055 {
00056 if (data)
00057 {
00058 if (data->get_feature_class() != C_STRING ||
00059 data->get_feature_type() != F_WORD)
00060 {
00061 SG_ERROR("Expected features of class string type word!\n");
00062 }
00063 set_features(data);
00064 }
00065 delete[] transition_probs;
00066 delete[] log_transition_probs;
00067 int32_t* int_transition_probs=new int32_t[num_params];
00068
00069 int32_t vec;
00070 int32_t i;
00071
00072 for (i=0; i< num_params; i++)
00073 int_transition_probs[i]=0;
00074
00075 for (vec=0; vec<features->get_num_vectors(); vec++)
00076 {
00077 int32_t len;
00078 bool free_vec;
00079
00080 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00081 get_feature_vector(vec, len, free_vec);
00082
00083
00084 for (int32_t feat=0; feat<len ; feat++)
00085 int_transition_probs[feat*num_symbols+vector[feat]]++;
00086
00087 ((CStringFeatures<uint16_t>*) features)->
00088 free_feature_vector(vector, vec, free_vec);
00089 }
00090
00091
00092 transition_probs=new float64_t[num_params];
00093 log_transition_probs=new float64_t[num_params];
00094
00095 for (i=0;i<sequence_length;i++)
00096 {
00097 for (int32_t j=0; j<num_symbols; j++)
00098 {
00099 float64_t sum=0;
00100 int32_t offs=i*num_symbols+
00101 ((CStringFeatures<uint16_t> *) features)->
00102 get_masked_symbols((uint16_t)j,(uint8_t) 254);
00103 int32_t original_num_symbols=(int32_t)
00104 ((CStringFeatures<uint16_t> *) features)->
00105 get_original_num_symbols();
00106
00107 for (int32_t k=0; k<original_num_symbols; k++)
00108 sum+=int_transition_probs[offs+k];
00109
00110 transition_probs[i*num_symbols+j]=
00111 (int_transition_probs[i*num_symbols+j]+pseudo_count)/
00112 (sum+((CStringFeatures<uint16_t> *) features)->
00113 get_original_num_symbols()*pseudo_count);
00114 log_transition_probs[i*num_symbols+j]=
00115 log(transition_probs[i*num_symbols+j]);
00116 }
00117 }
00118
00119 delete[] int_transition_probs;
00120 return true;
00121 }
00122
00123 bool CLinearHMM::train(
00124 const int32_t* indizes, int32_t num_indizes, float64_t pseudo)
00125 {
00126 delete[] transition_probs;
00127 delete[] log_transition_probs;
00128 int32_t* int_transition_probs=new int32_t[num_params];
00129 int32_t vec;
00130 int32_t i;
00131
00132 for (i=0; i< num_params; i++)
00133 int_transition_probs[i]=0;
00134
00135 for (vec=0; vec<num_indizes; vec++)
00136 {
00137 int32_t len;
00138 bool free_vec;
00139
00140 ASSERT(indizes[vec]>=0 &&
00141 indizes[vec]<((CStringFeatures<uint16_t>*) features)->
00142 get_num_vectors());
00143 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00144 get_feature_vector(indizes[vec], len, free_vec);
00145 ((CStringFeatures<uint16_t>*) features)->
00146 free_feature_vector(vector, indizes[vec], free_vec);
00147
00148
00149
00150 for (int32_t feat=0; feat<len ; feat++)
00151 int_transition_probs[feat*num_symbols+vector[feat]]++;
00152 }
00153
00154
00155 transition_probs=new float64_t[num_params];
00156 log_transition_probs=new float64_t[num_params];
00157
00158 for (i=0;i<sequence_length;i++)
00159 {
00160 for (int32_t j=0; j<num_symbols; j++)
00161 {
00162 float64_t sum=0;
00163 int32_t original_num_symbols=(int32_t)
00164 ((CStringFeatures<uint16_t> *) features)->
00165 get_original_num_symbols();
00166 for (int32_t k=0; k<original_num_symbols; k++)
00167 {
00168 sum+=int_transition_probs[i*num_symbols+
00169 ((CStringFeatures<uint16_t>*) features)->
00170 get_masked_symbols((uint16_t)j,(uint8_t) 254)+k];
00171 }
00172
00173 transition_probs[i*num_symbols+j]=
00174 (int_transition_probs[i*num_symbols+j]+pseudo)/
00175 (sum+((CStringFeatures<uint16_t>*) features)->
00176 get_original_num_symbols()*pseudo);
00177 log_transition_probs[i*num_symbols+j]=
00178 log(transition_probs[i*num_symbols+j]);
00179 }
00180 }
00181
00182 delete[] int_transition_probs;
00183 return true;
00184 }
00185
00186 float64_t CLinearHMM::get_log_likelihood_example(uint16_t* vector, int32_t len)
00187 {
00188 float64_t result=log_transition_probs[vector[0]];
00189
00190 for (int32_t i=1; i<len; i++)
00191 result+=log_transition_probs[i*num_symbols+vector[i]];
00192
00193 return result;
00194 }
00195
00196 float64_t CLinearHMM::get_log_likelihood_example(int32_t num_example)
00197 {
00198 int32_t len;
00199 bool free_vec;
00200 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00201 get_feature_vector(num_example, len, free_vec);
00202 float64_t result=log_transition_probs[vector[0]];
00203
00204 for (int32_t i=1; i<len; i++)
00205 result+=log_transition_probs[i*num_symbols+vector[i]];
00206
00207 ((CStringFeatures<uint16_t>*) features)->
00208 free_feature_vector(vector, num_example, free_vec);
00209
00210 return result;
00211 }
00212
00213 float64_t CLinearHMM::get_likelihood_example(uint16_t* vector, int32_t len)
00214 {
00215 float64_t result=transition_probs[vector[0]];
00216
00217 for (int32_t i=1; i<len; i++)
00218 result*=transition_probs[i*num_symbols+vector[i]];
00219
00220 return result;
00221 }
00222
00223 float64_t CLinearHMM::get_log_derivative(int32_t num_param, int32_t num_example)
00224 {
00225 int32_t len;
00226 bool free_vec;
00227 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00228 get_feature_vector(num_example, len, free_vec);
00229 float64_t result=0;
00230 int32_t position=num_param/num_symbols;
00231 ASSERT(position>=0 && position<len);
00232 uint16_t sym=(uint16_t) (num_param-position*num_symbols);
00233
00234 if (vector[position]==sym && transition_probs[num_param]!=0)
00235 result=1.0/transition_probs[num_param];
00236 ((CStringFeatures<uint16_t>*) features)->
00237 free_feature_vector(vector, num_example, free_vec);
00238
00239 return result;
00240 }
00241
00242 void CLinearHMM::get_transition_probs(float64_t** dst, int32_t* num)
00243 {
00244 *num=num_params;
00245 size_t sz=sizeof(*transition_probs)*(*num);
00246 *dst=(float64_t*) malloc(sz);
00247 ASSERT(dst);
00248
00249 memcpy(*dst, transition_probs, sz);
00250 }
00251
00252 bool CLinearHMM::set_transition_probs(const float64_t* src, int32_t num)
00253 {
00254 if (num!=-1)
00255 ASSERT(num==num_params);
00256
00257 if (!log_transition_probs)
00258 log_transition_probs=new float64_t[num_params];
00259
00260 if (!transition_probs)
00261 transition_probs=new float64_t[num_params];
00262
00263 for (int32_t i=0; i<num_params; i++)
00264 {
00265 transition_probs[i]=src[i];
00266 log_transition_probs[i]=log(transition_probs[i]);
00267 }
00268
00269 return true;
00270 }
00271
00272 void CLinearHMM::get_log_transition_probs(float64_t** dst, int32_t* num)
00273 {
00274 *num=num_params;
00275 size_t sz=sizeof(*log_transition_probs)*(*num);
00276 *dst=(float64_t*) malloc(sz);
00277 ASSERT(dst);
00278
00279 memcpy(*dst, log_transition_probs, sz);
00280 }
00281
00282 bool CLinearHMM::set_log_transition_probs(const float64_t* src, int32_t num)
00283 {
00284 if (num!=-1)
00285 ASSERT(num==num_params);
00286
00287 if (!log_transition_probs)
00288 log_transition_probs=new float64_t[num_params];
00289
00290 if (!transition_probs)
00291 transition_probs=new float64_t[num_params];
00292
00293 for (int32_t i=0; i< num_params; i++)
00294 {
00295 log_transition_probs[i]=src[i];
00296 transition_probs[i]=exp(log_transition_probs[i]);
00297 }
00298
00299 return true;
00300 }
00301
00302 void CLinearHMM::load_serializable_post() throw (ShogunException)
00303 {
00304 CSGObject::load_serializable_post();
00305
00306 num_params = sequence_length*num_symbols;
00307 }
00308
00309 void CLinearHMM::init()
00310 {
00311 sequence_length = 0;
00312 num_symbols = 0;
00313 num_params = 0;
00314 transition_probs = NULL;
00315 log_transition_probs = NULL;
00316
00317 m_parameters->add_matrix(&transition_probs, &num_symbols, &sequence_length,
00318 "transition_probs", "Transition probabilities.");
00319 m_parameters->add_matrix(&log_transition_probs, &num_symbols, &sequence_length,
00320 "log_transition_probs", "Transition probabilities (logspace).");
00321 }