LinearHMM.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/lib/common.h>
00013 #include <shogun/io/SGIO.h>
00014 
00015 #include <shogun/base/Parameter.h>
00016 
00017 #include <shogun/distributions/LinearHMM.h>
00018 #include <shogun/features/StringFeatures.h>
00019 
00020 using namespace shogun;
00021 
00022 CLinearHMM::CLinearHMM() : CDistribution()
00023 {
00024     init();
00025 }
00026 
00027 CLinearHMM::CLinearHMM(CStringFeatures<uint16_t>* f)
00028 : CDistribution()
00029 {
00030     init();
00031 
00032     set_features(f);
00033     sequence_length = f->get_vector_length(0);
00034     num_symbols     = (int32_t) f->get_num_symbols();
00035     num_params      = sequence_length*num_symbols;
00036 }
00037 
00038 CLinearHMM::CLinearHMM(int32_t p_num_features, int32_t p_num_symbols)
00039 : CDistribution()
00040 {
00041     init();
00042 
00043     sequence_length = p_num_features;
00044     num_symbols     = p_num_symbols;
00045     num_params      = sequence_length*num_symbols;
00046 }
00047 
00048 CLinearHMM::~CLinearHMM()
00049 {
00050     SG_FREE(transition_probs);
00051     SG_FREE(log_transition_probs);
00052 }
00053 
00054 bool CLinearHMM::train(CFeatures* data)
00055 {
00056     if (data)
00057     {
00058         if (data->get_feature_class() != C_STRING ||
00059                 data->get_feature_type() != F_WORD)
00060         {
00061             SG_ERROR("Expected features of class string type word!\n");
00062         }
00063         set_features(data);
00064     }
00065     SG_FREE(transition_probs);
00066     SG_FREE(log_transition_probs);
00067     int32_t* int_transition_probs=SG_MALLOC(int32_t, num_params);
00068 
00069     int32_t vec;
00070     int32_t i;
00071 
00072     for (i=0; i< num_params; i++)
00073         int_transition_probs[i]=0;
00074 
00075     for (vec=0; vec<features->get_num_vectors(); vec++)
00076     {
00077         int32_t len;
00078         bool free_vec;
00079 
00080         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00081             get_feature_vector(vec, len, free_vec);
00082 
00083         //just count the symbols per position -> transition_probsogram
00084         for (int32_t feat=0; feat<len ; feat++)
00085             int_transition_probs[feat*num_symbols+vector[feat]]++;
00086 
00087         ((CStringFeatures<uint16_t>*) features)->
00088             free_feature_vector(vector, vec, free_vec);
00089     }
00090 
00091     //trade memory for speed
00092     transition_probs=SG_MALLOC(float64_t, num_params);
00093     log_transition_probs=SG_MALLOC(float64_t, num_params);
00094 
00095     for (i=0;i<sequence_length;i++)
00096     {
00097         for (int32_t j=0; j<num_symbols; j++)
00098         {
00099             float64_t sum=0;
00100             int32_t offs=i*num_symbols+
00101                 ((CStringFeatures<uint16_t> *) features)->
00102                     get_masked_symbols((uint16_t)j,(uint8_t) 254);
00103             int32_t original_num_symbols=(int32_t)
00104                 ((CStringFeatures<uint16_t> *) features)->
00105                     get_original_num_symbols();
00106 
00107             for (int32_t k=0; k<original_num_symbols; k++)
00108                 sum+=int_transition_probs[offs+k];
00109 
00110             transition_probs[i*num_symbols+j]=
00111                 (int_transition_probs[i*num_symbols+j]+pseudo_count)/
00112                 (sum+((CStringFeatures<uint16_t> *) features)->
00113                     get_original_num_symbols()*pseudo_count);
00114             log_transition_probs[i*num_symbols+j]=
00115                 log(transition_probs[i*num_symbols+j]);
00116         }
00117     }
00118 
00119     SG_FREE(int_transition_probs);
00120     return true;
00121 }
00122 
00123 bool CLinearHMM::train(
00124     const int32_t* indizes, int32_t num_indizes, float64_t pseudo)
00125 {
00126     SG_FREE(transition_probs);
00127     SG_FREE(log_transition_probs);
00128     int32_t* int_transition_probs=SG_MALLOC(int32_t, num_params);
00129     int32_t vec;
00130     int32_t i;
00131 
00132     for (i=0; i< num_params; i++)
00133         int_transition_probs[i]=0;
00134 
00135     for (vec=0; vec<num_indizes; vec++)
00136     {
00137         int32_t len;
00138         bool free_vec;
00139 
00140         ASSERT(indizes[vec]>=0 &&
00141             indizes[vec]<((CStringFeatures<uint16_t>*) features)->
00142                 get_num_vectors());
00143         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00144             get_feature_vector(indizes[vec], len, free_vec);
00145         ((CStringFeatures<uint16_t>*) features)->
00146             free_feature_vector(vector, indizes[vec], free_vec);
00147 
00148         //just count the symbols per position -> transition_probsogram
00149         //
00150         for (int32_t feat=0; feat<len ; feat++)
00151             int_transition_probs[feat*num_symbols+vector[feat]]++;
00152     }
00153 
00154     //trade memory for speed
00155     transition_probs=SG_MALLOC(float64_t, num_params);
00156     log_transition_probs=SG_MALLOC(float64_t, num_params);
00157 
00158     for (i=0;i<sequence_length;i++)
00159     {
00160         for (int32_t j=0; j<num_symbols; j++)
00161         {
00162             float64_t sum=0;
00163             int32_t original_num_symbols=(int32_t)
00164                 ((CStringFeatures<uint16_t> *) features)->
00165                     get_original_num_symbols();
00166             for (int32_t k=0; k<original_num_symbols; k++)
00167             {
00168                 sum+=int_transition_probs[i*num_symbols+
00169                     ((CStringFeatures<uint16_t>*) features)->
00170                         get_masked_symbols((uint16_t)j,(uint8_t) 254)+k];
00171             }
00172 
00173             transition_probs[i*num_symbols+j]=
00174                 (int_transition_probs[i*num_symbols+j]+pseudo)/
00175                 (sum+((CStringFeatures<uint16_t>*) features)->
00176                     get_original_num_symbols()*pseudo);
00177             log_transition_probs[i*num_symbols+j]=
00178                 log(transition_probs[i*num_symbols+j]);
00179         }
00180     }
00181 
00182     SG_FREE(int_transition_probs);
00183     return true;
00184 }
00185 
00186 float64_t CLinearHMM::get_log_likelihood_example(uint16_t* vector, int32_t len)
00187 {
00188     float64_t result=log_transition_probs[vector[0]];
00189 
00190     for (int32_t i=1; i<len; i++)
00191         result+=log_transition_probs[i*num_symbols+vector[i]];
00192     
00193     return result;
00194 }
00195 
00196 float64_t CLinearHMM::get_log_likelihood_example(int32_t num_example)
00197 {
00198     int32_t len;
00199     bool free_vec;
00200     uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00201         get_feature_vector(num_example, len, free_vec);
00202     float64_t result=log_transition_probs[vector[0]];
00203 
00204     for (int32_t i=1; i<len; i++)
00205         result+=log_transition_probs[i*num_symbols+vector[i]];
00206 
00207     ((CStringFeatures<uint16_t>*) features)->
00208         free_feature_vector(vector, num_example, free_vec);
00209 
00210     return result;
00211 }
00212 
00213 float64_t CLinearHMM::get_likelihood_example(uint16_t* vector, int32_t len)
00214 {
00215     float64_t result=transition_probs[vector[0]];
00216 
00217     for (int32_t i=1; i<len; i++)
00218         result*=transition_probs[i*num_symbols+vector[i]];
00219     
00220     return result;
00221 }
00222 
00223 float64_t CLinearHMM::get_log_derivative(int32_t num_param, int32_t num_example)
00224 {
00225     int32_t len;
00226     bool free_vec;
00227     uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00228         get_feature_vector(num_example, len, free_vec);
00229     float64_t result=0;
00230     int32_t position=num_param/num_symbols;
00231     ASSERT(position>=0 && position<len);
00232     uint16_t sym=(uint16_t) (num_param-position*num_symbols);
00233 
00234     if (vector[position]==sym && transition_probs[num_param]!=0)
00235         result=1.0/transition_probs[num_param];
00236     ((CStringFeatures<uint16_t>*) features)->
00237         free_feature_vector(vector, num_example, free_vec);
00238 
00239     return result;
00240 }
00241 
00242 SGVector<float64_t> CLinearHMM::get_transition_probs()
00243 {
00244     return SGVector<float64_t>(transition_probs, num_params);   
00245 }
00246 
00247 bool CLinearHMM::set_transition_probs(SGVector<float64_t> probs)
00248 {
00249     ASSERT(probs.vlen == num_params);   
00250 
00251     if (!log_transition_probs)
00252         log_transition_probs=SG_MALLOC(float64_t, num_params);
00253 
00254     if (!transition_probs)
00255         transition_probs=SG_MALLOC(float64_t, num_params);
00256 
00257     for (int32_t i=0; i<num_params; i++)
00258     {
00259         transition_probs[i]=probs.vector[i];
00260         log_transition_probs[i]=log(transition_probs[i]);
00261     }
00262 
00263     return true;
00264 }
00265 
00266 SGVector<float64_t> CLinearHMM::get_log_transition_probs()
00267 {
00268     return SGVector<float64_t>(log_transition_probs, num_params);
00269 }
00270 
00271 bool CLinearHMM::set_log_transition_probs(SGVector<float64_t> probs)
00272 {
00273     ASSERT(probs.vlen == num_params);
00274 
00275     if (!log_transition_probs)
00276         log_transition_probs=SG_MALLOC(float64_t, num_params);
00277 
00278     if (!transition_probs)
00279         transition_probs=SG_MALLOC(float64_t, num_params);
00280 
00281     for (int32_t i=0; i<num_params; i++)
00282     {
00283         log_transition_probs[i]=probs.vector[i];
00284         transition_probs[i]=exp(log_transition_probs[i]);
00285     }
00286 
00287     return true;
00288 }
00289 
00290 void CLinearHMM::load_serializable_post() throw (ShogunException)
00291 {
00292     CSGObject::load_serializable_post();
00293 
00294     num_params = sequence_length*num_symbols;
00295 }
00296 
00297 void CLinearHMM::init()
00298 {
00299     sequence_length = 0;
00300     num_symbols = 0;
00301     num_params = 0;
00302     transition_probs = NULL;
00303     log_transition_probs = NULL;
00304 
00305     m_parameters->add_matrix(&transition_probs, &num_symbols, &sequence_length,
00306             "transition_probs", "Transition probabilities.");
00307     m_parameters->add_matrix(&log_transition_probs, &num_symbols, &sequence_length,
00308             "log_transition_probs", "Transition probabilities (logspace).");
00309 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation