Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/distributions/Histogram.h>
00013 #include <shogun/lib/common.h>
00014 #include <shogun/features/StringFeatures.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/mathematics/Math.h>
00017
00018 using namespace shogun;
00019
00020 CHistogram::CHistogram()
00021 : CDistribution()
00022 {
00023 hist=SG_CALLOC(float64_t, 1<<16);
00024 }
00025
00026 CHistogram::CHistogram(CStringFeatures<uint16_t> *f)
00027 : CDistribution()
00028 {
00029 hist=SG_CALLOC(float64_t, 1<<16);
00030 features=f;
00031 }
00032
00033 CHistogram::~CHistogram()
00034 {
00035 SG_FREE(hist);
00036 }
00037
00038 bool CHistogram::train(CFeatures* data)
00039 {
00040 int32_t vec;
00041 int32_t feat;
00042 int32_t i;
00043
00044 if (data)
00045 {
00046 if (data->get_feature_class() != C_STRING ||
00047 data->get_feature_type() != F_WORD)
00048 {
00049 SG_ERROR("Expected features of class string type word\n");
00050 }
00051 set_features(data);
00052 }
00053
00054 ASSERT(features);
00055 ASSERT(features->get_feature_class()==C_STRING);
00056 ASSERT(features->get_feature_type()==F_WORD);
00057
00058 for (i=0; i< (int32_t) (1<<16); i++)
00059 hist[i]=0;
00060
00061 for (vec=0; vec<features->get_num_vectors(); vec++)
00062 {
00063 int32_t len;
00064 bool free_vec;
00065
00066 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00067 get_feature_vector(vec, len, free_vec);
00068
00069 for (feat=0; feat<len ; feat++)
00070 hist[vector[feat]]++;
00071
00072 ((CStringFeatures<uint16_t>*) features)->
00073 free_feature_vector(vector, vec, free_vec);
00074 }
00075
00076 for (i=0; i< (int32_t) (1<<16); i++)
00077 hist[i]=log(hist[i]);
00078
00079 return true;
00080 }
00081
00082 float64_t CHistogram::get_log_likelihood_example(int32_t num_example)
00083 {
00084 ASSERT(features);
00085 ASSERT(features->get_feature_class()==C_STRING);
00086 ASSERT(features->get_feature_type()==F_WORD);
00087
00088 int32_t len;
00089 bool free_vec;
00090 float64_t loglik=0;
00091
00092 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00093 get_feature_vector(num_example, len, free_vec);
00094
00095 for (int32_t i=0; i<len; i++)
00096 loglik+=hist[vector[i]];
00097
00098 ((CStringFeatures<uint16_t>*) features)->
00099 free_feature_vector(vector, num_example, free_vec);
00100
00101 return loglik;
00102 }
00103
00104 float64_t CHistogram::get_log_derivative(int32_t num_param, int32_t num_example)
00105 {
00106 if (hist[num_param] < CMath::ALMOST_NEG_INFTY)
00107 return -CMath::INFTY;
00108 else
00109 {
00110 ASSERT(features);
00111 ASSERT(features->get_feature_class()==C_STRING);
00112 ASSERT(features->get_feature_type()==F_WORD);
00113
00114 int32_t len;
00115 bool free_vec;
00116 float64_t deriv=0;
00117
00118 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00119 get_feature_vector(num_example, len, free_vec);
00120
00121 int32_t num_occurences=0;
00122
00123 for (int32_t i=0; i<len; i++)
00124 {
00125 deriv+=hist[vector[i]];
00126
00127 if (vector[i]==num_param)
00128 num_occurences++;
00129 }
00130
00131 ((CStringFeatures<uint16_t>*) features)->
00132 free_feature_vector(vector, num_example, free_vec);
00133
00134 if (num_occurences>0)
00135 deriv+=CMath::log((float64_t) num_occurences)-hist[num_param];
00136 else
00137 deriv=-CMath::INFTY;
00138
00139 return deriv;
00140 }
00141 }
00142
00143 float64_t CHistogram::get_log_model_parameter(int32_t num_param)
00144 {
00145 return hist[num_param];
00146 }
00147
00148 bool CHistogram::set_histogram(SGVector<float64_t> histogram)
00149 {
00150 ASSERT(histogram.vlen==get_num_model_parameters());
00151
00152 SG_FREE(hist);
00153 hist=SG_MALLOC(float64_t, histogram.vlen);
00154 for (int32_t i=0; i<histogram.vlen; i++)
00155 hist[i]=histogram.vector[i];
00156
00157 return true;
00158 }
00159
00160 SGVector<float64_t> CHistogram::get_histogram()
00161 {
00162 return SGVector<float64_t>(hist,get_num_model_parameters(),false);
00163 }
00164