Histogram.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/distributions/Histogram.h>
00013 #include <shogun/lib/common.h>
00014 #include <shogun/features/StringFeatures.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/mathematics/Math.h>
00017 
00018 using namespace shogun;
00019 
00020 CHistogram::CHistogram()
00021 : CDistribution()
00022 {
00023     hist=SG_CALLOC(float64_t, 1<<16);
00024 }
00025 
00026 CHistogram::CHistogram(CStringFeatures<uint16_t> *f)
00027 : CDistribution()
00028 {
00029     hist=SG_CALLOC(float64_t, 1<<16);
00030     features=f;
00031 }
00032 
00033 CHistogram::~CHistogram()
00034 {
00035     SG_FREE(hist);
00036 }
00037 
00038 bool CHistogram::train(CFeatures* data)
00039 {
00040     int32_t vec;
00041     int32_t feat;
00042     int32_t i;
00043 
00044     if (data)
00045     {
00046         if (data->get_feature_class() != C_STRING ||
00047                 data->get_feature_type() != F_WORD)
00048         {
00049             SG_ERROR("Expected features of class string type word\n");
00050         }
00051         set_features(data);
00052     }
00053 
00054     ASSERT(features);
00055     ASSERT(features->get_feature_class()==C_STRING);
00056     ASSERT(features->get_feature_type()==F_WORD);
00057 
00058     for (i=0; i< (int32_t) (1<<16); i++)
00059         hist[i]=0;
00060 
00061     for (vec=0; vec<features->get_num_vectors(); vec++)
00062     {
00063         int32_t len;
00064         bool free_vec;
00065 
00066         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00067             get_feature_vector(vec, len, free_vec);
00068 
00069         for (feat=0; feat<len ; feat++)
00070             hist[vector[feat]]++;
00071 
00072         ((CStringFeatures<uint16_t>*) features)->
00073             free_feature_vector(vector, vec, free_vec);
00074     }
00075 
00076     for (i=0; i< (int32_t) (1<<16); i++)
00077         hist[i]=log(hist[i]);
00078 
00079     return true;
00080 }
00081 
00082 float64_t CHistogram::get_log_likelihood_example(int32_t num_example)
00083 {
00084     ASSERT(features);
00085     ASSERT(features->get_feature_class()==C_STRING);
00086     ASSERT(features->get_feature_type()==F_WORD);
00087 
00088     int32_t len;
00089     bool free_vec;
00090     float64_t loglik=0;
00091 
00092     uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00093         get_feature_vector(num_example, len, free_vec);
00094 
00095     for (int32_t i=0; i<len; i++)
00096         loglik+=hist[vector[i]];
00097 
00098     ((CStringFeatures<uint16_t>*) features)->
00099         free_feature_vector(vector, num_example, free_vec);
00100 
00101     return loglik;
00102 }
00103 
00104 float64_t CHistogram::get_log_derivative(int32_t num_param, int32_t num_example)
00105 {
00106     if (hist[num_param] < CMath::ALMOST_NEG_INFTY)
00107         return -CMath::INFTY;
00108     else
00109     {
00110         ASSERT(features);
00111         ASSERT(features->get_feature_class()==C_STRING);
00112         ASSERT(features->get_feature_type()==F_WORD);
00113 
00114         int32_t len;
00115         bool free_vec;
00116         float64_t deriv=0;
00117 
00118         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00119             get_feature_vector(num_example, len, free_vec);
00120 
00121         int32_t num_occurences=0;
00122 
00123         for (int32_t i=0; i<len; i++)
00124         {
00125             deriv+=hist[vector[i]];
00126 
00127             if (vector[i]==num_param)
00128                 num_occurences++;
00129         }
00130 
00131         ((CStringFeatures<uint16_t>*) features)->
00132             free_feature_vector(vector, num_example, free_vec);
00133 
00134         if (num_occurences>0)
00135             deriv+=CMath::log((float64_t) num_occurences)-hist[num_param];
00136         else
00137             deriv=-CMath::INFTY;
00138 
00139         return deriv;
00140     }
00141 }
00142 
00143 float64_t CHistogram::get_log_model_parameter(int32_t num_param)
00144 {
00145     return hist[num_param];
00146 }
00147 
00148 bool CHistogram::set_histogram(const SGVector<float64_t> histogram)
00149 {
00150     ASSERT(histogram.vlen==get_num_model_parameters());
00151 
00152     SG_FREE(hist);
00153     hist=SG_MALLOC(float64_t, histogram.vlen);
00154     for (int32_t i=0; i<histogram.vlen; i++)
00155         hist[i]=histogram.vector[i];
00156 
00157     return true;
00158 }
00159 
00160 SGVector<float64_t> CHistogram::get_histogram()
00161 {
00162     return SGVector<float64_t>(hist,get_num_model_parameters(),false);
00163 }
00164 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation