Histogram.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "distributions/Histogram.h"
00013 #include "lib/common.h"
00014 #include "features/StringFeatures.h"
00015 #include "lib/io.h"
00016 #include "lib/Mathematics.h"
00017 
00018 using namespace shogun;
00019 
00020 CHistogram::CHistogram()
00021 : CDistribution()
00022 {
00023     hist=new float64_t[1<<16];
00024 }
00025 
00026 CHistogram::CHistogram(CStringFeatures<uint16_t> *f)
00027 : CDistribution()
00028 {
00029     hist=new float64_t[1<<16];
00030     features=f;
00031 }
00032 
00033 CHistogram::~CHistogram()
00034 {
00035     delete[] hist;
00036 }
00037 
00038 bool CHistogram::train(CFeatures* data)
00039 {
00040     int32_t vec;
00041     int32_t feat;
00042     int32_t i;
00043 
00044     if (data)
00045     {
00046         if (data->get_feature_class() != C_STRING ||
00047                 data->get_feature_type() != F_WORD)
00048         {
00049             SG_ERROR("Expected features of class string type word\n");
00050         }
00051         set_features(data);
00052     }
00053 
00054     ASSERT(features);
00055     ASSERT(features->get_feature_class()==C_STRING);
00056     ASSERT(features->get_feature_type()==F_WORD);
00057 
00058     for (i=0; i< (int32_t) (1<<16); i++)
00059         hist[i]=0;
00060 
00061     for (vec=0; vec<features->get_num_vectors(); vec++)
00062     {
00063         int32_t len;
00064         bool free_vec;
00065 
00066         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00067             get_feature_vector(vec, len, free_vec);
00068 
00069         for (feat=0; feat<len ; feat++)
00070             hist[vector[feat]]++;
00071 
00072         ((CStringFeatures<uint16_t>*) features)->
00073             free_feature_vector(vector, vec, free_vec);
00074     }
00075 
00076     for (i=0; i< (int32_t) (1<<16); i++)
00077         hist[i]=log(hist[i]);
00078 
00079     return true;
00080 }
00081 
00082 float64_t CHistogram::get_log_likelihood_example(int32_t num_example)
00083 {
00084     ASSERT(features);
00085     ASSERT(features->get_feature_class()==C_STRING);
00086     ASSERT(features->get_feature_type()==F_WORD);
00087 
00088     int32_t len;
00089     bool free_vec;
00090     float64_t loglik=0;
00091 
00092     uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00093         get_feature_vector(num_example, len, free_vec);
00094 
00095     for (int32_t i=0; i<len; i++)
00096         loglik+=hist[vector[i]];
00097 
00098     ((CStringFeatures<uint16_t>*) features)->
00099         free_feature_vector(vector, num_example, free_vec);
00100 
00101     return loglik;
00102 }
00103 
00104 float64_t CHistogram::get_log_derivative(int32_t num_param, int32_t num_example)
00105 {
00106     if (hist[num_param] < CMath::ALMOST_NEG_INFTY)
00107         return -CMath::INFTY;
00108     else
00109     {
00110         ASSERT(features);
00111         ASSERT(features->get_feature_class()==C_STRING);
00112         ASSERT(features->get_feature_type()==F_WORD);
00113 
00114         int32_t len;
00115         bool free_vec;
00116         float64_t deriv=0;
00117 
00118         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00119             get_feature_vector(num_example, len, free_vec);
00120 
00121         int32_t num_occurences=0;
00122 
00123         for (int32_t i=0; i<len; i++)
00124         {
00125             deriv+=hist[vector[i]];
00126 
00127             if (vector[i]==num_param)
00128                 num_occurences++;
00129         }
00130 
00131         ((CStringFeatures<uint16_t>*) features)->
00132             free_feature_vector(vector, num_example, free_vec);
00133 
00134         if (num_occurences>0)
00135             deriv+=CMath::log((float64_t) num_occurences)-hist[num_param];
00136         else
00137             deriv=-CMath::INFTY;
00138 
00139         return deriv;
00140     }
00141 }
00142 
00143 float64_t CHistogram::get_log_model_parameter(int32_t num_param)
00144 {
00145     return hist[num_param];
00146 }
00147 
00148 bool CHistogram::set_histogram(float64_t* src, int32_t num)
00149 {
00150     ASSERT(num==get_num_model_parameters());
00151 
00152     delete[] hist;
00153     hist=new float64_t[num];
00154     for (int32_t i=0; i<num; i++) {
00155         hist[i]=src[i];
00156     }
00157 
00158     return true;
00159 }
00160 
00161 void CHistogram::get_histogram(float64_t** dst, int32_t* num)
00162 {
00163     *num=get_num_model_parameters();
00164     size_t sz=sizeof(*hist)*(*num);
00165     *dst=(float64_t*) malloc(sz);
00166     ASSERT(dst);
00167 
00168     memcpy(*dst, hist, sz);
00169 }
00170 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation