GaussianNaiveBayes.cpp

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2011 Sergey Lisitsyn
 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
 */

#include <shogun/classifier/GaussianNaiveBayes.h>
#include <shogun/machine/Machine.h>
#include <shogun/features/Features.h>
#include <shogun/features/Labels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/Signal.h>

using namespace shogun;

CGaussianNaiveBayes::CGaussianNaiveBayes() :
CMachine(), m_features(NULL), m_min_label(0),
m_num_classes(0), m_dim(0), m_means(),
m_variances(), m_label_prob(), m_rates()
{
}

CGaussianNaiveBayes::CGaussianNaiveBayes(CFeatures* train_examples, CLabels* train_labels) :
CMachine(), m_features(NULL), m_min_label(0),
m_num_classes(0), m_dim(0), m_means(),
m_variances(), m_label_prob(), m_rates()
{
    ASSERT(train_examples->get_num_vectors() == train_labels->get_num_labels());
    set_labels(train_labels);
    if (!train_examples->has_property(FP_DOT))
        SG_ERROR("Specified features are not of type CDotFeatures\n");
    set_features((CDotFeatures*)train_examples);
}

CGaussianNaiveBayes::~CGaussianNaiveBayes()
{
    SG_UNREF(m_features);

    m_means.destroy_vector();
    m_rates.destroy_vector();
    m_variances.destroy_vector();
    m_label_prob.destroy_vector();
}

bool CGaussianNaiveBayes::train(CFeatures* data)
{
    // init features with data if necessary and ensure they are dot features
    if (data)
    {
        if (!data->has_property(FP_DOT))
            SG_ERROR("Specified features are not of type CDotFeatures\n");
        set_features((CDotFeatures*) data);
    }

    // fetch integer labels and check that their count matches the number of vectors
    ASSERT(labels);
    SGVector<int32_t> train_labels = labels->get_int_labels();
    ASSERT(m_features->get_num_vectors()==train_labels.vlen);

    // init min_label, max_label and loop variables
    int32_t min_label = train_labels.vector[0];
    int32_t max_label = train_labels.vector[0];
    int i,j;

    // find minimal and maximal label
    for (i=1; i<train_labels.vlen; i++)
    {
        min_label = CMath::min(min_label, train_labels.vector[i]);
        max_label = CMath::max(max_label, train_labels.vector[i]);
    }

    // subtract the minimal label from all labels so classes are indexed from 0
    for (i=0; i<train_labels.vlen; i++)
        train_labels.vector[i] -= min_label;

    // get number of classes, minimal label and dimensionality
    m_num_classes = max_label-min_label+1;
    m_min_label = min_label;
    m_dim = m_features->get_dim_feature_space();

    // allocate memory for distributions' parameters and a priori probabilities
    m_means.vector = SG_MALLOC(float64_t, m_num_classes*m_dim);
    m_means.vlen = m_num_classes*m_dim;

    m_variances.vector = SG_MALLOC(float64_t, m_num_classes*m_dim);
    m_variances.vlen = m_num_classes*m_dim;

    m_label_prob.vector = SG_MALLOC(float64_t, m_num_classes);
    m_label_prob.vlen = m_num_classes;

    // allocate memory for label rates
    m_rates.vector = SG_MALLOC(float64_t, m_num_classes);
    m_rates.vlen = m_num_classes;

    // ensure that memory is allocated
    ASSERT(m_means.vector);
    ASSERT(m_variances.vector);
    ASSERT(m_rates.vector);
    ASSERT(m_label_prob.vector);

    // zero-initialize the arrays before use
    for (i=0;i<m_num_classes*m_dim;i++)
    {
        m_means.vector[i] = 0.0;
        m_variances.vector[i] = 0.0;
    }
    for (i=0;i<m_num_classes;i++)
    {
        m_label_prob.vector[i] = 0.0;
        m_rates.vector[i] = 0.0;
    }

    SGMatrix<float64_t> feature_matrix = m_features->get_computed_dot_feature_matrix();

    // accumulate per-class feature sums and per-class example counts
    for (i=0; i<train_labels.vlen; i++)
    {
        for (j=0; j<m_dim; j++)
            m_means.vector[m_dim*train_labels.vector[i]+j] += feature_matrix.matrix[i*m_dim+j];

        m_label_prob.vector[train_labels.vector[i]] += 1.0;
    }
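    // at this point m_label_prob.vector[c] holds the raw count N_c of training
    // examples with (shifted) label c; it is reused below to normalize the
    // per-class sums and, at the end, to form the priors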

    // turn the per-class sums into means
    for (i=0; i<m_num_classes; i++)
    {
        for (j=0; j<m_dim; j++)
            m_means.vector[m_dim*i+j] /= m_label_prob.vector[i];
    }
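    // the loop above yields the per-class means
    // mu_{c,j} = (1/N_c) * sum_{i : y_i = c} x_{i,j}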

    // accumulate squared residuals now that the means are available
    for (i=0; i<train_labels.vlen; i++)
    {
        for (j=0; j<m_dim; j++)
            m_variances.vector[m_dim*train_labels.vector[i]+j] +=
                    CMath::sq(feature_matrix.matrix[i*m_dim+j]-m_means.vector[m_dim*train_labels.vector[i]+j]);
    }

    // turn the squared residuals into per-class variances
    for (i=0; i<m_num_classes; i++)
    {
        for (j=0; j<m_dim; j++)
            m_variances.vector[m_dim*i+j] /= m_label_prob.vector[i] > 1 ? m_label_prob.vector[i]-1 : 1;
    }
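    // the loop above yields the unbiased sample variances
    // sigma^2_{c,j} = (1/(N_c-1)) * sum_{i : y_i = c} (x_{i,j} - mu_{c,j})^2,
    // falling back to a divisor of 1 when a class holds a single example, so
    // that a lone example yields a zero (rather than undefined) variance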

    // turn class counts into a priori probabilities, i.e. relative
    // frequencies over all training examples
    for (i=0; i<m_num_classes; i++)
    {
        m_label_prob.vector[i] /= train_labels.vlen;
    }

    // release the temporary label copy and the computed feature matrix
    train_labels.free_vector();
    feature_matrix.destroy_matrix();

    return true;
}

CLabels* CGaussianNaiveBayes::apply()
{
    // init number of vectors
    int32_t n = m_features->get_num_vectors();

    // init result labels
    CLabels* result = new CLabels(n);

    // classify each example of data
    for (int i=0; i<n; i++)
        result->set_label(i,apply(i));

    return result;
}

CLabels* CGaussianNaiveBayes::apply(CFeatures* data)
{
    // check data correctness
    if (!data)
        SG_ERROR("No features specified\n");
    if (!data->has_property(FP_DOT))
        SG_ERROR("Specified features are not of type CDotFeatures\n");

    // set features to classify
    set_features((CDotFeatures*)data);

    // classify using features
    return apply();
}

float64_t CGaussianNaiveBayes::apply(int32_t idx)
{
    // get the [idx]-th feature vector
    SGVector<float64_t> feature_vector = m_features->get_computed_dot_feature_vector(idx);

    // init loop variables
    int i,k;

    // rate all labels
    for (i=0; i<m_num_classes; i++)
    {
        // a label that never occurred in training gets rate 0.0
        if (m_label_prob.vector[i]==0.0)
        {
            m_rates.vector[i] = 0.0;
            continue;
        }

        // start from the a priori probability of the label ...
        m_rates.vector[i] = m_label_prob.vector[i];

        // ... and multiply in the conditional Gaussian density of each feature
        for (k=0; k<m_dim; k++)
            m_rates.vector[i] *= normal_exp(feature_vector.vector[k],i,k)/CMath::sqrt(m_variances.vector[i*m_dim+k]);
    }
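    // note: normal_exp (defined in the header) presumably evaluates the kernel
    // exp(-(x - mu_{c,k})^2 / (2*sigma^2_{c,k})); together with the division by
    // sqrt(sigma^2_{c,k}) this is the normal density up to the constant factor
    // 1/sqrt(2*pi), which is safe to drop since it scales every class rate
    // identically and cannot change the argmax below. for high-dimensional
    // data the product of densities can underflow; summing log-densities
    // would be the numerically robust alternative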

    // find the label with the maximum rate
    int32_t max_label_idx = 0;

    for (i=0; i<m_num_classes; i++)
    {
        if (m_rates.vector[i]>m_rates.vector[max_label_idx])
            max_label_idx = i;
    }

    // release the computed copy of the feature vector
    feature_vector.destroy_vector();

    return max_label_idx+m_min_label;
}
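
For orientation, here is a minimal usage sketch of this class. It assumes the Shogun 1.x API that the file above is written against; the toy data, the CSimpleFeatures construction, and the init/exit calls are illustrative assumptions rather than anything prescribed by this file:

#include <shogun/base/init.h>
#include <shogun/features/SimpleFeatures.h>
#include <shogun/features/Labels.h>
#include <shogun/classifier/GaussianNaiveBayes.h>

using namespace shogun;

int main()
{
    init_shogun();

    // four 2-dimensional toy examples, stored column-major (one column per
    // example): two near the origin (class 0), two near (5,5) (class 1)
    float64_t feats[] = {0.0,0.1,  0.2,0.0,  5.0,5.1,  5.2,4.9};
    float64_t labs[] = {0, 0, 1, 1};

    CSimpleFeatures<float64_t>* features =
        new CSimpleFeatures<float64_t>(SGMatrix<float64_t>(feats, 2, 4));
    CLabels* labels = new CLabels(SGVector<float64_t>(labs, 4));

    CGaussianNaiveBayes* gnb = new CGaussianNaiveBayes(features, labels);
    gnb->train();

    // classify the training examples themselves; this should recover 0,0,1,1
    CLabels* output = gnb->apply(features);
    for (int32_t i=0; i<output->get_num_labels(); i++)
        SG_SPRINT("example %d -> class %.0f\n", i, output->get_label(i));

    SG_UNREF(output);
    SG_UNREF(gnb);
    exit_shogun();
    return 0;
}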