SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
GaussianNaiveBayes.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Sergey Lisitsyn
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
10 
13 #include <shogun/labels/Labels.h>
17 #include <shogun/lib/Signal.h>
18 
19 using namespace shogun;
20 
// Tail of a constructor's member-initializer list followed by an empty body.
// NOTE(review): this listing jumps from original line 20 to 22, so the
// constructor header and the start of its initializer list are not visible.
// Visible initializers: m_min_label, m_num_classes and m_dim start at 0; m_means,
// m_variances, m_label_prob and m_rates are default-constructed.
22  m_min_label(0), m_num_classes(0), m_dim(0), m_means(), m_variances(),
23  m_label_prob(), m_rates()
24 {
25 
26 };
27 
// Constructor taking training examples and training labels.
// NOTE(review): original line 28 (start of the signature, including the
// declaration of train_examples) is missing from this listing.
// Starts with m_features == NULL and zeroed/default members, then stores the
// labels and the features via set_labels() / set_features().
29  CLabels* train_labels) : CNativeMulticlassMachine(), m_features(NULL),
30  m_min_label(0), m_num_classes(0), m_dim(0), m_means(),
31  m_variances(), m_label_prob(), m_rates()
32 {
// The number of examples must equal the number of labels. Both pointers are
// dereferenced here without a prior null check.
33  ASSERT(train_examples->get_num_vectors() == train_labels->get_num_labels())
34  set_labels(train_labels);
35 
// Reject feature objects that do not report the FP_DOT property.
36  if (!train_examples->has_property(FP_DOT))
37  SG_ERROR("Specified features are not of type CDotFeatures\n")
38 
// C-style downcast; its validity rests on the FP_DOT check above.
39  set_features((CDotFeatures*)train_examples);
40 };
41 
// NOTE(review): the header (original line 42) and the only body line (original
// line 44) are missing from this listing, so what this definition does cannot be
// determined from here. Presumably the destructor - confirm against the real file.
43 {
45 };
46 
// Returns the stored m_features pointer.
// NOTE(review): the header (original line 47) and original line 49, which runs
// before the return, are missing from this listing - confirm both against the
// real file (e.g. whether a reference is taken for the caller).
48 {
50  return m_features;
51 }
52 
// Replaces the stored features with `features`.
// NOTE(review): the header (original line 53) is missing from this listing; the
// parameter name `features` is taken from the body.
54 {
// Only feature objects reporting the FP_DOT property are accepted.
55  if (!features->has_property(FP_DOT))
56  SG_ERROR("Specified features are not of type CDotFeatures\n")
57 
// Take a reference on the new object before storing it.
58  SG_REF(features);
// NOTE(review): original line 59 is missing here. If it does not release the
// previously stored m_features, the old object's reference is never dropped -
// confirm against the real file.
60  m_features = (CDotFeatures*)features;
61 }
62 
// Training routine: estimates, per class and per feature dimension, a mean and a
// variance, plus a per-class weight stored in m_label_prob. Returns true.
// NOTE(review): the header (original line 63) is missing from this listing; the
// optional `data` argument is taken from the body. Several interior lines are
// also missing (74-75, 98, 101-103, 106, 124, 147); each gap is marked below.
64 {
65  // init features with data if necessary and assure type is correct
66  if (data)
67  {
68  if (!data->has_property(FP_DOT))
69  SG_ERROR("Specified features are not of type CDotFeatures\n")
70  set_features((CDotFeatures*) data);
71  }
72 
73  // get int labels to train_labels and check length equality
// NOTE(review): original lines 74-75 are missing here.
// m_labels is cast to CMulticlassLabels with a C-style cast; no type check is
// visible in this listing.
76  SGVector<int32_t> train_labels = ((CMulticlassLabels*) m_labels)->get_int_labels();
77  ASSERT(m_features->get_num_vectors()==train_labels.vlen)
78 
79  // init min_label, max_label and loop variables
// NOTE(review): element 0 is read without checking that vlen > 0 - confirm that
// an empty training set is rejected earlier.
80  int32_t min_label = train_labels.vector[0];
81  int32_t max_label = train_labels.vector[0];
82  int i,j;
83 
84  // find minimal and maximal label
85  for (i=1; i<train_labels.vlen; i++)
86  {
87  min_label = CMath::min(min_label, train_labels.vector[i]);
88  max_label = CMath::max(max_label, train_labels.vector[i]);
89  }
90 
91  // subtract minimal label from all labels
// After this loop every label lies in [0, max_label-min_label] and is used
// directly as a class index below.
92  for (i=0; i<train_labels.vlen; i++)
93  train_labels.vector[i]-= min_label;
94 
95  // get number of classes, minimal label and dimensionality
// The class count is the width of the label range, so label values inside the
// range that never occur still get a class slot.
96  m_num_classes = max_label-min_label+1;
97  m_min_label = min_label;
// NOTE(review): original line 98 is missing; m_dim is used below but its
// assignment is not visible in this listing.
99 
100  // allocate memory for distributions' parameters and a priori probability
// NOTE(review): original lines 101-103 (the allocations) are missing.
104 
105  // allocate memory for label rates
// NOTE(review): original line 106 (the allocation) is missing.
107 
108  // make arrays filled by zeros before using
109  m_means.zero();
110  m_variances.zero();
111  m_label_prob.zero();
112  m_rates.zero();
113 
114  // number of iterations in all cycles
// Two passes over the examples plus two passes over the classes (the four loops
// below).
115  int32_t max_progress = 2 * train_labels.vlen + 2 * m_num_classes;
116 
117  // current progress
118  int32_t progress = 0;
119  SG_PROGRESS(progress, 0, max_progress)
120 
121  // get sum of features among labels
122  for (i=0; i<train_labels.vlen; i++)
123  {
// NOTE(review): original line 124 is missing; `fea` (the i-th example's feature
// vector, judging by its use) is declared there.
125  for (j=0; j<m_dim; j++)
126  m_means(j, train_labels.vector[i]) += fea.vector[j];
127 
// m_label_prob temporarily holds the per-class example count.
128  m_label_prob.vector[train_labels.vector[i]]+=1.0;
129 
130  progress++;
131  SG_PROGRESS(progress, 0, max_progress)
132  }
133 
134  // get means of features of labels
// NOTE(review): a class with no examples has a count of 0 here, so this divides
// by zero for that class - confirm the resulting values are never read
// (the per-example rating code skips classes whose m_label_prob is 0.0).
135  for (i=0; i<m_num_classes; i++)
136  {
137  for (j=0; j<m_dim; j++)
138  m_means(j, i) /= m_label_prob.vector[i];
139 
140  progress++;
141  SG_PROGRESS(progress, 0, max_progress)
142  }
143 
144  // compute squared residuals with means available
145  for (i=0; i<train_labels.vlen; i++)
146  {
// NOTE(review): original line 147 is missing; `fea` is presumably fetched again
// for example i there.
148  for (j=0; j<m_dim; j++)
149  {
150  m_variances(j, train_labels.vector[i]) +=
151  CMath::sq(fea[j]-m_means(j, train_labels.vector[i]));
152  }
153 
154  progress++;
155  SG_PROGRESS(progress, 0, max_progress)
156  }
157 
158  // get variance of features of labels
159  for (i=0; i<m_num_classes; i++)
160  {
// Divide the summed squared residuals by (count-1) when the class has more than
// one example, otherwise by 1.
161  for (j=0; j<m_dim; j++)
162  m_variances(j, i) /= m_label_prob.vector[i] > 1 ? m_label_prob.vector[i]-1 : 1;
163 
164  // get a priori probabilities of labels
// NOTE(review): the count is divided by the number of classes, not by the number
// of training examples, so the stored values sum to vlen/m_num_classes rather
// than 1. All classes are scaled by the same factor - confirm whether a
// normalized prior was intended.
165  m_label_prob.vector[i]/= m_num_classes;
166 
167  progress++;
168  SG_PROGRESS(progress, 0, max_progress)
169  }
170  SG_DONE()
171 
172  return true;
173 }
174 
// Classifies every vector of the stored features and returns the predictions as
// a newly allocated CMulticlassLabels object.
// NOTE(review): the header (original line 175) is missing from this listing; the
// optional `data` argument is taken from the body.
176 {
// When data is given it replaces the stored features before classification.
177  if (data)
178  set_features(data);
179 
// NOTE(review): original line 180 is missing here. m_features is dereferenced
// below - confirm whether the missing line guards against it being NULL.
181 
182  // init number of vectors
183  int32_t num_vectors = m_features->get_num_vectors();
184 
185  // init result labels
// Allocated with new and returned as a raw pointer.
186  CMulticlassLabels* result = new CMulticlassLabels(num_vectors);
187 
188  // classify each example of data
189  SG_PROGRESS(0, 0, num_vectors)
190  for (int i = 0; i < num_vectors; i++)
191  {
192  result->set_label(i,apply_one(i));
193  SG_PROGRESS(i + 1, 0, num_vectors)
194  }
195  SG_DONE()
196  return result;
197 };
198 
// Rates every class for one example and returns the label of the best-rated
// class (class index plus m_min_label).
// NOTE(review): the header (original line 199) is missing from this listing; the
// name is taken from the apply_one(i) call in the batch routine.
200 {
201  // get [idx] feature vector
// NOTE(review): original line 202 is missing; `feature_vector`, used below, is
// declared there.
203 
204  // init loop variables
205  int i,k;
206 
207  // rate all labels
208  for (i=0; i<m_num_classes; i++)
209  {
210  // set rate to 0.0 if a priori probability is 0.0 and continue
211  if (m_label_prob.vector[i]==0.0)
212  {
213  m_rates.vector[i] = 0.0;
214  continue;
215  }
216  else
// NOTE(review): original line 217, the statement governed by this `else`, is
// missing from the listing; as printed, the `else` would attach to the `for`
// loop below. Presumably it initializes m_rates[i] from the class prior -
// confirm against the real file.
218 
219  // product all conditional gaussian probabilities
// Works in log space: each dimension adds the log of a Gaussian density,
// log(c/sqrt(var)) - 0.5*(x-mean)^2/var, with c = 0.39894228 ~= 1/sqrt(2*pi).
// Dimensions whose variance is exactly 0.0 are skipped.
220  for (k=0; k<m_dim; k++)
221  if (m_variances(k,i)!=0.0)
222  m_rates.vector[i]+= CMath::log(0.39894228/CMath::sqrt(m_variances(k, i))) -
223  0.5*CMath::sq(feature_vector.vector[k]-m_means(k, i))/(m_variances(k, i));
224  }
225 
226  // find label with maximum rate
// NOTE(review): classes with a zero prior were given rate 0.0 above and are not
// excluded here. The accumulated log terms of other classes can be negative, in
// which case such a class would win the comparison - confirm.
// Ties keep the lowest class index because the comparison is strict.
227  int32_t max_label_idx = 0;
228 
229  for (i=0; i<m_num_classes; i++)
230  {
231  if (m_rates.vector[i]>m_rates.vector[max_label_idx])
232  max_label_idx = i;
233  }
234 
// Shift the class index back to the original label range.
235  return max_label_idx+m_min_label;
236 };

SHOGUN Machine Learning Toolbox - Documentation