SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
ID3ClassifierTree.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2014 Parijat Mazumdar
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  */
30 
37 
38 using namespace shogun;
39 
42 {
43 }
44 
46 {
47 }
48 
50 {
51  REQUIRE(data, "Data required for classification in apply_multiclass\n")
52 
53  node_t* current = get_root();
54  CMulticlassLabels* ret = apply_multiclass_from_current_node((CDenseFeatures<float64_t>*) data, current);
55 
56  SG_UNREF(current);
57  return ret;
58 }
59 
61  CMulticlassLabels* validation_labels, float64_t epsilon)
62 {
63  node_t* current = get_root();
64  prune_tree_machine(validation_data, validation_labels, current, epsilon);
65 
66  SG_UNREF(current);
67  return true;
68 }
69 
71 {
72  REQUIRE(data,"Data required for training\n")
73  REQUIRE(data->get_feature_class()==C_DENSE, "Dense data required for training\n")
74 
75  int32_t num_features = (dynamic_cast<CDenseFeatures<float64_t>*>(data))->get_num_features();
76  SGVector<int32_t> feature_ids = SGVector<int32_t>(num_features);
77  feature_ids.range_fill();
78 
79  set_root(id3train(data, dynamic_cast<CMulticlassLabels*>(m_labels), feature_ids, 0));
80 
81  return true;
82 }
83 
84 CTreeMachineNode<id3TreeNodeData>* CID3ClassifierTree::id3train(CFeatures* data,
85  CMulticlassLabels* class_labels, SGVector<int32_t> feature_id_vector, int32_t level)
86 {
87  node_t* node = new node_t();
88  CDenseFeatures<float64_t>* feats = dynamic_cast<CDenseFeatures<float64_t>*>(data);
89  int32_t num_vecs = feats->get_num_vectors();
90 
91  // set class_label for the node as the mode of occurring multiclass labels
92  SGVector<float64_t> labels = class_labels->get_labels_copy();
93  CMath::qsort(labels);
94 
95  int32_t most_label = labels[0];
96  int32_t most_num = 1;
97  int32_t count = 1;
98 
99  for (int32_t i=1; i<labels.vlen; i++)
100  {
101  if (labels[i] == labels[i-1])
102  {
103  count++;
104  }
105  else if (count>most_num)
106  {
107  most_num = count;
108  most_label = labels[i-1];
109  count = 1;
110  }
111  else
112  {
113  count = 1;
114  }
115  }
116 
117  node->data.class_label = most_label;
118 
119  // if all samples belong to the same class
120  if (most_num == labels.vlen)
121  return node;
122 
123  // if no feature is left
124  if (feature_id_vector.vlen == 0)
125  return node;
126 
127  // else get the feature with the highest informational gain
128  float64_t max = 0;
129  int32_t best_feature_index = -1;
130  for (int32_t i=0; i<feats->get_num_features(); i++)
131  {
132  float64_t gain = informational_gain_attribute(i,feats,class_labels);
133 
134  if (gain >= max)
135  {
136  max = gain;
137  best_feature_index = i;
138  }
139  }
140 
141  // get feature values for the best feature chosen
142  SGVector<float64_t> best_feature_values = SGVector<float64_t>(num_vecs);
143  for (int32_t i=0; i<num_vecs; i++)
144  best_feature_values[i] = (feats->get_feature_vector(i))[best_feature_index];
145 
146  CMulticlassLabels* best_feature_labels = new CMulticlassLabels(best_feature_values);
147  SGVector<float64_t> best_labels_unique = best_feature_labels->get_unique_labels();
148 
149  for (int32_t i=0; i<best_labels_unique.vlen; i++)
150  {
151  // compute the number of vectors with active attribute value
152  int32_t num_cols = 0;
153  float64_t active_feature_value = best_labels_unique[i];
154 
155  for (int32_t j=0; j<num_vecs; j++)
156  {
157  if ( active_feature_value == best_feature_values[j])
158  num_cols++;
159  }
160 
161  SGMatrix<float64_t> mat = SGMatrix<float64_t>(feats->get_num_features()-1, num_cols);
162  SGVector<float64_t> new_labels_vector = SGVector<float64_t>(num_cols);
163 
164  int32_t cnt = 0;
165  // choose the samples that have the active feature value
166  for (int32_t j=0; j<num_vecs; j++)
167  {
168  SGVector<float64_t> sample = feats->get_feature_vector(j);
169  if (active_feature_value == sample[best_feature_index])
170  {
171  int32_t idx = -1;
172  for (int32_t k=0; k<sample.size(); k++)
173  {
174  if (k != best_feature_index)
175  mat(++idx, cnt) = sample[k];
176  }
177 
178  new_labels_vector[cnt] = class_labels->get_labels()[j];
179  cnt++;
180  }
181  }
182 
183  // remove the best_attribute from the remaining attributes index vector
184  SGVector<int32_t> new_feature_id_vector = SGVector<int32_t>(feature_id_vector.vlen-1);
185  cnt = -1;
186  for (int32_t j=0;j<feature_id_vector.vlen;j++)
187  {
188  if (j!=best_feature_index)
189  new_feature_id_vector[++cnt] = feature_id_vector[j];
190  }
191 
192  CMulticlassLabels* new_class_labels = new CMulticlassLabels(new_labels_vector);
194 
195  node_t* child = id3train(new_data, new_class_labels, new_feature_id_vector, level+1);
196  child->data.transit_if_feature_value = active_feature_value;
197  node->data.attribute_id = feature_id_vector[best_feature_index];
198  node->add_child(child);
199 
200  SG_UNREF(new_class_labels);
201  SG_UNREF(new_data);
202  }
203 
204  SG_UNREF(best_feature_labels);
205 
206  return node;
207 }
208 
209 float64_t CID3ClassifierTree::informational_gain_attribute(int32_t attr_no, CFeatures* data,
210  CMulticlassLabels* class_labels)
211 {
212  REQUIRE(data,"Data required for information gain calculation\n")
213  REQUIRE(data->get_feature_class()==C_DENSE,
214  "Dense data required for information gain calculation\n")
215 
216  float64_t gain = 0;
217  CDenseFeatures<float64_t>* feats = dynamic_cast<CDenseFeatures<float64_t>*>(data);
218  int32_t num_vecs = feats->get_num_vectors();
219 
220  // get attribute values for attribute
221  SGVector<float64_t> attribute_values = SGVector<float64_t>(num_vecs);
222 
223  for (int32_t i=0; i<num_vecs; i++)
224  attribute_values[i] = (feats->get_feature_vector(i))[attr_no];
225 
226  CMulticlassLabels* attribute_labels = new CMulticlassLabels(attribute_values);
227  SGVector<float64_t> attr_val_unique = attribute_labels->get_unique_labels();
228 
229  for (int32_t i=0; i<attr_val_unique.vlen; i++)
230  {
231  // calculate class entropy for the specific attribute_value
232  int32_t attr_count=0;
233 
234  for (int32_t j=0; j<num_vecs; j++)
235  {
236  if (attribute_values[j] == attr_val_unique[i])
237  attr_count++;
238  }
239 
240  SGVector<float64_t> sub_class = SGVector<float64_t>(attr_count);
241  int32_t count = 0;
242 
243  for (int32_t j=0; j<num_vecs; j++)
244  {
245  if (attribute_values[j] == attr_val_unique[i])
246  sub_class[count++] = class_labels->get_label(j);
247  }
248 
249  CMulticlassLabels* sub_labels = new CMulticlassLabels(sub_class);
250  float64_t sub_entropy = entropy(sub_labels);
251  gain += sub_entropy*(attr_count-0.f)/(num_vecs-0.f);
252 
253  SG_UNREF(sub_labels);
254  }
255 
256  float64_t data_entropy = entropy(class_labels);
257  gain = data_entropy-gain;
258 
259  SG_UNREF(attribute_labels);
260 
261  return gain;
262 }
263 
264 float64_t CID3ClassifierTree::entropy(CMulticlassLabels* labels)
265 {
267  (labels->get_unique_labels().size());
268 
269  for (int32_t i=0;i<labels->get_unique_labels().size();i++)
270  {
271  int32_t count = 0;
272 
273  for (int32_t j=0;j<labels->get_num_labels();j++)
274  {
275  if (labels->get_unique_labels()[i] == labels->get_label(j))
276  count++;
277  }
278 
279  log_ratios[i] = (count-0.f)/(labels->get_num_labels()-0.f);
280 
281  if (log_ratios[i] != 0)
282  log_ratios[i] = CMath::log(log_ratios[i]);
283  }
284 
285  return CStatistics::entropy(log_ratios.vector, log_ratios.vlen);
286 }
287 
288 void CID3ClassifierTree::prune_tree_machine(CDenseFeatures<float64_t>* feats,
289  CMulticlassLabels* gnd_truth, node_t* current, float64_t epsilon)
290 {
291  SGMatrix<float64_t> feature_matrix = feats->get_feature_matrix();
292  CDynamicObjectArray* children = current->get_children();
293 
294  for (int32_t i=0; i<children->get_num_elements(); i++)
295  {
296  // count number of feature vectors which transit into the child
297  int32_t count = 0;
298  node_t* child = dynamic_cast<node_t*>(children->get_element(i));
299 
300  for (int32_t j=0; j<feature_matrix.num_cols; j++)
301  {
302  float child_transit = child->data.transit_if_feature_value;
303 
304  if (child_transit == feature_matrix(current->data.attribute_id,j))
305  count++;
306  }
307 
308  // form new subset of features and labels
309  SGVector<index_t> subset = SGVector<index_t>(count);
310  int32_t k = 0;
311 
312  for (int32_t j=0; j<feature_matrix.num_cols;j++)
313  {
314  float child_transit = child->data.transit_if_feature_value;
315 
316  if (child_transit == feature_matrix(current->data.attribute_id,j))
317  {
318  subset[k] = (index_t) j;
319  k++;
320  }
321  }
322 
323  feats->add_subset(subset);
324  gnd_truth->add_subset(subset);
325 
326  // prune the child subtree
327  prune_tree_machine(feats, gnd_truth, child, epsilon);
328 
329  feats->remove_subset();
330  gnd_truth->remove_subset();
331 
332  SG_UNREF(child);
333  }
334 
335  SG_UNREF(children);
336 
337  CMulticlassLabels* predicted_unpruned = apply_multiclass_from_current_node(feats, current);
338  SGVector<float64_t> pruned_labels = SGVector<float64_t>(feature_matrix.num_cols);
339  for (int32_t i=0; i<feature_matrix.num_cols; i++)
340  pruned_labels[i] = current->data.class_label;
341 
342  CMulticlassLabels* predicted_pruned = new CMulticlassLabels(pruned_labels);
343 
344  CMulticlassAccuracy* accuracy = new CMulticlassAccuracy();
345  float64_t unpruned_accuracy = accuracy->evaluate(predicted_unpruned, gnd_truth);
346  float64_t pruned_accuracy = accuracy->evaluate(predicted_pruned, gnd_truth);
347 
348  if (unpruned_accuracy<pruned_accuracy+epsilon)
349  {
350  CDynamicObjectArray* null_children = new CDynamicObjectArray();
351  current->set_children(null_children);
352  SG_UNREF(null_children);
353  }
354 
355  SG_UNREF(accuracy);
356  SG_UNREF(predicted_pruned);
357  SG_UNREF(predicted_unpruned);
358 }
359 
360 CMulticlassLabels* CID3ClassifierTree::apply_multiclass_from_current_node(CDenseFeatures<float64_t>* feats,
361  node_t* current)
362 {
363  REQUIRE(feats, "Features should not be NULL")
364  REQUIRE(current, "Current node should not be NULL")
365 
366  int32_t num_vecs = feats->get_num_vectors();
367  SGVector<float64_t> labels = SGVector<float64_t>(num_vecs);
368 
369  // classify vectors in feature matrix taking one at a time
370  for (int32_t i=0; i<num_vecs; i++)
371  {
372  // choose the current subtree as the entry point
373  SGVector<float64_t> sample = feats->get_feature_vector(i);
374  node_t* node = current;
375  SG_REF(node);
376  CDynamicObjectArray* children = node->get_children();
377 
378  // traverse the subtree until leaf node is reached
379  while (children->get_num_elements())
380  {
381  bool flag = false;
382  for (int32_t j=0; j<children->get_num_elements(); j++)
383  {
384  node_t* child = dynamic_cast<node_t*>(children->get_element(j));
385  if (child->data.transit_if_feature_value
386  == sample[node->data.attribute_id])
387  {
388  flag = true;
389 
390  SG_UNREF(node);
391  node = child;
392 
393  SG_UNREF(children);
394  children = node->get_children();
395 
396  break;
397  }
398 
399  SG_UNREF(child);
400  }
401 
402  if (!flag)
403  break;
404  }
405 
406  // class_label of leaf node is the class to which chosen vector belongs
407  labels[i] = node->data.class_label;
408 
409  SG_UNREF(node);
410  SG_UNREF(children);
411  }
412 
413  CMulticlassLabels* ret = new CMulticlassLabels(labels);
414  return ret;
415 }
CTreeMachineNode< id3TreeNodeData > node_t
Definition: TreeMachine.h:52
void range_fill(T start=0)
Definition: SGVector.cpp:171
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
int32_t get_num_features() const
virtual int32_t get_num_labels() const
int32_t index_t
Definition: common.h:62
virtual float64_t evaluate(CLabels *predicted, CLabels *ground_truth)
SGMatrix< ST > get_feature_matrix()
float64_t transit_if_feature_value
SGVector< float64_t > get_unique_labels()
CLabels * m_labels
Definition: Machine.h:361
#define REQUIRE(x,...)
Definition: SGIO.h:206
The class MulticlassAccuracy used to compute accuracy of multiclass classification.
index_t num_cols
Definition: SGMatrix.h:376
CTreeMachineNode< id3TreeNodeData > * get_root()
Definition: TreeMachine.h:88
float64_t get_label(int32_t idx)
structure to store data of a node of id3 tree. This can be used as a template type in TreeMachineNode...
SGVector< float64_t > get_labels_copy()
Definition: DenseLabels.cpp:90
#define SG_REF(x)
Definition: SGObject.h:54
void set_root(CTreeMachineNode< id3TreeNodeData > *root)
Definition: TreeMachine.h:78
SGVector< float64_t > get_labels()
Definition: DenseLabels.cpp:82
static void qsort(T *output, int32_t size)
Definition: Math.h:1313
Multiclass Labels for multi-class classification.
int32_t size() const
Definition: SGVector.h:113
index_t vlen
Definition: SGVector.h:494
virtual int32_t get_num_vectors() const
static float64_t entropy(float64_t *p, int32_t len)
Definition: Statistics.cpp:312
double float64_t
Definition: common.h:50
virtual void remove_subset()
Definition: Labels.cpp:49
virtual void add_subset(SGVector< index_t > subset)
Definition: Labels.cpp:39
virtual EFeatureClass get_feature_class() const =0
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an a...
#define SG_UNREF(x)
Definition: SGObject.h:55
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void remove_subset()
Definition: Features.cpp:322
virtual bool train_machine(CFeatures *data=NULL)
The class Features is the base class of all feature objects.
Definition: Features.h:68
static float64_t log(float64_t v)
Definition: Math.h:922
CSGObject * get_element(int32_t index) const
Matrix::Scalar max(Matrix m)
Definition: Redux.h:68
class TreeMachine, a base class for tree based multiclass classifiers. This class is derived from CBa...
Definition: TreeMachine.h:48
bool prune_tree(CDenseFeatures< float64_t > *validation_data, CMulticlassLabels *validation_labels, float64_t epsilon=0.f)
T * data() const
Definition: SGVector.h:116
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:310

SHOGUN Machine Learning Toolbox - Documentation