SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
BaggingMachine.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Viktor Gal
8  * Copyright (C) 2013 Viktor Gal
9  */
10 
14 
15 using namespace shogun;
16 
18  : CMachine()
19 {
20  init();
22 }
23 
25  : CMachine()
26 {
27  init();
29 
30  set_labels(labels);
31 
32  SG_REF(features);
33  m_features = features;
34 }
35 
37 {
43 }
44 
46 {
47  SGVector<float64_t> combined_vector = apply_get_outputs(data);
48 
49  CBinaryLabels* pred = new CBinaryLabels(combined_vector);
50  return pred;
51 }
52 
54 {
55  SGVector<float64_t> combined_vector = apply_get_outputs(data);
56 
57  CMulticlassLabels* pred = new CMulticlassLabels(combined_vector);
58  return pred;
59 }
60 
62 {
63  SGVector<float64_t> combined_vector = apply_get_outputs(data);
64 
65  CRegressionLabels* pred = new CRegressionLabels(combined_vector);
66 
67  return pred;
68 }
69 
71 {
72  ASSERT(data != NULL);
73  REQUIRE(m_combination_rule != NULL, "Combination rule is not set!");
75 
77  output.zero();
78 
79  /*
80  #pragma omp parallel for num_threads(parallel->get_num_threads())
81  */
82  for (int32_t i = 0; i < m_num_bags; ++i)
83  {
84  CMachine* m = dynamic_cast<CMachine*>(m_bags->get_element(i));
85  CLabels* l = m->apply(data);
87  if (l!=NULL)
88  lv = dynamic_cast<CDenseLabels*>(l)->get_labels();
89  else
90  SG_ERROR("NULL returned by apply method\n");
91 
92  float64_t* bag_results = output.get_column_vector(i);
93  memcpy(bag_results, lv.vector, lv.vlen*sizeof(float64_t));
94 
95  SG_UNREF(l);
96  SG_UNREF(m);
97  }
98 
100 
101  return combined;
102 }
103 
105 {
106  REQUIRE(m_machine != NULL, "Machine is not set!");
107  REQUIRE(m_num_bags > 0, "Number of bag is not set!");
108 
109  if (data)
110  {
111  SG_REF(data);
113  m_features = data;
114 
116  }
117 
118  // if bag size is not provided, set it equal to number of training vectors
119  if (m_bag_size==0)
121 
122  // clear the array, if previously trained
123  m_bags->reset_array();
124 
125  // reset the oob index vector
128 
131 
132  /*
133  TODO: enable multi-threaded learning. This requires views support
134  on CFeatures
135  #pragma omp parallel for num_threads(parallel->get_num_threads())
136  */
137  for (int32_t i = 0; i < m_num_bags; ++i)
138  {
139  CMachine* c=dynamic_cast<CMachine*>(m_machine->clone());
140  ASSERT(c != NULL);
142  idx.random(0, m_features->get_num_vectors()-1);
143  m_labels->add_subset(idx);
144  /* TODO:
145  if it's a binary labeling, ensure that
146  there are always samples of both classes
147  if ((m_labels->get_label_type() == LT_BINARY))
148  {
149  while (true) {
150  if (!m_labels->ensure_valid()) {
151  m_labels->remove_subset();
152  idx.random(0, m_features->get_num_vectors());
153  m_labels->add_subset(idx);
154  continue;
155  }
156  break;
157  }
158  }
159  */
160  m_features->add_subset(idx);
161  set_machine_parameters(c,idx);
162  c->set_labels(m_labels);
163  c->train(m_features);
166 
167  // get out of bag indexes
169  m_oob_indices->push_back(oob);
170 
171  // add trained machine to bag array
172  m_bags->push_back(c);
173 
174  SG_UNREF(c);
175  }
176 
177  return true;
178 }
179 
181 {
182 }
183 
185 {
186  SG_ADD((CSGObject**)&m_features, "features", "Train features for bagging",
188  SG_ADD(&m_num_bags, "num_bags", "Number of bags", MS_AVAILABLE);
189  SG_ADD(&m_bag_size, "bag_size", "Number of vectors per bag", MS_AVAILABLE);
190  SG_ADD((CSGObject**)&m_bags, "bags", "Bags array", MS_NOT_AVAILABLE);
191  SG_ADD((CSGObject**)&m_combination_rule, "combination_rule",
192  "Combination rule to use for aggregating", MS_AVAILABLE);
193  SG_ADD(&m_all_oob_idx, "all_oob_idx", "Indices of all oob vectors",
195  SG_ADD((CSGObject**)&m_oob_indices, "oob_indices",
196  "OOB indices for each machine", MS_NOT_AVAILABLE);
197 }
198 
/**
 * Set the number of bags used by this bagging machine.
 *
 * The value is stored in m_num_bags without any validation here; a REQUIRE
 * elsewhere in this file rejects a non-positive value with
 * "Number of bag is not set!".
 *
 * @param num_bags number of bags to store in m_num_bags
 */
199 void CBaggingMachine::set_num_bags(int32_t num_bags)
200 {
201  m_num_bags = num_bags;
202 }
203 
205 {
206  return m_num_bags;
207 }
208 
/**
 * Set the number of vectors per bag.
 *
 * The value is stored in m_bag_size without any validation here.
 * NOTE(review): the training code in this file tests m_bag_size==0 under the
 * comment "if bag size is not provided, set it equal to number of training
 * vectors", so 0 appears to mean "use all training vectors" - confirm
 * against the full source.
 *
 * @param bag_size number of vectors per bag to store in m_bag_size
 */
209 void CBaggingMachine::set_bag_size(int32_t bag_size)
210 {
211  m_bag_size = bag_size;
212 }
213 
215 {
216  return m_bag_size;
217 }
218 
220 {
221  SG_REF(m_machine);
222  return m_machine;
223 }
224 
226 {
227  SG_REF(machine);
229  m_machine = machine;
230 }
231 
233 {
234  m_bags = new CDynamicObjectArray();
235  m_machine = NULL;
236  m_features = NULL;
237  m_combination_rule = NULL;
238  m_labels = NULL;
239  m_num_bags = 0;
240  m_bag_size = 0;
242  m_oob_indices = NULL;
243 }
244 
246 {
247  SG_REF(rule);
249  m_combination_rule = rule;
250 }
251 
253 {
255  return m_combination_rule;
256 }
257 
259 {
260  REQUIRE(m_combination_rule != NULL, "Combination rule is not set!");
261  REQUIRE(m_bags->get_num_elements() > 0, "BaggingMachine is not trained!");
262 
265  output.zero();
266  else
267  output.set_const(NAN);
268 
269  /* TODO: add parallel support for applying the OOBs;
270  this is only possible once add_subset is thread-safe
271  #pragma omp parallel for num_threads(parallel->get_num_threads())
272  */
273  for (index_t i = 0; i < m_bags->get_num_elements(); i++)
274  {
275  CMachine* m = dynamic_cast<CMachine*>(m_bags->get_element(i));
276  CDynamicArray<index_t>* current_oob
277  = dynamic_cast<CDynamicArray<index_t>*>(m_oob_indices->get_element(i));
278 
279  SGVector<index_t> oob(current_oob->get_array(), current_oob->get_num_elements(), false);
280  m_features->add_subset(oob);
281 
282  CLabels* l = m->apply(m_features);
284  if (l!=NULL)
285  lv = dynamic_cast<CDenseLabels*>(l)->get_labels();
286  else
287  SG_ERROR("NULL returned by apply method\n");
288 
289  // store this machine's predictions for its out-of-bag vectors; in-bag entries keep their initial fill value
290  for (index_t j = 0; j < oob.vlen; j++)
291  output(oob[j], i) = lv[j];
292 
294  SG_UNREF(current_oob);
295  SG_UNREF(m);
296  SG_UNREF(l);
297  }
298 
299  DynArray<index_t> idx;
300  for (index_t i = 0; i < m_features->get_num_vectors(); i++)
301  {
302  if (m_all_oob_idx[i])
303  idx.push_back(i);
304  }
305 
306  SGVector<float64_t> combined = m_combination_rule->combine(output);
308  for (int32_t i=0;i<lab.vlen;i++)
309  lab[i]=combined[idx.get_element(i)];
310 
311  CLabels* predicted = NULL;
312  switch (m_labels->get_label_type())
313  {
314  case LT_BINARY:
315  predicted = new CBinaryLabels(lab);
316  break;
317 
318  case LT_MULTICLASS:
319  predicted = new CMulticlassLabels(lab);
320  break;
321 
322  case LT_REGRESSION:
323  predicted = new CRegressionLabels(lab);
324  break;
325 
326  default:
327  SG_ERROR("Unsupported label type\n");
328  }
329 
331  float64_t res = eval->evaluate(predicted, m_labels);
333 
334  SG_UNREF(predicted);
335  return res;
336 }
337 
339 {
341  out_of_bag.set_const(true);
342 
343  // mark the ones that are in_bag
344  index_t oob_count = m_features->get_num_vectors();
345  for (index_t i = 0; i < in_bag.vlen; i++)
346  {
347  if (out_of_bag[in_bag[i]])
348  {
349  out_of_bag[in_bag[i]] = false;
350  oob_count--;
351  }
352  }
353 
355  // store the indices of vectors that are out of the bag
356  for (index_t i = 0; i < out_of_bag.vlen; i++)
357  {
358  if (out_of_bag[i])
359  {
360  oob->push_back(i);
361  m_all_oob_idx[i] = true;
362  }
363  }
364 
365  return oob;
366 }
367 
virtual CRegressionLabels * apply_regression(CFeatures *data=NULL)
T get_element(int32_t index) const
Definition: DynArray.h:142
void set_combination_rule(CCombinationRule *rule)
virtual ELabelType get_label_type() const =0
binary labels +1/-1
Definition: LabelTypes.h:18
Real Labels are real-valued labels.
CCombinationRule * m_combination_rule
CCombinationRule * get_combination_rule() const
int32_t index_t
Definition: common.h:62
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
virtual CSGObject * clone()
Definition: SGObject.cpp:714
virtual int32_t get_num_labels() const =0
real valued labels (e.g. for regression, classifier outputs)
Definition: LabelTypes.h:22
virtual bool train_machine(CFeatures *data=NULL)
multi-class labels 0,1,...
Definition: LabelTypes.h:20
virtual void set_machine_parameters(CMachine *m, SGVector< index_t > idx)
virtual int32_t get_bag_size() const
virtual float64_t evaluate(CLabels *predicted, CLabels *ground_truth)=0
virtual int32_t get_num_vectors() const =0
T * get_array() const
Definition: DynamicArray.h:408
CLabels * m_labels
Definition: Machine.h:361
void random(T min_value, T max_value)
Definition: SGVector.cpp:181
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
int32_t get_num_bags() const
int32_t get_num_elements() const
Definition: DynArray.h:130
CDynamicArray< index_t > * get_oob_indices(const SGVector< index_t > &in_bag)
SGVector< float64_t > apply_get_outputs(CFeatures *data)
Template Dynamic array class that creates an array that can be used like a list or an array...
Definition: DynArray.h:22
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
#define SG_REF(x)
Definition: SGObject.h:51
A generic learning machine interface.
Definition: Machine.h:143
SGVector< bool > m_all_oob_idx
CMachine * get_machine() const
virtual SGVector< float64_t > combine(const SGMatrix< float64_t > &ensemble_result) const =0
Multiclass Labels for multi-class classification.
index_t vlen
Definition: SGVector.h:494
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:112
void push_back(T element)
Definition: DynArray.h:254
virtual void set_machine(CMachine *machine)
Template Dynamic array class that creates an array that can be used like a list or an array...
Definition: DynArray.h:32
double float64_t
Definition: common.h:50
virtual void remove_subset()
Definition: Labels.cpp:49
CDynamicObjectArray * m_bags
virtual CLabels * get_labels()
Definition: Machine.cpp:76
virtual void add_subset(SGVector< index_t > subset)
Definition: Labels.cpp:39
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an a...
void set_num_bags(int32_t num_bags)
CDynamicObjectArray * m_oob_indices
#define SG_UNREF(x)
Definition: SGObject.h:52
all classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void remove_subset()
Definition: Features.cpp:322
The class Features is the base class of all feature objects.
Definition: Features.h:68
CombinationRule abstract class The CombinationRule defines an interface to how to combine the classif...
T * get_array() const
Definition: DynArray.h:372
virtual bool train(CFeatures *data=NULL)
Definition: Machine.cpp:39
int32_t get_num_elements() const
Definition: DynamicArray.h:200
Binary Labels for binary classification.
Definition: BinaryLabels.h:37
CSGObject * get_element(int32_t index) const
#define SG_ADD(...)
Definition: SGObject.h:81
#define NAN
Definition: Math.cpp:26
Dense integer or floating point labels.
Definition: DenseLabels.h:35
virtual CBinaryLabels * apply_binary(CFeatures *data=NULL)
float64_t get_oob_error(CEvaluation *eval) const
virtual void set_labels(CLabels *lab)
Definition: Machine.cpp:65
Class Evaluation, a base class for other classes used to evaluate labels, e.g. accuracy of classifica...
Definition: Evaluation.h:40
void set_const(T const_elem)
Definition: SGVector.cpp:152
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:310
virtual void set_bag_size(int32_t bag_size)
virtual CLabels * apply(CFeatures *data=NULL)
Definition: Machine.cpp:152

SHOGUN Machine Learning Toolbox - Documentation