SHOGUN  6.1.3
BaggingMachine.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Viktor Gal
8  * Copyright (C) 2013 Viktor Gal
9  */
10 
15 
17 
18 using namespace shogun;
19 
21  : CMachine()
22 {
23  init();
25 }
26 
28  : CMachine()
29 {
30  init();
32 
33  set_labels(labels);
34 
35  SG_REF(features);
36  m_features = features;
37 }
38 
40 {
46 }
47 
49 {
51 
52  CMeanRule* mean_rule = new CMeanRule();
53 
55  SGVector<float64_t> probabilities = mean_rule->combine(output);
56 
57  float64_t threshold = 0.5;
58  CBinaryLabels* pred = new CBinaryLabels(probabilities, threshold);
59 
60  SG_UNREF(mean_rule);
61 
62  return pred;
63 }
64 
66 {
67  SGMatrix<float64_t> bagged_outputs =
69 
70  REQUIRE(m_labels, "Labels not set.\n");
71  REQUIRE(
73  "Labels (%s) are not compatible with multiclass.\n",
74  m_labels->get_name());
75 
76  auto labels_multiclass = dynamic_cast<CMulticlassLabels*>(m_labels);
77  auto num_samples = bagged_outputs.size() / m_num_bags;
78  auto num_classes = labels_multiclass->get_num_classes();
79 
80  CMulticlassLabels* pred = new CMulticlassLabels(num_samples);
81  pred->allocate_confidences_for(num_classes);
82 
83  SGMatrix<float64_t> class_probabilities(num_classes, num_samples);
84  class_probabilities.zero();
85 
86  for (auto i = 0; i < num_samples; ++i)
87  {
88  for (auto j = 0; j < m_num_bags; ++j)
89  {
90  int32_t class_idx = bagged_outputs(i, j);
91  class_probabilities(class_idx, i) += 1;
92  }
93  }
94 
95  class_probabilities = linalg::scale(class_probabilities, 1.0 / m_num_bags);
96 
97  for (auto i = 0; i < num_samples; ++i)
98  pred->set_multiclass_confidences(i, class_probabilities.get_column(i));
99 
100  SGVector<float64_t> combined = m_combination_rule->combine(bagged_outputs);
101  pred->set_labels(combined);
102 
103  return pred;
104 }
105 
107 {
108  return new CRegressionLabels(apply_get_outputs(data));
109 }
110 
112 {
113  ASSERT(data != NULL);
114  REQUIRE(m_combination_rule != NULL, "Combination rule is not set!");
115 
117  SGVector<float64_t> combined = m_combination_rule->combine(output);
118 
119  return combined;
120 }
121 
124 {
126 
128  output.zero();
129 
130  #pragma omp parallel for
131  for (int32_t i = 0; i < m_num_bags; ++i)
132  {
133  CMachine* m = dynamic_cast<CMachine*>(m_bags->get_element(i));
134  CLabels* l = m->apply(data);
136  if (l!=NULL)
137  lv = dynamic_cast<CDenseLabels*>(l)->get_labels();
138  else
139  SG_ERROR("NULL returned by apply method\n");
140 
141  float64_t* bag_results = output.get_column_vector(i);
142  sg_memcpy(bag_results, lv.vector, lv.vlen*sizeof(float64_t));
143 
144  SG_UNREF(l);
145  SG_UNREF(m);
146  }
147 
148  return output;
149 }
150 
152 {
153  REQUIRE(m_machine != NULL, "Machine is not set!");
154  REQUIRE(m_num_bags > 0, "Number of bag is not set!");
155 
156  if (data)
157  {
158  SG_REF(data);
160  m_features = data;
161 
163  }
164 
165  // if bag size is not provided, set it equal to number of training vectors
166  if (m_bag_size==0)
168 
169  // clear the array, if previously trained
170  m_bags->reset_array();
171 
172  // reset the oob index vector
175 
178 
179  SGMatrix<index_t> rnd_indicies(m_bag_size, m_num_bags);
180  for (index_t i = 0; i < m_num_bags*m_bag_size; ++i)
181  rnd_indicies.matrix[i] = CMath::random(0, m_bag_size-1);
182 
183  #pragma omp parallel for
184  for (int32_t i = 0; i < m_num_bags; ++i)
185  {
186  CMachine* c=dynamic_cast<CMachine*>(m_machine->clone());
187  ASSERT(c != NULL);
188  SGVector<index_t> idx(rnd_indicies.get_column_vector(i), m_bag_size, false);
189 
190  CFeatures* features;
191  CLabels* labels;
192 
194  {
195  features = m_features;
196  labels = m_labels;
197  }
198  else
199  {
200  features = m_features->shallow_subset_copy();
201  labels = m_labels->shallow_subset_copy();
202  }
203 
204  labels->add_subset(idx);
205  /* TODO:
206  if it's a binary labeling ensure that
207  there's always samples of both classes
208  if ((m_labels->get_label_type() == LT_BINARY))
209  {
210  while (true) {
211  if (!m_labels->ensure_valid()) {
212  m_labels->remove_subset();
213  idx.random(0, m_features->get_num_vectors());
214  m_labels->add_subset(idx);
215  continue;
216  }
217  break;
218  }
219  }
220  */
221  features->add_subset(idx);
222  set_machine_parameters(c,idx);
223  c->set_labels(labels);
224  c->train(features);
225  features->remove_subset();
226  labels->remove_subset();
227 
228  #pragma omp critical
229  {
230  // get out of bag indexes
232  m_oob_indices->push_back(oob);
233 
234  // add trained machine to bag array
235  m_bags->push_back(c);
236  }
237 
238  if (get_global_parallel()->get_num_threads()!=1)
239  {
240  SG_UNREF(features);
241  SG_UNREF(labels);
242  }
243 
244  SG_UNREF(c);
245  }
246 
247  return true;
248 }
249 
251 {
252 }
253 
255 {
256  SG_ADD((CSGObject**)&m_features, "features", "Train features for bagging",
258  SG_ADD(&m_num_bags, "num_bags", "Number of bags", MS_AVAILABLE);
259  SG_ADD(&m_bag_size, "bag_size", "Number of vectors per bag", MS_AVAILABLE);
260  SG_ADD((CSGObject**)&m_bags, "bags", "Bags array", MS_NOT_AVAILABLE);
261  SG_ADD((CSGObject**)&m_combination_rule, "combination_rule",
262  "Combination rule to use for aggregating", MS_AVAILABLE);
263  SG_ADD(&m_all_oob_idx, "all_oob_idx", "Indices of all oob vectors",
265  SG_ADD((CSGObject**)&m_oob_indices, "oob_indices",
266  "OOB indices for each machine", MS_NOT_AVAILABLE);
267 }
268 
/**
 * Set the number of bags.
 *
 * Stores the value in m_num_bags without validating it; the training code
 * in this file requires m_num_bags > 0 before it proceeds.
 *
 * @param num_bags number of bags
 */
269 void CBaggingMachine::set_num_bags(int32_t num_bags)
270 {
271  m_num_bags = num_bags;
272 }
273 
275 {
276  return m_num_bags;
277 }
278 
/**
 * Set the bag size, i.e. the number of vectors per bag.
 *
 * Stores the value in m_bag_size without validating it. The training code
 * in this file checks for m_bag_size == 0 and, per its own comment, treats
 * that as "bag size not provided".
 *
 * @param bag_size number of vectors per bag
 */
279 void CBaggingMachine::set_bag_size(int32_t bag_size)
280 {
281  m_bag_size = bag_size;
282 }
283 
285 {
286  return m_bag_size;
287 }
288 
290 {
291  SG_REF(m_machine);
292  return m_machine;
293 }
294 
296 {
297  SG_REF(machine);
299  m_machine = machine;
300 }
301 
303 {
304  m_bags = new CDynamicObjectArray();
305  m_machine = NULL;
306  m_features = NULL;
307  m_combination_rule = NULL;
308  m_labels = NULL;
309  m_num_bags = 0;
310  m_bag_size = 0;
312  m_oob_indices = NULL;
313 }
314 
316 {
317  SG_REF(rule);
319  m_combination_rule = rule;
320 }
321 
323 {
325  return m_combination_rule;
326 }
327 
329 {
330  REQUIRE(m_combination_rule != NULL, "Combination rule is not set!");
331  REQUIRE(m_bags->get_num_elements() > 0, "BaggingMachine is not trained!");
332 
335  output.zero();
336  else
337  output.set_const(NAN);
338 
339  /* TODO: add parallel support of applying the OOBs
340  only possible when add_subset is thread-safe
341  #pragma omp parallel for num_threads(parallel->get_num_threads())
342  */
343  for (index_t i = 0; i < m_bags->get_num_elements(); i++)
344  {
345  CMachine* m = dynamic_cast<CMachine*>(m_bags->get_element(i));
346  CDynamicArray<index_t>* current_oob
347  = dynamic_cast<CDynamicArray<index_t>*>(m_oob_indices->get_element(i));
348 
349  SGVector<index_t> oob(current_oob->get_array(), current_oob->get_num_elements(), false);
350  m_features->add_subset(oob);
351 
352  CLabels* l = m->apply(m_features);
354  if (l!=NULL)
355  lv = dynamic_cast<CDenseLabels*>(l)->get_labels();
356  else
357  SG_ERROR("NULL returned by apply method\n");
358 
359  // store the out-of-bag predictions; in-bag entries keep their initial fill value
360  for (index_t j = 0; j < oob.vlen; j++)
361  output(oob[j], i) = lv[j];
362 
364  SG_UNREF(current_oob);
365  SG_UNREF(m);
366  SG_UNREF(l);
367  }
368 
369  std::vector<index_t> idx;
370  for (index_t i = 0; i < m_features->get_num_vectors(); i++)
371  {
372  if (m_all_oob_idx[i])
373  idx.push_back(i);
374  }
375 
376  SGVector<float64_t> combined = m_combination_rule->combine(output);
377  SGVector<float64_t> lab(idx.size());
378  for (int32_t i=0;i<lab.vlen;i++)
379  lab[i]=combined[idx[i]];
380 
381  CLabels* predicted = NULL;
382  switch (m_labels->get_label_type())
383  {
384  case LT_BINARY:
385  predicted = new CBinaryLabels(lab);
386  break;
387 
388  case LT_MULTICLASS:
389  predicted = new CMulticlassLabels(lab);
390  break;
391 
392  case LT_REGRESSION:
393  predicted = new CRegressionLabels(lab);
394  break;
395 
396  default:
397  SG_ERROR("Unsupported label type\n");
398  }
399 
400  m_labels->add_subset(SGVector<index_t>(idx.data(), idx.size(), false));
401  float64_t res = eval->evaluate(predicted, m_labels);
403 
404  SG_UNREF(predicted);
405  return res;
406 }
407 
409 {
411  out_of_bag.set_const(true);
412 
413  // mark the ones that are in_bag
414  for (index_t i = 0; i < in_bag.vlen; i++)
415  out_of_bag[in_bag[i]] &= false;
416 
418  // store the indices of vectors that are out of the bag
419  for (index_t i = 0; i < out_of_bag.vlen; i++)
420  {
421  if (out_of_bag[i])
422  {
423  oob->push_back(i);
424  m_all_oob_idx[i] = true;
425  }
426  }
427 
428  return oob;
429 }
430 
void allocate_confidences_for(int32_t n_classes)
virtual const char * get_name() const =0
virtual CFeatures * shallow_subset_copy()
Definition: Features.h:353
virtual CRegressionLabels * apply_regression(CFeatures *data=NULL)
Parallel * get_global_parallel()
Definition: SGObject.cpp:311
void set_combination_rule(CCombinationRule *rule)
virtual ELabelType get_label_type() const =0
binary labels +1/-1
Definition: LabelTypes.h:18
Real Labels are real-valued labels.
virtual CLabels * shallow_subset_copy()
Definition: Labels.h:132
SGVector< T > get_column(index_t col) const
Definition: SGMatrix.cpp:399
CCombinationRule * m_combination_rule
int32_t get_num_threads() const
Definition: Parallel.cpp:97
CCombinationRule * get_combination_rule() const
int32_t index_t
Definition: common.h:72
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
virtual CSGObject * clone()
Definition: SGObject.cpp:734
virtual int32_t get_num_labels() const =0
real valued labels (e.g. for regression, classifier outputs)
Definition: LabelTypes.h:22
virtual bool train_machine(CFeatures *data=NULL)
multi-class labels 0,1,...
Definition: LabelTypes.h:20
virtual void set_machine_parameters(CMachine *m, SGVector< index_t > idx)
void scale(SGVector< T > &a, SGVector< T > &result, T alpha=1)
virtual int32_t get_bag_size() const
virtual float64_t evaluate(CLabels *predicted, CLabels *ground_truth)=0
virtual int32_t get_num_vectors() const =0
CLabels * m_labels
Definition: Machine.h:436
int32_t get_num_elements() const
Definition: DynamicArray.h:200
#define SG_ERROR(...)
Definition: SGIO.h:128
#define REQUIRE(x,...)
Definition: SGIO.h:181
int32_t get_num_bags() const
CDynamicArray< index_t > * get_oob_indices(const SGVector< index_t > &in_bag)
SGVector< float64_t > apply_get_outputs(CFeatures *data)
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
#define SG_REF(x)
Definition: SGObject.h:52
A generic learning machine interface.
Definition: Machine.h:151
static uint64_t random()
Definition: Math.h:811
SGVector< bool > m_all_oob_idx
CMachine * get_machine() const
virtual SGVector< float64_t > combine(const SGMatrix< float64_t > &ensemble_result) const =0
T * get_array() const
Definition: DynamicArray.h:408
Multiclass Labels for multi-class classification.
#define ASSERT(x)
Definition: SGIO.h:176
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:124
virtual void set_machine(CMachine *machine)
CMeanRule simply averages the outputs of the Machines in the ensemble.
Definition: MeanRule.h:23
double float64_t
Definition: common.h:60
virtual void remove_subset()
Definition: Labels.cpp:51
CDynamicObjectArray * m_bags
virtual CLabels * get_labels()
Definition: Machine.cpp:83
virtual void add_subset(SGVector< index_t > subset)
Definition: Labels.cpp:41
SGMatrix< float64_t > apply_outputs_without_combination(CFeatures *data)
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an array...
virtual SGVector< float64_t > combine(const SGMatrix< float64_t > &ensemble_result) const
Definition: MeanRule.cpp:28
void set_const(T const_elem)
Definition: SGVector.cpp:199
void set_num_bags(int32_t num_bags)
void set_multiclass_confidences(int32_t i, SGVector< float64_t > confidences)
Template Dynamic array class that creates an array that can be used like a list or an array...
Definition: DynArray.h:22
CDynamicObjectArray * m_oob_indices
#define SG_UNREF(x)
Definition: SGObject.h:53
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void remove_subset()
Definition: Features.cpp:322
The class Features is the base class of all feature objects.
Definition: Features.h:69
CombinationRule abstract class. The CombinationRule defines an interface for how to combine the classif...
virtual bool train(CFeatures *data=NULL)
Definition: Machine.cpp:43
Binary Labels for binary classification.
Definition: BinaryLabels.h:37
CSGObject * get_element(int32_t index) const
#define SG_ADD(...)
Definition: SGObject.h:93
#define NAN
Definition: Math.cpp:20
Dense integer or floating point labels.
Definition: DenseLabels.h:35
virtual CBinaryLabels * apply_binary(CFeatures *data=NULL)
float64_t get_oob_error(CEvaluation *eval) const
virtual void set_labels(CLabels *lab)
Definition: Machine.cpp:72
Class Evaluation, a base class for other classes used to evaluate labels, e.g. accuracy of classifica...
Definition: Evaluation.h:40
T * get_column_vector(index_t col) const
Definition: SGMatrix.h:144
void set_labels(SGVector< float64_t > v)
Definition: DenseLabels.cpp:77
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:310
int64_t size() const
Definition: SGMatrix.h:275
index_t vlen
Definition: SGVector.h:571
virtual void set_bag_size(int32_t bag_size)
virtual CLabels * apply(CFeatures *data=NULL)
Definition: Machine.cpp:159

SHOGUN Machine Learning Toolbox - Documentation