SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
BaggingMachine.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Viktor Gal
8  * Copyright (C) 2013 Viktor Gal
9  */
10 
12 #include <shogun/base/Parameter.h>
13 
14 using namespace shogun;
15 
// Constructor body; its signature line is absent from this listing.
// Chains to CMachine(), then runs init() followed by register_parameters().
17  : CMachine()
18 {
19  init();
20  register_parameters();
21 }
22 
// Constructor body; its signature line is absent from this listing. The body
// reads two names, `features` and `labels`, which are presumably its parameters.
// After init() and register_parameters(), `labels` is forwarded to set_labels()
// and `features` is reference-counted (SG_REF) and stored in m_features.
24  : CMachine()
25 {
26  init();
27  register_parameters();
28 
29  set_labels(labels);
30 
31  SG_REF(features);
32  m_features = features;
33 }
34 
// Signature line absent from this listing; presumably the destructor (confirm).
// Drops the references held on the template machine, the training features,
// the combination rule, the bag array and the per-bag OOB index array.
36 {
37  SG_UNREF(m_machine);
38  SG_UNREF(m_features);
39  SG_UNREF(m_combination_rule);
40  SG_UNREF(m_bags);
41  SG_UNREF(m_oob_indices);
42 }
43 
// Signature line absent from this listing; presumably the binary apply method.
// Computes the combined ensemble output for `data` via apply_get_outputs() and
// returns it wrapped in a newly allocated CBinaryLabels.
45 {
46  SGVector<float64_t> combined_vector = apply_get_outputs(data);
47 
48  CBinaryLabels* pred = new CBinaryLabels(combined_vector);
49  return pred;
50 }
51 
// Signature line absent from this listing; presumably the multiclass apply method.
// Computes the combined ensemble output for `data` via apply_get_outputs() and
// returns it wrapped in a newly allocated CMulticlassLabels.
53 {
54  SGVector<float64_t> combined_vector = apply_get_outputs(data);
55 
56  CMulticlassLabels* pred = new CMulticlassLabels(combined_vector);
57  return pred;
58 }
59 
// Signature line absent from this listing; presumably the regression apply method.
// Computes the combined ensemble output for `data` via apply_get_outputs() and
// returns it wrapped in a newly allocated CRegressionLabels.
61 {
62  SGVector<float64_t> combined_vector = apply_get_outputs(data);
63 
64  CRegressionLabels* pred = new CRegressionLabels(combined_vector);
65 
66  return pred;
67 }
68 
// Signature line absent from this listing; the callers above invoke this as
// apply_get_outputs(data) and store the result in an SGVector<float64_t>.
// Applies every trained bag machine to `data`, collects the per-bag outputs in
// a matrix with one column per bag, and reduces that matrix to one vector
// through the configured combination rule.
70 {
// Preconditions: data is non-null, a combination rule is set, and the bag
// array holds exactly m_num_bags machines.
71  ASSERT(data != NULL);
72  REQUIRE(m_combination_rule != NULL, "Combination rule is not set!");
73  ASSERT(m_num_bags == m_bags->get_num_elements());
74 
// rows = vectors in `data`, columns = bags
75  SGMatrix<float64_t> output(data->get_num_vectors(), m_num_bags);
76  output.zero();
77 
78  #pragma omp parallel for num_threads(parallel->get_num_threads())
79  for (int32_t i = 0; i < m_num_bags; ++i)
80  {
// NOTE(review): the dynamic_cast result is dereferenced without a null check.
81  CMachine* m = dynamic_cast<CMachine*>(m_bags->get_element(i));
82  CLabels* l = m->apply(data);
// NOTE(review): the declaration of `lv` (listing line 83) is absent from this
// listing; presumably it holds the values of `l` - confirm against the repo.
84  float64_t* bag_results = output.get_column_vector(i);
// Copy lv.vlen outputs of bag i into column i.
// NOTE(review): lv.vlen is not checked against the column length here.
85  memcpy(bag_results, lv.vector, lv.vlen*sizeof(float64_t));
86 
87  SG_UNREF(l);
88  SG_UNREF(m);
89  }
90 
91  SGVector<float64_t> combined = m_combination_rule->combine(output);
92 
93  return combined;
94 }
95 
// Signature line absent from this listing; presumably the training entry point
// taking a features pointer named `data` (confirm).
// Trains m_num_bags clones of m_machine, each on a subset of m_bag_size
// indices of the training features, and records the out-of-bag indices of
// every bag. Returns true once all bags are trained.
97 {
98  REQUIRE(m_machine != NULL, "Machine is not set!");
99  REQUIRE(m_bag_size > 0, "Bag size is not set!");
100  REQUIRE(m_num_bags > 0, "Number of bag is not set!");
101 
// When new features are passed, they replace the stored ones (reference the
// new object before releasing the old) and must match the label count.
102  if (data)
103  {
104  SG_REF(data);
105  SG_UNREF(m_features);
106  m_features = data;
107 
108  ASSERT(m_features->get_num_vectors() == m_labels->get_num_labels());
109  }
110 
111  // the bag size must be strictly smaller than the number of feature vectors
112  ASSERT(m_bag_size < m_features->get_num_vectors());
113 
114  // clear the array, if previously trained
115  m_bags->reset_array();
116 
117  // reset the oob index vector
118  m_all_oob_idx = SGVector<bool>(m_features->get_num_vectors());
119  m_all_oob_idx.zero();
120 
// start with a fresh per-bag OOB index array
121  SG_UNREF(m_oob_indices);
122  m_oob_indices = new CDynamicObjectArray();
123 
124  /*
125  TODO: enable multi-threaded learning. This requires views support
126  on CFeatures
127  #pragma omp parallel for num_threads(parallel->get_num_threads())
128  */
129  for (int32_t i = 0; i < m_num_bags; ++i)
130  {
// each bag gets its own clone of the template machine
131  CMachine* c = dynamic_cast<CMachine*>(m_machine->clone());
132  ASSERT(c != NULL);
// presumably fills idx with random indices in [0, num_vectors-1] - confirm
// the semantics of SGVector::random
133  SGVector<index_t> idx(m_bag_size);
134  idx.random(0, m_features->get_num_vectors()-1);
135  m_labels->add_subset(idx);
136  /* TODO:
137  if it's a binary labeling ensure that
138  there's always samples of both classes
139  if ((m_labels->get_label_type() == LT_BINARY))
140  {
141  while (true) {
142  if (!m_labels->ensure_valid()) {
143  m_labels->remove_subset();
144  idx.random(0, m_features->get_num_vectors());
145  m_labels->add_subset(idx);
146  continue;
147  }
148  break;
149  }
150  }
151  */
// restrict the features to the same indices, train the clone, then lift the
// feature subset again
152  m_features->add_subset(idx);
153  c->set_labels(m_labels);
154  c->train(m_features);
155  m_features->remove_subset();
// NOTE(review): listing line 156 is absent here; no matching removal of the
// label subset added above is visible - confirm against the repository.
157 
158  // get out of bag indexes
159  CDynamicArray<index_t>* oob = get_oob_indices(idx);
160  m_oob_indices->push_back(oob);
161 
162  // add trained machine to bag array
163  m_bags->append_element(c);
164  }
165 
166  return true;
167 }
168 
// Registers the member variables with SG_ADD, giving each a name, a
// description and a model-selection flag.
// NOTE(review): listing lines 172 and 179 are absent, so the SG_ADD calls for
// "features" and "all_oob_idx" appear truncated here.
169 void CBaggingMachine::register_parameters()
170 {
171  SG_ADD((CSGObject**)&m_features, "features", "Train features for bagging",
173  SG_ADD(&m_num_bags, "num_bags", "Number of bags", MS_AVAILABLE);
174  SG_ADD(&m_bag_size, "bag_size", "Number of vectors per bag", MS_AVAILABLE);
175  SG_ADD((CSGObject**)&m_bags, "bags", "Bags array", MS_NOT_AVAILABLE);
176  SG_ADD((CSGObject**)&m_combination_rule, "combination_rule",
177  "Combination rule to use for aggregating", MS_AVAILABLE);
178  SG_ADD(&m_all_oob_idx, "all_oob_idx", "Indices of all oob vectors",
180  SG_ADD((CSGObject**)&m_oob_indices, "oob_indices",
181  "OOB indices for each machine", MS_NOT_AVAILABLE);
182 }
183 
// Stores the number of bags to train. The value is not validated here; the
// training routine requires m_num_bags > 0.
184 void CBaggingMachine::set_num_bags(int32_t num_bags)
185 {
186  m_num_bags = num_bags;
187 }
188 
// Signature line absent from this listing; presumably the getter paired with
// set_num_bags(). Returns the stored m_num_bags.
190 {
191  return m_num_bags;
192 }
193 
// Stores the number of vectors per bag. The value is not validated here; the
// training routine requires m_bag_size > 0 and smaller than the number of
// feature vectors.
194 void CBaggingMachine::set_bag_size(int32_t bag_size)
195 {
196  m_bag_size = bag_size;
197 }
198 
// Signature line absent from this listing; presumably the getter paired with
// set_bag_size(). Returns the stored m_bag_size.
200 {
201  return m_bag_size;
202 }
203 
// Signature line absent from this listing; presumably the machine getter.
// Takes an additional reference on m_machine (SG_REF) before returning it;
// presumably the caller is expected to release that reference - confirm.
205 {
206  SG_REF(m_machine);
207  return m_machine;
208 }
209 
// Signature line absent from this listing; presumably the machine setter with
// a parameter named `machine`. Replaces the template machine.
// The new object is referenced before the old one is released, which keeps it
// alive when `machine` is the object already stored.
211 {
212  SG_REF(machine);
213  SG_UNREF(m_machine);
214  m_machine = machine;
215 }
216 
// Puts all members into their initial state: an empty bag array is allocated,
// the object pointers are set to NULL, both counters to 0, and the OOB marker
// vector to an empty SGVector. Both constructor bodies call this first.
217 void CBaggingMachine::init()
218 {
219  m_bags = new CDynamicObjectArray();
220  m_machine = NULL;
221  m_features = NULL;
222  m_combination_rule = NULL;
223  m_labels = NULL;
224  m_num_bags = 0;
225  m_bag_size = 0;
226  m_all_oob_idx = SGVector<bool>();
// allocated later by the training routine
227  m_oob_indices = NULL;
228 }
229 
// Signature line absent from this listing; presumably the combination-rule
// setter with a parameter named `rule`. Replaces the stored rule.
// The new rule is referenced before the old one is released, which keeps it
// alive when `rule` is the object already stored.
231 {
232  SG_REF(rule);
233  SG_UNREF(m_combination_rule);
234  m_combination_rule = rule;
235 }
236 
// Signature line absent from this listing; presumably the combination-rule
// getter. Takes an additional reference on m_combination_rule (SG_REF) before
// returning it; presumably the caller is expected to release it - confirm.
238 {
239  SG_REF(m_combination_rule);
240  return m_combination_rule;
241 }
242 
// Signature line absent from this listing; presumably the out-of-bag error
// method, taking an evaluation object named `eval` (confirm).
// Applies every bag machine to its own out-of-bag vectors, combines the
// per-bag predictions with the combination rule, wraps them in the label type
// matching m_labels, and returns eval->evaluate(predicted, m_labels).
// NOTE(review): listing lines 249, 269, 309 and 311 are absent, so some
// statements of this function are not visible.
244 {
245  REQUIRE(m_combination_rule != NULL, "Combination rule is not set!");
246  REQUIRE(m_bags->get_num_elements() > 0, "BaggingMachine is not trained!");
247 
// rows = training vectors, columns = bags
248  SGMatrix<float64_t> output(m_features->get_num_vectors(), m_bags->get_num_elements());
// NOTE(review): the condition selecting between zero() and NAN fill (listing
// line 249) is absent from this listing.
250  output.zero();
251  else
252  output.set_const(NAN);
253 
254  /* TODO: add parallel support of applying the OOBs
255  only possible when add_subset is thread-safe
256  #pragma omp parallel for num_threads(parallel->get_num_threads())
257  */
258  for (index_t i = 0; i < m_bags->get_num_elements(); i++)
259  {
260  CMachine* m = dynamic_cast<CMachine*>(m_bags->get_element(i));
261  CDynamicArray<index_t>* current_oob
262  = dynamic_cast<CDynamicArray<index_t>*>(m_oob_indices->get_element(i));
263 
// wrap the stored OOB indices of bag i in an SGVector; the trailing `false`
// presumably disables reference counting / ownership of the buffer - confirm
264  SGVector<index_t> oob(current_oob->get_array(), current_oob->get_num_elements(), false);
// NOTE(review): looks like leftover debug output - confirm it is intended
265  oob.display_vector();
266  m_features->add_subset(oob);
267 
268  CLabels* l = m->apply(m_features);
// NOTE(review): the declaration of `lv` (listing line 269) is absent from this
// listing; presumably it holds the values of `l` - confirm.
270 
271  // write the j-th prediction into the row of its out-of-bag vector, column i
272  for (index_t j = 0; j < oob.vlen; j++)
273  output(oob[j], i) = lv[j];
274 
275  m_features->remove_subset();
276  SG_UNREF(current_oob);
277  SG_UNREF(m);
278  SG_UNREF(l);
279  }
// NOTE(review): looks like leftover debug output - confirm it is intended
280  output.display_matrix();
281 
// collect the indices of all vectors that were out-of-bag for at least one bag
// NOTE(review): no use of `idx` is visible in the remainder of this listing.
282  DynArray<index_t> idx;
283  for (index_t i = 0; i < m_features->get_num_vectors(); i++)
284  {
285  if (m_all_oob_idx[i])
286  idx.push_back(i);
287  }
288 
289  SGVector<float64_t> combined = m_combination_rule->combine(output);
// wrap the combined predictions in the label class matching the training labels
290  CLabels* predicted = NULL;
291  switch (m_labels->get_label_type())
292  {
293  case LT_BINARY:
294  predicted = new CBinaryLabels(combined);
295  break;
296 
297  case LT_MULTICLASS:
298  predicted = new CMulticlassLabels(combined);
299  break;
300 
301  case LT_REGRESSION:
302  predicted = new CRegressionLabels(combined);
303  break;
304 
305  default:
306  SG_ERROR("Unsupported label type\n");
307  }
308 
310  float64_t res = eval->evaluate(predicted, m_labels);
312 
313  return res;
314 }
315 
// Computes the out-of-bag indices for one bag.
// in_bag: indices of the vectors drawn for the bag (duplicates are tolerated).
// Returns an array with every index in [0, num_vectors) that does not occur
// in in_bag. As a side effect, each returned index is also flagged in
// m_all_oob_idx.
316 CDynamicArray<index_t>* CBaggingMachine::get_oob_indices(const SGVector<index_t>& in_bag)
317 {
// start with every vector marked as out-of-bag
318  SGVector<bool> out_of_bag(m_features->get_num_vectors());
319  out_of_bag.set_const(true);
320 
321  // mark the ones that are in_bag
// oob_count is decremented once per distinct in-bag index
322  index_t oob_count = m_features->get_num_vectors();
323  for (index_t i = 0; i < in_bag.vlen; i++)
324  {
325  if (out_of_bag[in_bag[i]])
326  {
327  out_of_bag[in_bag[i]] = false;
328  oob_count--;
329  }
330  }
331 
// NOTE(review): the declaration of `oob` (listing line 332) is absent from
// this listing, and no use of oob_count is visible below.
333  // store the indices of vectors that are out of the bag
334  for (index_t i = 0; i < out_of_bag.vlen; i++)
335  {
336  if (out_of_bag[i])
337  {
338  oob->push_back(i);
339  m_all_oob_idx[i] = true;
340  }
341  }
342 
343  return oob;
344 }
345 

SHOGUN Machine Learning Toolbox - Documentation