SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
BaggingMachine.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Viktor Gal
8  * Copyright (C) 2013 Viktor Gal
9  */
10 
14 
15 using namespace shogun;
16 
18  : CMachine()
19 {
20  init();
21  register_parameters();
22 }
23 
25  : CMachine()
26 {
27  init();
28  register_parameters();
29 
30  set_labels(labels);
31 
32  SG_REF(features);
33  m_features = features;
34 }
35 
37 {
38  SG_UNREF(m_machine);
39  SG_UNREF(m_features);
40  SG_UNREF(m_combination_rule);
41  SG_UNREF(m_bags);
42  SG_UNREF(m_oob_indices);
43 }
44 
46 {
47  SGVector<float64_t> combined_vector = apply_get_outputs(data);
48 
49  CBinaryLabels* pred = new CBinaryLabels(combined_vector);
50  return pred;
51 }
52 
54 {
55  SGVector<float64_t> combined_vector = apply_get_outputs(data);
56 
57  CMulticlassLabels* pred = new CMulticlassLabels(combined_vector);
58  return pred;
59 }
60 
62 {
63  SGVector<float64_t> combined_vector = apply_get_outputs(data);
64 
65  CRegressionLabels* pred = new CRegressionLabels(combined_vector);
66 
67  return pred;
68 }
69 
71 {
72  ASSERT(data != NULL);
73  REQUIRE(m_combination_rule != NULL, "Combination rule is not set!");
74  ASSERT(m_num_bags == m_bags->get_num_elements());
75 
76  SGMatrix<float64_t> output(data->get_num_vectors(), m_num_bags);
77  output.zero();
78 
79  #pragma omp parallel for num_threads(parallel->get_num_threads())
80  for (int32_t i = 0; i < m_num_bags; ++i)
81  {
82  CMachine* m = dynamic_cast<CMachine*>(m_bags->get_element(i));
83  CLabels* l = m->apply(data);
85  float64_t* bag_results = output.get_column_vector(i);
86  memcpy(bag_results, lv.vector, lv.vlen*sizeof(float64_t));
87 
88  SG_UNREF(l);
89  SG_UNREF(m);
90  }
91 
92  SGVector<float64_t> combined = m_combination_rule->combine(output);
93 
94  return combined;
95 }
96 
98 {
99  REQUIRE(m_machine != NULL, "Machine is not set!");
100  REQUIRE(m_bag_size > 0, "Bag size is not set!");
101  REQUIRE(m_num_bags > 0, "Number of bag is not set!");
102 
103  if (data)
104  {
105  SG_REF(data);
106  SG_UNREF(m_features);
107  m_features = data;
108 
109  ASSERT(m_features->get_num_vectors() == m_labels->get_num_labels());
110  }
111 
112  // the bag size must be strictly smaller than the number of feature vectors
113  ASSERT(m_bag_size < m_features->get_num_vectors());
114 
115  // clear the array, if previously trained
116  m_bags->reset_array();
117 
118  // reset the oob index vector
119  m_all_oob_idx = SGVector<bool>(m_features->get_num_vectors());
120  m_all_oob_idx.zero();
121 
122  SG_UNREF(m_oob_indices);
123  m_oob_indices = new CDynamicObjectArray();
124 
125  /*
126  TODO: enable multi-threaded learning. This requires views support
127  on CFeatures
128  #pragma omp parallel for num_threads(parallel->get_num_threads())
129  */
130  for (int32_t i = 0; i < m_num_bags; ++i)
131  {
132  CMachine* c = dynamic_cast<CMachine*>(m_machine->clone());
133  ASSERT(c != NULL);
134  SGVector<index_t> idx(m_bag_size);
135  idx.random(0, m_features->get_num_vectors()-1);
136  m_labels->add_subset(idx);
137  /* TODO:
138  if it's a binary labeling ensure that
139  there's always samples of both classes
140  if ((m_labels->get_label_type() == LT_BINARY))
141  {
142  while (true) {
143  if (!m_labels->ensure_valid()) {
144  m_labels->remove_subset();
145  idx.random(0, m_features->get_num_vectors());
146  m_labels->add_subset(idx);
147  continue;
148  }
149  break;
150  }
151  }
152  */
153  m_features->add_subset(idx);
154  c->set_labels(m_labels);
155  c->train(m_features);
156  m_features->remove_subset();
158 
159  // get out of bag indexes
160  CDynamicArray<index_t>* oob = get_oob_indices(idx);
161  m_oob_indices->push_back(oob);
162 
163  // add trained machine to bag array
164  m_bags->append_element(c);
165  }
166 
167  return true;
168 }
169 
170 void CBaggingMachine::register_parameters()
171 {
172  SG_ADD((CSGObject**)&m_features, "features", "Train features for bagging",
174  SG_ADD(&m_num_bags, "num_bags", "Number of bags", MS_AVAILABLE);
175  SG_ADD(&m_bag_size, "bag_size", "Number of vectors per bag", MS_AVAILABLE);
176  SG_ADD((CSGObject**)&m_bags, "bags", "Bags array", MS_NOT_AVAILABLE);
177  SG_ADD((CSGObject**)&m_combination_rule, "combination_rule",
178  "Combination rule to use for aggregating", MS_AVAILABLE);
179  SG_ADD(&m_all_oob_idx, "all_oob_idx", "Indices of all oob vectors",
181  SG_ADD((CSGObject**)&m_oob_indices, "oob_indices",
182  "OOB indices for each machine", MS_NOT_AVAILABLE);
183 }
184 
185 void CBaggingMachine::set_num_bags(int32_t num_bags)
186 {
187  m_num_bags = num_bags;
188 }
189 
191 {
192  return m_num_bags;
193 }
194 
195 void CBaggingMachine::set_bag_size(int32_t bag_size)
196 {
197  m_bag_size = bag_size;
198 }
199 
201 {
202  return m_bag_size;
203 }
204 
206 {
207  SG_REF(m_machine);
208  return m_machine;
209 }
210 
212 {
213  SG_REF(machine);
214  SG_UNREF(m_machine);
215  m_machine = machine;
216 }
217 
218 void CBaggingMachine::init()
219 {
220  m_bags = new CDynamicObjectArray();
221  m_machine = NULL;
222  m_features = NULL;
223  m_combination_rule = NULL;
224  m_labels = NULL;
225  m_num_bags = 0;
226  m_bag_size = 0;
227  m_all_oob_idx = SGVector<bool>();
228  m_oob_indices = NULL;
229 }
230 
232 {
233  SG_REF(rule);
234  SG_UNREF(m_combination_rule);
235  m_combination_rule = rule;
236 }
237 
239 {
240  SG_REF(m_combination_rule);
241  return m_combination_rule;
242 }
243 
245 {
246  REQUIRE(m_combination_rule != NULL, "Combination rule is not set!");
247  REQUIRE(m_bags->get_num_elements() > 0, "BaggingMachine is not trained!");
248 
249  SGMatrix<float64_t> output(m_features->get_num_vectors(), m_bags->get_num_elements());
251  output.zero();
252  else
253  output.set_const(NAN);
254 
255  /* TODO: add parallel support of applying the OOBs
256  only possible when add_subset is thread-safe
257  #pragma omp parallel for num_threads(parallel->get_num_threads())
258  */
259  for (index_t i = 0; i < m_bags->get_num_elements(); i++)
260  {
261  CMachine* m = dynamic_cast<CMachine*>(m_bags->get_element(i));
262  CDynamicArray<index_t>* current_oob
263  = dynamic_cast<CDynamicArray<index_t>*>(m_oob_indices->get_element(i));
264 
265  SGVector<index_t> oob(current_oob->get_array(), current_oob->get_num_elements(), false);
266  oob.display_vector();
267  m_features->add_subset(oob);
268 
269  CLabels* l = m->apply(m_features);
271 
272  // store each out-of-bag prediction at (vector index, bag index); in-bag entries keep their initial fill value
273  for (index_t j = 0; j < oob.vlen; j++)
274  output(oob[j], i) = lv[j];
275 
276  m_features->remove_subset();
277  SG_UNREF(current_oob);
278  SG_UNREF(m);
279  SG_UNREF(l);
280  }
281  output.display_matrix();
282 
283  DynArray<index_t> idx;
284  for (index_t i = 0; i < m_features->get_num_vectors(); i++)
285  {
286  if (m_all_oob_idx[i])
287  idx.push_back(i);
288  }
289 
290  SGVector<float64_t> combined = m_combination_rule->combine(output);
291  CLabels* predicted = NULL;
292  switch (m_labels->get_label_type())
293  {
294  case LT_BINARY:
295  predicted = new CBinaryLabels(combined);
296  break;
297 
298  case LT_MULTICLASS:
299  predicted = new CMulticlassLabels(combined);
300  break;
301 
302  case LT_REGRESSION:
303  predicted = new CRegressionLabels(combined);
304  break;
305 
306  default:
307  SG_ERROR("Unsupported label type\n");
308  }
309 
311  float64_t res = eval->evaluate(predicted, m_labels);
313 
314  return res;
315 }
316 
317 CDynamicArray<index_t>* CBaggingMachine::get_oob_indices(const SGVector<index_t>& in_bag)
318 {
319  SGVector<bool> out_of_bag(m_features->get_num_vectors());
320  out_of_bag.set_const(true);
321 
322  // mark the ones that are in_bag
323  index_t oob_count = m_features->get_num_vectors();
324  for (index_t i = 0; i < in_bag.vlen; i++)
325  {
326  if (out_of_bag[in_bag[i]])
327  {
328  out_of_bag[in_bag[i]] = false;
329  oob_count--;
330  }
331  }
332 
334  // store the indicies of vectors that are out of the bag
335  for (index_t i = 0; i < out_of_bag.vlen; i++)
336  {
337  if (out_of_bag[i])
338  {
339  oob->push_back(i);
340  m_all_oob_idx[i] = true;
341  }
342  }
343 
344  return oob;
345 }
346 

SHOGUN Machine Learning Toolbox - Documentation