SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ECOCDiscriminantEncoder.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2012 Chiyuan Zhang
8  * Copyright (C) 2012 Chiyuan Zhang
9  */
10 
11 #include <algorithm>
12 
17 
18 using namespace std;
19 using namespace shogun;
20 
21 CECOCDiscriminantEncoder::CECOCDiscriminantEncoder()
22 {
23  init();
24 }
25 
26 CECOCDiscriminantEncoder::~CECOCDiscriminantEncoder()
27 {
28  SG_UNREF(m_features);
29  SG_UNREF(m_labels);
30 }
31 
32 void CECOCDiscriminantEncoder::init()
33 {
34  // default parameters
35  m_iterations = 25;
36  m_num_trees = 1;
37 
38  // init values
39  m_features = NULL;
40  m_labels = NULL;
41 
42  // parameters
43 
44  SG_ADD(&m_iterations, "iterations", "number of iterations in SFFS", MS_NOT_AVAILABLE);
45 }
46 
47 void CECOCDiscriminantEncoder::set_features(CDenseFeatures<float64_t> *features)
48 {
49  SG_REF(features);
50  SG_UNREF(m_features);
51  m_features = features;
52 }
53 
54 void CECOCDiscriminantEncoder::set_labels(CLabels *labels)
55 {
56  SG_REF(labels);
57  SG_UNREF(m_labels);
58  m_labels = labels;
59 }
60 
61 SGMatrix<int32_t> CECOCDiscriminantEncoder::create_codebook(int32_t num_classes)
62 {
63  if (!m_features || !m_labels)
64  SG_ERROR("Need features and labels to learn the codebook");
65 
66  m_feats = m_features->get_feature_matrix();
67  m_codebook = SGMatrix<int32_t>(m_num_trees * (num_classes-1), num_classes);
68  m_codebook.zero();
69  m_code_idx = 0;
70 
71  for (int32_t itree = 0; itree < m_num_trees; ++itree)
72  {
73  vector<int32_t> classes(num_classes);
74  for (int32_t i=0; i < num_classes; ++i)
75  classes[i] = i;
76 
77  binary_partition(classes);
78  }
79 
80  m_feats = SGMatrix<float64_t>(); // release memory
81  return m_codebook;
82 }
83 
84 void CECOCDiscriminantEncoder::binary_partition(const vector<int32_t>& classes)
85 {
86  if (classes.size() > 2)
87  {
88  int32_t isplit = classes.size()/2;
89  vector<int32_t> part1(classes.begin(), classes.begin()+isplit);
90  vector<int32_t> part2(classes.begin()+isplit, classes.end());
91  run_sffs(part1, part2);
92  for (size_t i=0; i < part1.size(); ++i)
93  m_codebook(m_code_idx, part1[i]) = +1;
94  for (size_t i=0; i < part2.size(); ++i)
95  m_codebook(m_code_idx, part2[i]) = -1;
96  m_code_idx++;
97 
98  if (part1.size() > 1)
99  binary_partition(part1);
100  if (part2.size() > 1)
101  binary_partition(part2);
102  }
103  else // only two classes
104  {
105  m_codebook(m_code_idx, classes[0]) = +1;
106  m_codebook(m_code_idx, classes[1]) = -1;
107  m_code_idx++;
108  }
109 }
110 
111 void CECOCDiscriminantEncoder::run_sffs(vector<int32_t>& part1, vector<int32_t>& part2)
112 {
113  set<int32_t> idata1;
114  set<int32_t> idata2;
115 
116  for (int32_t i=0; i < m_labels->get_num_labels(); ++i)
117  {
118  if (find(part1.begin(), part1.end(), ((CMulticlassLabels*) m_labels)->get_int_label(i)) != part1.end())
119  idata1.insert(i);
120  else if (find(part2.begin(), part2.end(), ((CMulticlassLabels*) m_labels)->get_int_label(i)) != part2.end())
121  idata2.insert(i);
122  }
123 
124  float64_t MI = compute_MI(idata1, idata2);
125  for (int32_t i=0; i < m_iterations; ++i)
126  {
127  if (i % 2 == 0)
128  MI = sffs_iteration(MI, part1, idata1, part2, idata2);
129  else
130  MI = sffs_iteration(MI, part2, idata2, part1, idata1);
131  }
132 }
133 
134 float64_t CECOCDiscriminantEncoder::sffs_iteration(float64_t MI, vector<int32_t>& part1, set<int32_t>& idata1,
135  vector<int32_t>& part2, set<int32_t>& idata2)
136 {
137  if (part1.size() <= 1)
138  return MI;
139 
140  int32_t iclas = CMath::random(0, int32_t(part1.size()-1));
141  int32_t clas = part1[iclas];
142 
143  // move clas from part1 to part2
144  for (int32_t i=0; i < m_labels->get_num_labels(); ++i)
145  {
146  if (((CMulticlassLabels*) m_labels)->get_int_label(i) == clas)
147  {
148  idata1.erase(i);
149  idata2.insert(i);
150  }
151  }
152 
153  float64_t new_MI = compute_MI(idata1, idata2);
154  if (new_MI < MI)
155  {
156  part2.push_back(clas);
157  part1.erase(part1.begin() + iclas);
158  return new_MI;
159  }
160  else
161  {
162  // revert changes
163  for (int32_t i=0; i < m_labels->get_num_labels(); ++i)
164  {
165  if (((CMulticlassLabels*) m_labels)->get_int_label(i) == clas)
166  {
167  idata2.erase(i);
168  idata1.insert(i);
169  }
170  }
171  return MI;
172  }
173 
174 }
175 
176 float64_t CECOCDiscriminantEncoder::compute_MI(const set<int32_t>& idata1, const set<int32_t>& idata2)
177 {
178  float64_t MI = 0;
179 
180  int32_t hist1[10];
181  int32_t hist2[10];
182 
183  for (int32_t i=0; i < m_feats.num_rows; ++i)
184  {
185  float64_t max_val = m_feats(i, 0);
186  float64_t min_val = m_feats(i, 0);
187  for (int32_t j=1; j < m_feats.num_cols; ++j)
188  {
189  max_val = max(max_val, m_feats(i, j));
190  min_val = min(min_val, m_feats(i, j));
191  }
192 
193  if (max_val - min_val < 1e-10)
194  max_val = min_val + 1; // avoid divide by zero error
195 
196  compute_hist(i, max_val, min_val, idata1, hist1);
197  compute_hist(i, max_val, min_val, idata2, hist2);
198 
199  float64_t MI_i = 0;
200  for (int j=0; j < 10; ++j)
201  MI_i += (hist1[j]-hist2[j])*(hist1[j]-hist2[j]);
202  MI += CMath::sqrt(MI_i);
203  }
204 
205  return MI;
206 }
207 
208 void CECOCDiscriminantEncoder::compute_hist(int32_t i, float64_t max_val, float64_t min_val,
209  const set<int32_t>& idata, int32_t *hist)
210 {
211  // hist of 0:0.1:1
212  fill(hist, hist+10, 0);
213 
214  for (set<int32_t>::const_iterator it = idata.begin(); it != idata.end(); ++it)
215  {
216  float64_t val = (m_feats(i, *it) - min_val) / (max_val - min_val);
217  int32_t pos = min(9, static_cast<int32_t>(val*10));
218  hist[pos]++;
219  }
220 }

SHOGUN Machine Learning Toolbox - Documentation