SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
RandomCARTree.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2014 Parijat Mazumdar
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  */
30 
33 
34 using namespace shogun;
35 
37 : CCARTree()
38 {
39  init();
40 }
41 
43 {
44 }
45 
47 {
48  REQUIRE(size>0, "Subset size should be greater than 0. %d supplied!\n",size)
49  m_randsubset_size=size;
50 }
51 
53  SGVector<float64_t> left, SGVector<float64_t> right, SGVector<bool> is_left_final, int32_t &num_missing_final, int32_t &count_left,
54  int32_t &count_right)
55 {
56  int32_t num_vecs=mat.num_cols;
57  int32_t num_feats=mat.num_rows;
58 
59  int32_t n_ulabels;
60  SGVector<float64_t> ulabels=get_unique_labels(labels_vec,n_ulabels);
61 
62  // if all labels same early stop
63  if (n_ulabels==1)
64  return -1;
65 
66  SGVector<float64_t> total_wclasses(n_ulabels);
67  total_wclasses.zero();
68 
69  SGVector<int32_t> simple_labels(num_vecs);
70  float64_t delta=0;
71  if (m_mode==PT_REGRESSION)
72  delta=m_label_epsilon;
73 
74  for (int32_t i=0;i<num_vecs;i++)
75  {
76  for (int32_t j=0;j<n_ulabels;j++)
77  {
78  if (CMath::abs(labels_vec[i]-ulabels[j])<=delta)
79  {
80  simple_labels[i]=j;
81  total_wclasses[j]+=weights[i];
82  break;
83  }
84 
85  }
86  }
87 
88  REQUIRE(m_randsubset_size<=num_feats, "The Feature subset size(set %d) should be less than"
89  " or equal to the total number of features(%d here)\n",m_randsubset_size,num_feats)
90 
91  // if subset size is not set choose sqrt(num_feats) by default
92  if (m_randsubset_size==0)
93  m_randsubset_size=CMath::sqrt(num_feats-0.f);
94 
95  // randomly choose w/o replacement the attributes from which best will be chosen
96  // randomly permute and choose 1st randsubset_size elements
97  SGVector<index_t> idx(num_feats);
98  idx.range_fill();
99  CMath::permute(idx);
100 
101  float64_t max_gain=MIN_SPLIT_GAIN;
102  int32_t best_attribute=-1;
103  float64_t best_threshold=0;
104  for (int32_t i=0;i<m_randsubset_size;i++)
105  {
106  SGVector<float64_t> feats(num_vecs);
107  for (int32_t j=0;j<num_vecs;j++)
108  feats[j]=mat(idx[i],j);
109 
110  // O(N*logN)
111  SGVector<index_t> sorted_args=CMath::argsort(feats);
112 
113  // number of non-missing vecs
114  int32_t n_nm_vecs=feats.vlen;
115  while (feats[sorted_args[n_nm_vecs-1]]==MISSING)
116  {
117  total_wclasses[simple_labels[sorted_args[n_nm_vecs-1]]]-=weights[sorted_args[n_nm_vecs-1]];
118  n_nm_vecs--;
119  }
120 
121  // if only one unique value - it cannot be used to split
122  if (feats[sorted_args[n_nm_vecs-1]]<=feats[sorted_args[0]]+EQ_DELTA)
123  continue;
124 
125  if (m_nominal[idx[i]])
126  {
127  SGVector<int32_t> simple_feats(num_vecs);
128  simple_feats.fill_vector(simple_feats.vector,simple_feats.vlen,-1);
129 
130  // convert to simple values
131  simple_feats[sorted_args[0]]=0;
132  int32_t c=0;
133  for (int32_t j=1;j<n_nm_vecs;j++)
134  {
135  if (feats[sorted_args[j]]==feats[sorted_args[j-1]])
136  simple_feats[sorted_args[j]]=c;
137  else
138  simple_feats[sorted_args[j]]=(++c);
139  }
140 
141  SGVector<float64_t> ufeats(c+1);
142  ufeats[0]=feats[sorted_args[0]];
143  int32_t u=0;
144  for (int32_t j=1;j<n_nm_vecs;j++)
145  {
146  if (feats[sorted_args[j]]==feats[sorted_args[j-1]])
147  continue;
148  else
149  ufeats[++u]=feats[sorted_args[j]];
150  }
151 
152  // test all 2^(I-1)-1 possible division between two nodes
153  int32_t num_cases=CMath::pow(2,c);
154  for (int32_t k=1;k<num_cases;k++)
155  {
156  SGVector<float64_t> wleft(n_ulabels);
157  SGVector<float64_t> wright(n_ulabels);
158  wleft.zero();
159  wright.zero();
160 
161  // stores which vectors are assigned to left child
162  SGVector<bool> is_left(num_vecs);
163  is_left.fill_vector(is_left.vector,is_left.vlen,false);
164 
165  // stores which among the categorical values of chosen attribute are assigned left child
166  SGVector<bool> feats_left(c+1);
167 
168  // fill feats_left in a unique way corresponding to the case
169  for (int32_t p=0;p<c+1;p++)
170  feats_left[p]=((k/CMath::pow(2,p))%(CMath::pow(2,p+1))==1);
171 
172  // form is_left
173  for (int32_t j=0;j<n_nm_vecs;j++)
174  {
175  is_left[sorted_args[j]]=feats_left[simple_feats[sorted_args[j]]];
176  if (is_left[sorted_args[j]])
177  wleft[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
178  else
179  wright[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
180  }
181 
182  float64_t g=0;
183  if (m_mode==PT_MULTICLASS)
184  g=gain(wleft,wright,total_wclasses);
185  else if (m_mode==PT_REGRESSION)
186  g=gain(wleft,wright,total_wclasses,ulabels);
187  else
188  SG_ERROR("Undefined problem statement\n");
189 
190  if (g>max_gain)
191  {
192  best_attribute=idx[i];
193  max_gain=g;
194  memcpy(is_left_final.vector,is_left.vector,is_left.vlen*sizeof(bool));
195  num_missing_final=num_vecs-n_nm_vecs;
196 
197  count_left=0;
198  for (int32_t l=0;l<c+1;l++)
199  count_left=(feats_left[l])?count_left+1:count_left;
200 
201  count_right=c+1-count_left;
202 
203  int32_t l=0;
204  int32_t r=0;
205  for (int32_t w=0;w<c+1;w++)
206  {
207  if (feats_left[w])
208  left[l++]=ufeats[w];
209  else
210  right[r++]=ufeats[w];
211  }
212  }
213  }
214  }
215  else
216  {
217  // O(N)
218  SGVector<float64_t> right_wclasses=total_wclasses.clone();
219  SGVector<float64_t> left_wclasses(n_ulabels);
220  left_wclasses.zero();
221 
222  // O(N)
223  // find best split for non-nominal attribute - choose threshold (z)
224  float64_t z=feats[sorted_args[0]];
225  right_wclasses[simple_labels[sorted_args[0]]]-=weights[sorted_args[0]];
226  left_wclasses[simple_labels[sorted_args[0]]]+=weights[sorted_args[0]];
227  for (int32_t j=1;j<n_nm_vecs;j++)
228  {
229  if (feats[sorted_args[j]]<=z+EQ_DELTA)
230  {
231  right_wclasses[simple_labels[sorted_args[j]]]-=weights[sorted_args[j]];
232  left_wclasses[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
233  continue;
234  }
235 
236  // O(F)
237  float64_t g=0;
238  if (m_mode==PT_MULTICLASS)
239  g=gain(left_wclasses,right_wclasses,total_wclasses);
240  else if (m_mode==PT_REGRESSION)
241  g=gain(left_wclasses,right_wclasses,total_wclasses,ulabels);
242  else
243  SG_ERROR("Undefined problem statement\n");
244 
245  if (g>max_gain)
246  {
247  max_gain=g;
248  best_attribute=idx[i];
249  best_threshold=z;
250  num_missing_final=num_vecs-n_nm_vecs;
251  }
252 
253  z=feats[sorted_args[j]];
254  if (feats[sorted_args[n_nm_vecs-1]]<=z+EQ_DELTA)
255  break;
256 
257  right_wclasses[simple_labels[sorted_args[j]]]-=weights[sorted_args[j]];
258  left_wclasses[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
259  }
260  }
261 
262  // restore total_wclasses
263  while (n_nm_vecs<feats.vlen)
264  {
265  total_wclasses[simple_labels[sorted_args[n_nm_vecs-1]]]+=weights[sorted_args[n_nm_vecs-1]];
266  n_nm_vecs++;
267  }
268  }
269 
270  if (best_attribute==-1)
271  return -1;
272 
273  if (!m_nominal[best_attribute])
274  {
275  left[0]=best_threshold;
276  right[0]=best_threshold;
277  count_left=1;
278  count_right=1;
279  for (int32_t i=0;i<num_vecs;i++)
280  is_left_final[i]=(mat(best_attribute,i)<=best_threshold);
281  }
282 
283  return best_attribute;
284 }
285 
286 void CRandomCARTree::init()
287 {
288  m_randsubset_size=0;
289 
290  SG_ADD(&m_randsubset_size,"m_randsubset_size", "random features subset size", MS_NOT_AVAILABLE);
291 }
void set_feature_subset_size(int32_t size)
void range_fill(T start=0)
Definition: SGVector.cpp:173
static void permute(SGVector< T > v, CRandom *rand=NULL)
Definition: Math.h:1144
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:223
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
index_t num_cols
Definition: SGMatrix.h:378
static const float64_t EQ_DELTA
Definition: CARTree.h:415
float64_t m_label_epsilon
Definition: CARTree.h:419
index_t num_rows
Definition: SGMatrix.h:376
index_t vlen
Definition: SGVector.h:494
float64_t gain(SGVector< float64_t > wleft, SGVector< float64_t > wright, SGVector< float64_t > wtotal, SGVector< float64_t > labels)
Definition: CARTree.cpp:918
double float64_t
Definition: common.h:50
static SGVector< index_t > argsort(SGVector< T > vector)
Definition: Math.h:1599
This class implements the Classification And Regression Trees algorithm by Breiman et al for decision...
Definition: CARTree.h:79
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
SGVector< bool > m_nominal
Definition: CARTree.h:422
static const float64_t MIN_SPLIT_GAIN
Definition: CARTree.h:412
virtual int32_t compute_best_attribute(SGMatrix< float64_t > mat, SGVector< float64_t > weights, SGVector< float64_t > labels_vec, SGVector< float64_t > left, SGVector< float64_t > right, SGVector< bool > is_left_final, int32_t &num_missing, int32_t &count_left, int32_t &count_right)
SGVector< T > clone() const
Definition: SGVector.cpp:209
#define SG_ADD(...)
Definition: SGObject.h:81
static float32_t sqrt(float32_t x)
Definition: Math.h:459
#define delta
Definition: sfa.cpp:23
static int32_t pow(bool x, int32_t n)
Definition: Math.h:535
SGVector< float64_t > get_unique_labels(SGVector< float64_t > labels_vec, int32_t &n_ulabels)
Definition: CARTree.cpp:462
static T abs(T a)
Definition: Math.h:179
static const float64_t MISSING
Definition: CARTree.h:409
EProblemType m_mode
Definition: CARTree.h:440

SHOGUN Machine Learning Toolbox - Documentation