SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
CARTree.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2014 Parijat Mazumdar
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  */
30 
35 
36 using namespace Eigen;
37 using namespace shogun;
38 
// Sentinel encoding a missing attribute value in the feature matrix
// (compared against in CARTtrain and surrogate_split).
39 const float64_t CCARTree::MISSING=CMath::MAX_REAL_NUMBER;
// Tolerance under which two feature values are treated as equal when
// scanning candidate split thresholds.
40 const float64_t CCARTree::EQ_DELTA=1e-7;
// Minimum impurity gain a candidate split must exceed to be accepted
// (used to initialize max_gain in compute_best_attribute).
41 const float64_t CCARTree::MIN_SPLIT_GAIN=1e-7;
42 
43 CCARTree::CCARTree()
45 {
46  init();
47 }
48 
49 CCARTree::CCARTree(SGVector<bool> attribute_types, EProblemType prob_type)
51 {
52  init();
53  set_feature_types(attribute_types);
54  set_machine_problem_type(prob_type);
55 }
56 
57 CCARTree::CCARTree(SGVector<bool> attribute_types, EProblemType prob_type, int32_t num_folds, bool cv_prune)
59 {
60  init();
61  set_feature_types(attribute_types);
62  set_machine_problem_type(prob_type);
63  set_num_folds(num_folds);
64  if (cv_prune)
66 }
67 
69 {
71 }
72 
74 {
75  if (lab->get_label_type()==LT_MULTICLASS)
77  else if (lab->get_label_type()==LT_REGRESSION)
79  else
80  SG_ERROR("label type supplied is not supported\n")
81 
82  SG_REF(lab);
84  m_labels=lab;
85 }
86 
88 {
89  m_mode=mode;
90 }
91 
93 {
95  return true;
96  else if (m_mode==PT_REGRESSION && lab->get_label_type()==LT_REGRESSION)
97  return true;
98  else
99  return false;
100 }
101 
103 {
104  REQUIRE(data, "Data required for classification in apply_multiclass\n")
105 
106  // apply multiclass starting from root
107  bnode_t* current=dynamic_cast<bnode_t*>(get_root());
108 
109  REQUIRE(current, "Tree machine not yet trained.\n");
110  CLabels* ret=apply_from_current_node(dynamic_cast<CDenseFeatures<float64_t>*>(data), current);
111 
112  SG_UNREF(current);
113  return dynamic_cast<CMulticlassLabels*>(ret);
114 }
115 
117 {
118  REQUIRE(data, "Data required for classification in apply_multiclass\n")
119 
120  // apply regression starting from root
121  bnode_t* current=dynamic_cast<bnode_t*>(get_root());
122  CLabels* ret=apply_from_current_node(dynamic_cast<CDenseFeatures<float64_t>*>(data), current);
123 
124  SG_UNREF(current);
125  return dynamic_cast<CRegressionLabels*>(ret);
126 }
127 
129 {
130  if (weights.vlen==0)
131  {
132  weights=SGVector<float64_t>(feats->get_num_vectors());
133  weights.fill_vector(weights.vector,weights.vlen,1);
134  }
135 
136  CDynamicObjectArray* pruned_trees=prune_tree(this);
137 
138  int32_t min_index=0;
140  for (int32_t i=0;i<m_alphas->get_num_elements();i++)
141  {
142  CSGObject* element=pruned_trees->get_element(i);
143  bnode_t* root=NULL;
144  if (element!=NULL)
145  root=dynamic_cast<bnode_t*>(element);
146  else
147  SG_ERROR("%d element is NULL\n",i);
148 
149  CLabels* labels=apply_from_current_node(feats, root);
150  float64_t error=compute_error(labels,gnd_truth,weights);
151  if (error<min_error)
152  {
153  min_index=i;
154  min_error=error;
155  }
156 
157  SG_UNREF(labels);
158  SG_UNREF(element);
159  }
160 
161  CSGObject* element=pruned_trees->get_element(min_index);
162  bnode_t* root=NULL;
163  if (element!=NULL)
164  root=dynamic_cast<bnode_t*>(element);
165  else
166  SG_ERROR("%d element is NULL\n",min_index);
167 
168  this->set_root(root);
169 
170  SG_UNREF(pruned_trees);
171  SG_UNREF(element);
172 }
173 
175 {
176  m_weights=w;
177  m_weights_set=true;
178 }
179 
181 {
182  return m_weights;
183 }
184 
186 {
188  m_weights_set=false;
189 }
190 
192 {
193  m_nominal=ft;
194  m_types_set=true;
195 }
196 
198 {
199  return m_nominal;
200 }
201 
203 {
205  m_types_set=false;
206 }
207 
// Returns the number of cross-validation folds used for pruning
// (see set_num_folds()).
208 int32_t CCARTree::get_num_folds() const
209 {
210  return m_folds;
211 }
212 
// Sets the number of cross-validation folds; at least 2 folds are needed
// for a meaningful train/validation partition, hence the REQUIRE below.
213 void CCARTree::set_num_folds(int32_t folds)
214 {
215  REQUIRE(folds>1,"Number of folds is expected to be greater than 1. Supplied value is %d\n",folds)
216  m_folds=folds;
217 }
218 
// Returns the maximum allowed tree depth. A value of 0 disables the
// depth limit during training (CARTtrain only checks it when >0).
219 int32_t CCARTree::get_max_depth() const
220 {
221  return m_max_depth;
222 }
223 
// Caps tree growth: CARTtrain turns a node into a leaf once the current
// level reaches this depth. Only positive depths may be set here.
224 void CCARTree::set_max_depth(int32_t depth)
225 {
226  REQUIRE(depth>0,"Max allowed tree depth should be greater than 0. Supplied value is %d\n",depth)
227  m_max_depth=depth;
228 }
229 
231 {
232  return m_min_node_size;
233 }
234 
// Minimum number of training vectors a node must hold to be split
// further; smaller nodes become leaves (stopping rule 2 in CARTtrain,
// which only applies the check when the value is >1).
235 void CCARTree::set_min_node_size(int32_t nsize)
236 {
237  REQUIRE(nsize>0,"Min allowed node size should be greater than 0. Supplied value is %d\n",nsize)
238  m_min_node_size=nsize;
239 }
240 
242 {
243  REQUIRE(ep>=0,"Input epsilon value is expected to be greater than or equal to 0\n")
244  m_label_epsilon=ep;
245 }
246 
248 {
249  REQUIRE(data,"Data required for training\n")
250  REQUIRE(data->get_feature_class()==C_DENSE,"Dense data required for training\n")
251 
252  int32_t num_features=(dynamic_cast<CDenseFeatures<float64_t>*>(data))->get_num_features();
253  int32_t num_vectors=(dynamic_cast<CDenseFeatures<float64_t>*>(data))->get_num_vectors();
254 
255  if (m_weights_set)
256  {
257  REQUIRE(m_weights.vlen==num_vectors,"Length of weights vector (currently %d) should be same as"
258  " number of vectors in data (presently %d)",m_weights.vlen,num_vectors)
259  }
260  else
261  {
262  // all weights are equal to 1
263  m_weights=SGVector<float64_t>(num_vectors);
265  }
266 
267  if (m_types_set)
268  {
269  REQUIRE(m_nominal.vlen==num_features,"Length of m_nominal vector (currently %d) should "
270  "be same as number of features in data (presently %d)",m_nominal.vlen,num_features)
271  }
272  else
273  {
274  SG_WARNING("Feature types are not specified. All features are considered as continuous in training")
275  m_nominal=SGVector<bool>(num_features);
277  }
278 
280 
281  if (m_apply_cv_pruning)
282  {
283  CDenseFeatures<float64_t>* feats=dynamic_cast<CDenseFeatures<float64_t>*>(data);
285  }
286 
287  return true;
288 }
289 
291 {
292  m_pre_sort=true;
293  m_sorted_features=sorted_feats;
294  m_sorted_indices=sorted_indices;
295 }
296 
298 {
299  SGMatrix<float64_t> mat=(dynamic_cast<CDenseFeatures<float64_t>*>(data))->get_feature_matrix();
300  sorted_feats = SGMatrix<float64_t>(mat.num_cols, mat.num_rows);
301  sorted_indices = SGMatrix<index_t>(mat.num_cols, mat.num_rows);
302  for(int32_t i=0; i<sorted_indices.num_cols; i++)
303  for(int32_t j=0; j<sorted_indices.num_rows; j++)
304  sorted_indices(j,i)=j;
305 
306  Map<MatrixXd> map_sorted_feats(sorted_feats.matrix, mat.num_cols, mat.num_rows);
307  Map<MatrixXd> map_data(mat.matrix, mat.num_rows, mat.num_cols);
308 
309  map_sorted_feats=map_data.transpose();
310 
311  #pragma omp parallel for
312  for(int32_t i=0; i<sorted_feats.num_cols; i++)
313  CMath::qsort_index(sorted_feats.get_column_vector(i), sorted_indices.get_column_vector(i), sorted_feats.num_rows);
314 
315 }
316 
318 {
319  REQUIRE(labels,"labels have to be supplied\n");
320  REQUIRE(data,"data matrix has to be supplied\n");
321 
322  bnode_t* node=new bnode_t();
323  SGVector<float64_t> labels_vec=(dynamic_cast<CDenseLabels*>(labels))->get_labels();
324  SGMatrix<float64_t> mat=(dynamic_cast<CDenseFeatures<float64_t>*>(data))->get_feature_matrix();
325  int32_t num_feats=mat.num_rows;
326  int32_t num_vecs=mat.num_cols;
327 
328  // calculate node label
329  switch(m_mode)
330  {
331  case PT_REGRESSION:
332  {
333  float64_t sum=0;
334  for (int32_t i=0;i<labels_vec.vlen;i++)
335  sum+=labels_vec[i]*weights[i];
336 
337  // lsd*total_weight=sum_of_squared_deviation
338  float64_t tot=0;
339  node->data.weight_minus_node=tot*least_squares_deviation(labels_vec,weights,tot);
340  node->data.node_label=sum/tot;
341  node->data.total_weight=tot;
342 
343  break;
344  }
345  case PT_MULTICLASS:
346  {
347  SGVector<float64_t> lab=labels_vec.clone();
348  CMath::qsort(lab);
349  // stores max total weight for a single label
350  int32_t max=weights[0];
351  // stores one of the indices having max total weight
352  int32_t maxi=0;
353  int32_t c=weights[0];
354  for (int32_t i=1;i<lab.vlen;i++)
355  {
356  if (lab[i]==lab[i-1])
357  {
358  c+=weights[i];
359  }
360  else if (c>max)
361  {
362  max=c;
363  maxi=i-1;
364  c=weights[i];
365  }
366  else
367  {
368  c=weights[i];
369  }
370  }
371 
372  if (c>max)
373  {
374  max=c;
375  maxi=lab.vlen-1;
376  }
377 
378  node->data.node_label=lab[maxi];
379 
380  // resubstitution error calculation
381  node->data.total_weight=weights.sum(weights);
382  node->data.weight_minus_node=node->data.total_weight-max;
383  break;
384  }
385  default :
386  SG_ERROR("mode should be either PT_MULTICLASS or PT_REGRESSION\n");
387  }
388 
389  // check stopping rules
390  // case 1 : max tree depth reached if max_depth set
391  if ((m_max_depth>0) && (level==m_max_depth))
392  {
393  node->data.num_leaves=1;
394  node->data.weight_minus_branch=node->data.weight_minus_node;
395  return node;
396  }
397 
398  // case 2 : min node size violated if min_node_size specified
399  if ((m_min_node_size>1) && (labels_vec.vlen<=m_min_node_size))
400  {
401  node->data.num_leaves=1;
402  node->data.weight_minus_branch=node->data.weight_minus_node;
403  return node;
404  }
405 
406  // choose best attribute
407  // transit_into_values for left child
408  SGVector<float64_t> left(num_feats);
409  // transit_into_values for right child
410  SGVector<float64_t> right(num_feats);
411  // final data distribution among children
412  SGVector<bool> left_final(num_vecs);
413  int32_t num_missing_final=0;
414  int32_t c_left=-1;
415  int32_t c_right=-1;
416  int32_t best_attribute;
417 
418  SGVector<index_t> indices(num_vecs);
419  if (m_pre_sort)
420  {
421  CSubsetStack* subset_stack = data->get_subset_stack();
422  if (subset_stack->has_subsets())
423  indices=(subset_stack->get_last_subset())->get_subset_idx();
424  else
425  indices.range_fill();
426  SG_UNREF(subset_stack);
427  best_attribute=compute_best_attribute(m_sorted_features,weights,labels,left,right,left_final,num_missing_final,c_left,c_right,0,indices);
428  }
429  else
430  best_attribute=compute_best_attribute(mat,weights,labels,left,right,left_final,num_missing_final,c_left,c_right);
431 
432  if (best_attribute==-1)
433  {
434  node->data.num_leaves=1;
435  node->data.weight_minus_branch=node->data.weight_minus_node;
436  return node;
437  }
438 
439  SGVector<float64_t> left_transit(c_left);
440  SGVector<float64_t> right_transit(c_right);
441  memcpy(left_transit.vector,left.vector,c_left*sizeof(float64_t));
442  memcpy(right_transit.vector,right.vector,c_right*sizeof(float64_t));
443 
444  if (num_missing_final>0)
445  {
446  SGVector<bool> is_left_final(num_vecs-num_missing_final);
447  int32_t ilf=0;
448  for (int32_t i=0;i<num_vecs;i++)
449  {
450  if (mat(best_attribute,i)!=MISSING)
451  is_left_final[ilf++]=left_final[i];
452  }
453 
454  left_final=surrogate_split(mat,weights,is_left_final,best_attribute);
455  }
456 
457  int32_t count_left=0;
458  for (int32_t c=0;c<num_vecs;c++)
459  count_left=(left_final[c])?count_left+1:count_left;
460 
461  SGVector<index_t> subsetl(count_left);
462  SGVector<float64_t> weightsl(count_left);
463  SGVector<index_t> subsetr(num_vecs-count_left);
464  SGVector<float64_t> weightsr(num_vecs-count_left);
465  index_t l=0;
466  index_t r=0;
467  for (int32_t c=0;c<num_vecs;c++)
468  {
469  if (left_final[c])
470  {
471  subsetl[l]=c;
472  weightsl[l++]=weights[c];
473  }
474  else
475  {
476  subsetr[r]=c;
477  weightsr[r++]=weights[c];
478  }
479  }
480 
481  // left child
482  data->add_subset(subsetl);
483  labels->add_subset(subsetl);
484  bnode_t* left_child=CARTtrain(data,weightsl,labels,level+1);
485  data->remove_subset();
486  labels->remove_subset();
487 
488  // right child
489  data->add_subset(subsetr);
490  labels->add_subset(subsetr);
491  bnode_t* right_child=CARTtrain(data,weightsr,labels,level+1);
492  data->remove_subset();
493  labels->remove_subset();
494 
495  // set node parameters
496  node->data.attribute_id=best_attribute;
497  node->left(left_child);
498  node->right(right_child);
499  left_child->data.transit_into_values=left_transit;
500  right_child->data.transit_into_values=right_transit;
501  node->data.num_leaves=left_child->data.num_leaves+right_child->data.num_leaves;
502  node->data.weight_minus_branch=left_child->data.weight_minus_branch+right_child->data.weight_minus_branch;
503 
504  return node;
505 }
506 
508 {
509  float64_t delta=0;
510  if (m_mode==PT_REGRESSION)
511  delta=m_label_epsilon;
512 
513  SGVector<float64_t> ulabels(labels_vec.vlen);
514  SGVector<index_t> sidx=CMath::argsort(labels_vec);
515  ulabels[0]=labels_vec[sidx[0]];
516  n_ulabels=1;
517  int32_t start=0;
518  for (int32_t i=1;i<sidx.vlen;i++)
519  {
520  if (labels_vec[sidx[i]]<=labels_vec[sidx[start]]+delta)
521  continue;
522 
523  start=i;
524  ulabels[n_ulabels]=labels_vec[sidx[i]];
525  n_ulabels++;
526  }
527 
528  return ulabels;
529 }
530 
532  SGVector<float64_t>& left, SGVector<float64_t>& right, SGVector<bool>& is_left_final, int32_t &num_missing_final, int32_t &count_left,
533  int32_t &count_right, int32_t subset_size, const SGVector<index_t>& active_indices)
534 {
535  SGVector<float64_t> labels_vec=(dynamic_cast<CDenseLabels*>(labels))->get_labels();
536  int32_t num_vecs=labels->get_num_labels();
537  int32_t num_feats;
538  if (m_pre_sort)
539  num_feats=mat.num_cols;
540  else
541  num_feats=mat.num_rows;
542 
543  int32_t n_ulabels;
544  SGVector<float64_t> ulabels=get_unique_labels(labels_vec,n_ulabels);
545 
546  // if all labels same early stop
547  if (n_ulabels==1)
548  return -1;
549 
550  float64_t delta=0;
551  if (m_mode==PT_REGRESSION)
552  delta=m_label_epsilon;
553 
554  SGVector<float64_t> total_wclasses(n_ulabels);
555  total_wclasses.zero();
556 
557  SGVector<int32_t> simple_labels(num_vecs);
558  for (int32_t i=0;i<num_vecs;i++)
559  {
560  for (int32_t j=0;j<n_ulabels;j++)
561  {
562  if (CMath::abs(labels_vec[i]-ulabels[j])<=delta)
563  {
564  simple_labels[i]=j;
565  total_wclasses[j]+=weights[i];
566  break;
567  }
568  }
569  }
570 
571  SGVector<index_t> idx(num_feats);
572  idx.range_fill();
573  if (subset_size)
574  {
575  num_feats=subset_size;
576  CMath::permute(idx);
577  }
578 
579  float64_t max_gain=MIN_SPLIT_GAIN;
580  int32_t best_attribute=-1;
581  float64_t best_threshold=0;
582 
583  SGVector<int64_t> indices_mask;
584  SGVector<int32_t> count_indices(mat.num_rows);
585  count_indices.zero();
586  SGVector<int32_t> dupes(num_vecs);
587  dupes.range_fill();
588  if (m_pre_sort)
589  {
590  indices_mask = SGVector<int64_t>(mat.num_rows);
591  indices_mask.set_const(-1);
592  for(int32_t j=0;j<active_indices.size();j++)
593  {
594  if (indices_mask[active_indices[j]]>=0)
595  dupes[indices_mask[active_indices[j]]]=j;
596 
597  indices_mask[active_indices[j]]=j;
598  count_indices[active_indices[j]]++;
599  }
600  }
601 
602  for (int32_t i=0;i<num_feats;i++)
603  {
604  SGVector<float64_t> feats(num_vecs);
605  SGVector<index_t> sorted_args(num_vecs);
606 
607  if (m_pre_sort)
608  {
609  SGVector<float64_t> temp_col(mat.get_column_vector(idx[i]), mat.num_rows, false);
610  SGVector<index_t> sorted_indices(m_sorted_indices.get_column_vector(idx[i]), mat.num_rows, false);
611  int32_t count=0;
612  for(int32_t j=0;j<mat.num_rows;j++)
613  {
614  if (indices_mask[sorted_indices[j]]>=0)
615  {
616  int32_t count_index = count_indices[sorted_indices[j]];
617  while(count_index>0)
618  {
619  feats[count]=temp_col[j];
620  sorted_args[count]=indices_mask[sorted_indices[j]];
621  ++count;
622  --count_index;
623  }
624  if (count==num_vecs)
625  break;
626  }
627  }
628  }
629  else
630  {
631  for (int32_t j=0;j<num_vecs;j++)
632  feats[j]=mat(idx[i],j);
633 
634  // O(N*logN)
635  sorted_args.range_fill();
636  CMath::qsort_index(feats.vector, sorted_args.vector, feats.size());
637  }
638  int32_t n_nm_vecs=feats.vlen;
639  // number of non-missing vecs
640  while (feats[n_nm_vecs-1]==MISSING)
641  {
642  total_wclasses[simple_labels[sorted_args[n_nm_vecs-1]]]-=weights[sorted_args[n_nm_vecs-1]];
643  n_nm_vecs--;
644  }
645 
646  // if only one unique value - it cannot be used to split
647  if (feats[n_nm_vecs-1]<=feats[0]+EQ_DELTA)
648  continue;
649 
650  if (m_nominal[idx[i]])
651  {
652  SGVector<int32_t> simple_feats(num_vecs);
653  simple_feats.fill_vector(simple_feats.vector,simple_feats.vlen,-1);
654 
655  // convert to simple values
656  simple_feats[0]=0;
657  int32_t c=0;
658  for (int32_t j=1;j<n_nm_vecs;j++)
659  {
660  if (feats[j]==feats[j-1])
661  simple_feats[j]=c;
662  else
663  simple_feats[j]=(++c);
664  }
665 
666  SGVector<float64_t> ufeats(c+1);
667  ufeats[0]=feats[0];
668  int32_t u=0;
669  for (int32_t j=1;j<n_nm_vecs;j++)
670  {
671  if (feats[j]==feats[j-1])
672  continue;
673  else
674  ufeats[++u]=feats[j];
675  }
676 
677  // test all 2^(I-1)-1 possible division between two nodes
678  int32_t num_cases=CMath::pow(2,c);
679  for (int32_t k=1;k<num_cases;k++)
680  {
681  SGVector<float64_t> wleft(n_ulabels);
682  SGVector<float64_t> wright(n_ulabels);
683  wleft.zero();
684  wright.zero();
685 
686  // stores which vectors are assigned to left child
687  SGVector<bool> is_left(num_vecs);
688  is_left.fill_vector(is_left.vector,is_left.vlen,false);
689 
690  // stores which among the categorical values of chosen attribute are assigned left child
691  SGVector<bool> feats_left(c+1);
692 
693  // fill feats_left in a unique way corresponding to the case
694  for (int32_t p=0;p<c+1;p++)
695  feats_left[p]=((k/CMath::pow(2,p))%(CMath::pow(2,p+1))==1);
696 
697  // form is_left
698  for (int32_t j=0;j<n_nm_vecs;j++)
699  {
700  is_left[sorted_args[j]]=feats_left[simple_feats[j]];
701  if (is_left[sorted_args[j]])
702  wleft[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
703  else
704  wright[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
705  }
706  for (int32_t j=n_nm_vecs-1;j>=0;j--)
707  {
708  if(dupes[j]!=j)
709  is_left[j]=is_left[dupes[j]];
710  }
711 
712  float64_t g=0;
713  if (m_mode==PT_MULTICLASS)
714  g=gain(wleft,wright,total_wclasses);
715  else if (m_mode==PT_REGRESSION)
716  g=gain(wleft,wright,total_wclasses,ulabels);
717  else
718  SG_ERROR("Undefined problem statement\n");
719 
720  if (g>max_gain)
721  {
722  best_attribute=idx[i];
723  max_gain=g;
724  memcpy(is_left_final.vector,is_left.vector,is_left.vlen*sizeof(bool));
725  num_missing_final=num_vecs-n_nm_vecs;
726 
727  count_left=0;
728  for (int32_t l=0;l<c+1;l++)
729  count_left=(feats_left[l])?count_left+1:count_left;
730 
731  count_right=c+1-count_left;
732 
733  int32_t l=0;
734  int32_t r=0;
735  for (int32_t w=0;w<c+1;w++)
736  {
737  if (feats_left[w])
738  left[l++]=ufeats[w];
739  else
740  right[r++]=ufeats[w];
741  }
742  }
743  }
744  }
745  else
746  {
747  // O(N)
748  SGVector<float64_t> right_wclasses=total_wclasses.clone();
749  SGVector<float64_t> left_wclasses(n_ulabels);
750  left_wclasses.zero();
751 
752  // O(N)
753  // find best split for non-nominal attribute - choose threshold (z)
754  float64_t z=feats[0];
755  right_wclasses[simple_labels[sorted_args[0]]]-=weights[sorted_args[0]];
756  left_wclasses[simple_labels[sorted_args[0]]]+=weights[sorted_args[0]];
757  for (int32_t j=1;j<n_nm_vecs;j++)
758  {
759  if (feats[j]<=z+EQ_DELTA)
760  {
761  right_wclasses[simple_labels[sorted_args[j]]]-=weights[sorted_args[j]];
762  left_wclasses[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
763  continue;
764  }
765  // O(F)
766  float64_t g=0;
767  if (m_mode==PT_MULTICLASS)
768  g=gain(left_wclasses,right_wclasses,total_wclasses);
769  else if (m_mode==PT_REGRESSION)
770  g=gain(left_wclasses,right_wclasses,total_wclasses,ulabels);
771  else
772  SG_ERROR("Undefined problem statement\n");
773 
774  if (g>max_gain)
775  {
776  max_gain=g;
777  best_attribute=idx[i];
778  best_threshold=z;
779  num_missing_final=num_vecs-n_nm_vecs;
780  }
781 
782  z=feats[j];
783  if (feats[n_nm_vecs-1]<=z+EQ_DELTA)
784  break;
785  right_wclasses[simple_labels[sorted_args[j]]]-=weights[sorted_args[j]];
786  left_wclasses[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
787  }
788  }
789 
790  // restore total_wclasses
791  while (n_nm_vecs<feats.vlen)
792  {
793  total_wclasses[simple_labels[sorted_args[n_nm_vecs-1]]]+=weights[sorted_args[n_nm_vecs-1]];
794  n_nm_vecs++;
795  }
796  }
797 
798  if (best_attribute==-1)
799  return -1;
800 
801  if (!m_nominal[best_attribute])
802  {
803  left[0]=best_threshold;
804  right[0]=best_threshold;
805  count_left=1;
806  count_right=1;
807  if (m_pre_sort)
808  {
809  SGVector<float64_t> temp_vec(mat.get_column_vector(best_attribute), mat.num_rows, false);
810  SGVector<index_t> sorted_indices(m_sorted_indices.get_column_vector(best_attribute), mat.num_rows, false);
811  int32_t count=0;
812  for(int32_t i=0;i<mat.num_rows;i++)
813  {
814  if (indices_mask[sorted_indices[i]]>=0)
815  {
816  is_left_final[indices_mask[sorted_indices[i]]]=(temp_vec[i]<=best_threshold);
817  ++count;
818  if (count==num_vecs)
819  break;
820  }
821  }
822  for (int32_t i=num_vecs-1;i>=0;i--)
823  {
824  if(dupes[i]!=i)
825  is_left_final[i]=is_left_final[dupes[i]];
826  }
827 
828  }
829  else
830  {
831  for (int32_t i=0;i<num_vecs;i++)
832  is_left_final[i]=(mat(best_attribute,i)<=best_threshold);
833  }
834  }
835 
836  return best_attribute;
837 }
838 
840 {
841  // return vector - left/right belongingness
842  SGVector<bool> ret(m.num_cols);
843 
844  // ditribute data with known attributes
845  int32_t l=0;
846  float64_t p_l=0.;
847  float64_t total=0.;
848  // stores indices of vectors with missing attribute
850  // stores lambda values corresponding to missing vectors - initialized all with 0
851  CDynamicArray<float64_t>* association_index=new CDynamicArray<float64_t>();
852  for (int32_t i=0;i<m.num_cols;i++)
853  {
854  if (!CMath::fequals(m(attr,i),MISSING,0))
855  {
856  ret[i]=nm_left[l];
857  total+=weights[i];
858  if (nm_left[l++])
859  p_l+=weights[i];
860  }
861  else
862  {
863  missing_vecs->push_back(i);
864  association_index->push_back(0.);
865  }
866  }
867 
868  // for lambda calculation
869  float64_t p_r=(total-p_l)/total;
870  p_l/=total;
871  float64_t p=CMath::min(p_r,p_l);
872 
873  // for each attribute (X') alternative to best split (X)
874  for (int32_t i=0;i<m.num_rows;i++)
875  {
876  if (i==attr)
877  continue;
878 
879  // find set of vectors with non-missing values for both X and X'
880  CDynamicArray<int32_t>* intersect_vecs=new CDynamicArray<int32_t>();
881  for (int32_t j=0;j<m.num_cols;j++)
882  {
883  if (!(CMath::fequals(m(i,j),MISSING,0) || CMath::fequals(m(attr,j),MISSING,0)))
884  intersect_vecs->push_back(j);
885  }
886 
887  if (intersect_vecs->get_num_elements()==0)
888  {
889  SG_UNREF(intersect_vecs);
890  continue;
891  }
892 
893 
894  if (m_nominal[i])
895  handle_missing_vecs_for_nominal_surrogate(m,missing_vecs,association_index,intersect_vecs,ret,weights,p,i);
896  else
897  handle_missing_vecs_for_continuous_surrogate(m,missing_vecs,association_index,intersect_vecs,ret,weights,p,i);
898 
899  SG_UNREF(intersect_vecs);
900  }
901 
902  // if some missing attribute vectors are yet not addressed, use majority rule
903  for (int32_t i=0;i<association_index->get_num_elements();i++)
904  {
905  if (association_index->get_element(i)==0.)
906  ret[missing_vecs->get_element(i)]=(p_l>=p_r);
907  }
908 
909  SG_UNREF(missing_vecs);
910  SG_UNREF(association_index);
911  return ret;
912 }
913 
915  CDynamicArray<float64_t>* association_index, CDynamicArray<int32_t>* intersect_vecs, SGVector<bool> is_left,
916  SGVector<float64_t> weights, float64_t p, int32_t attr)
917 {
918  // for lambda calculation - total weight of all vectors in X intersect X'
919  float64_t denom=0.;
920  SGVector<float64_t> feats(intersect_vecs->get_num_elements());
921  for (int32_t j=0;j<intersect_vecs->get_num_elements();j++)
922  {
923  feats[j]=m(attr,intersect_vecs->get_element(j));
924  denom+=weights[intersect_vecs->get_element(j)];
925  }
926 
927  // unique feature values for X'
928  int32_t num_unique=feats.unique(feats.vector,feats.vlen);
929 
930 
931  // all possible splits for chosen attribute
932  for (int32_t j=0;j<num_unique-1;j++)
933  {
934  float64_t z=feats[j];
935  float64_t numer=0.;
936  float64_t numerc=0.;
937  for (int32_t k=0;k<intersect_vecs->get_num_elements();k++)
938  {
939  // if both go left or both go right
940  if ((m(attr,intersect_vecs->get_element(k))<=z) && is_left[intersect_vecs->get_element(k)])
941  numer+=weights[intersect_vecs->get_element(k)];
942  else if ((m(attr,intersect_vecs->get_element(k))>z) && !is_left[intersect_vecs->get_element(k)])
943  numer+=weights[intersect_vecs->get_element(k)];
944  // complementary split cases - one goes left other right
945  else if ((m(attr,intersect_vecs->get_element(k))<=z) && !is_left[intersect_vecs->get_element(k)])
946  numerc+=weights[intersect_vecs->get_element(k)];
947  else if ((m(attr,intersect_vecs->get_element(k))>z) && is_left[intersect_vecs->get_element(k)])
948  numerc+=weights[intersect_vecs->get_element(k)];
949  }
950 
951  float64_t lambda=0.;
952  if (numer>=numerc)
953  lambda=(p-(1-numer/denom))/p;
954  else
955  lambda=(p-(1-numerc/denom))/p;
956  for (int32_t k=0;k<missing_vecs->get_num_elements();k++)
957  {
958  if ((lambda>association_index->get_element(k)) &&
959  (!CMath::fequals(m(attr,missing_vecs->get_element(k)),MISSING,0)))
960  {
961  association_index->set_element(lambda,k);
962  if (numer>=numerc)
963  is_left[missing_vecs->get_element(k)]=(m(attr,missing_vecs->get_element(k))<=z);
964  else
965  is_left[missing_vecs->get_element(k)]=(m(attr,missing_vecs->get_element(k))>z);
966  }
967  }
968  }
969 }
970 
972  CDynamicArray<float64_t>* association_index, CDynamicArray<int32_t>* intersect_vecs, SGVector<bool> is_left,
973  SGVector<float64_t> weights, float64_t p, int32_t attr)
974 {
975  // for lambda calculation - total weight of all vectors in X intersect X'
976  float64_t denom=0.;
977  SGVector<float64_t> feats(intersect_vecs->get_num_elements());
978  for (int32_t j=0;j<intersect_vecs->get_num_elements();j++)
979  {
980  feats[j]=m(attr,intersect_vecs->get_element(j));
981  denom+=weights[intersect_vecs->get_element(j)];
982  }
983 
984  // unique feature values for X'
985  int32_t num_unique=feats.unique(feats.vector,feats.vlen);
986 
987  // scan all splits for chosen alternative attribute X'
988  int32_t num_cases=CMath::pow(2,(num_unique-1));
989  for (int32_t j=1;j<num_cases;j++)
990  {
991  SGVector<bool> feats_left(num_unique);
992  for (int32_t k=0;k<num_unique;k++)
993  feats_left[k]=((j/CMath::pow(2,k))%(CMath::pow(2,k+1))==1);
994 
995  SGVector<bool> intersect_vecs_left(intersect_vecs->get_num_elements());
996  for (int32_t k=0;k<intersect_vecs->get_num_elements();k++)
997  {
998  for (int32_t q=0;q<num_unique;q++)
999  {
1000  if (feats[q]==m(attr,intersect_vecs->get_element(k)))
1001  {
1002  intersect_vecs_left[k]=feats_left[q];
1003  break;
1004  }
1005  }
1006  }
1007 
1008  float64_t numer=0.;
1009  float64_t numerc=0.;
1010  for (int32_t k=0;k<intersect_vecs->get_num_elements();k++)
1011  {
1012  // if both go left or both go right
1013  if (intersect_vecs_left[k]==is_left[intersect_vecs->get_element(k)])
1014  numer+=weights[intersect_vecs->get_element(k)];
1015  else
1016  numerc+=weights[intersect_vecs->get_element(k)];
1017  }
1018 
1019  // lambda for this split (2 case identical split/complementary split)
1020  float64_t lambda=0.;
1021  if (numer>=numerc)
1022  lambda=(p-(1-numer/denom))/p;
1023  else
1024  lambda=(p-(1-numerc/denom))/p;
1025 
1026  // address missing value vectors not yet addressed or addressed using worse split
1027  for (int32_t k=0;k<missing_vecs->get_num_elements();k++)
1028  {
1029  if ((lambda>association_index->get_element(k)) &&
1030  (!CMath::fequals(m(attr,missing_vecs->get_element(k)),MISSING,0)))
1031  {
1032  association_index->set_element(lambda,k);
1033  // decide left/right based on which feature value the chosen data point has
1034  for (int32_t q=0;q<num_unique;q++)
1035  {
1036  if (feats[q]==m(attr,missing_vecs->get_element(k)))
1037  {
1038  if (numer>=numerc)
1039  is_left[missing_vecs->get_element(k)]=feats_left[q];
1040  else
1041  is_left[missing_vecs->get_element(k)]=!feats_left[q];
1042 
1043  break;
1044  }
1045  }
1046  }
1047  }
1048  }
1049 }
1050 
1052  SGVector<float64_t> feats)
1053 {
1054  float64_t total_lweight=0;
1055  float64_t total_rweight=0;
1056  float64_t total_weight=0;
1057 
1058  float64_t lsd_n=least_squares_deviation(feats,wtotal,total_weight);
1059  float64_t lsd_l=least_squares_deviation(feats,wleft,total_lweight);
1060  float64_t lsd_r=least_squares_deviation(feats,wright,total_rweight);
1061 
1062  return lsd_n-(lsd_l*(total_lweight/total_weight))-(lsd_r*(total_rweight/total_weight));
1063 }
1064 
1066 {
1067  float64_t total_lweight=0;
1068  float64_t total_rweight=0;
1069  float64_t total_weight=0;
1070 
1071  float64_t gini_n=gini_impurity_index(wtotal,total_weight);
1072  float64_t gini_l=gini_impurity_index(wleft,total_lweight);
1073  float64_t gini_r=gini_impurity_index(wright,total_rweight);
1074  return gini_n-(gini_l*(total_lweight/total_weight))-(gini_r*(total_rweight/total_weight));
1075 }
1076 
1077 float64_t CCARTree::gini_impurity_index(const SGVector<float64_t>& weighted_lab_classes, float64_t &total_weight)
1078 {
1079  Map<VectorXd> map_weighted_lab_classes(weighted_lab_classes.vector, weighted_lab_classes.size());
1080  total_weight=map_weighted_lab_classes.sum();
1081  float64_t gini=map_weighted_lab_classes.dot(map_weighted_lab_classes);
1082 
1083  gini=1.0-(gini/(total_weight*total_weight));
1084  return gini;
1085 }
1086 
1088 {
1089 
1090  Map<VectorXd> map_weights(weights.vector, weights.size());
1091  Map<VectorXd> map_feats(feats.vector, weights.size());
1092  float64_t mean=map_weights.dot(map_feats);
1093  total_weight=map_weights.sum();
1094 
1095  mean/=total_weight;
1096  float64_t dev=0;
1097  for (int32_t i=0;i<weights.vlen;i++)
1098  dev+=weights[i]*(feats[i]-mean)*(feats[i]-mean);
1099 
1100  return dev/total_weight;
1101 }
1102 
// --- CCARTree::apply_from_current_node ---
// (Signature, original line 1103, dropped by the doxygen rendering; the member
// index below gives: CLabels* apply_from_current_node(
// CDenseFeatures<float64_t>* feats, bnode_t* current).)
// Routes every feature vector in 'feats' from the subtree root 'current' down
// to a leaf, collects the leaf labels, and wraps them in the label type that
// matches m_mode (multiclass or regression).
1104 {
1105  int32_t num_vecs=feats->get_num_vectors();
1106  REQUIRE(num_vecs>0, "No data provided in apply\n");
1107 
1108  SGVector<float64_t> labels(num_vecs);
1109  for (int32_t i=0;i<num_vecs;i++)
1110  {
1111  SGVector<float64_t> sample=feats->get_feature_vector(i);
1112  bnode_t* node=current;
1113  SG_REF(node);
1114 
1115  // until leaf is reached
// A node with data.num_leaves==1 is treated as a leaf throughout this file
// (see form_t1 and cut_weakest_link, which set num_leaves=1 when collapsing).
1116  while(node->data.num_leaves!=1)
1117  {
1118  bnode_t* leftchild=node->left();
1119 
// Nominal split: go left iff the sample's attribute value is a member of the
// value set stored in the left child's transit_into_values.
1120  if (m_nominal[node->data.attribute_id])
1121  {
1122  SGVector<float64_t> comp=leftchild->data.transit_into_values;
1123  bool flag=false;
1124  for (int32_t k=0;k<comp.vlen;k++)
1125  {
1126  if (comp[k]==sample[node->data.attribute_id])
1127  {
1128  flag=true;
1129  break;
1130  }
1131  }
1132 
1133  if (flag)
1134  {
1135  SG_UNREF(node);
1136  node=leftchild;
1137  SG_REF(leftchild);
1138  }
1139  else
1140  {
// NOTE(review): node is unreffed before node->right() is called on it; this is
// only safe if another reference keeps the node alive — verify against
// CTreeMachineNode's refcounting (right() presumably returns a reffed child).
1141  SG_UNREF(node);
1142  node=node->right();
1143  }
1144  }
1145  else
1146  {
// Continuous split: left child's transit_into_values[0] is the threshold.
1147  if (sample[node->data.attribute_id]<=leftchild->data.transit_into_values[0])
1148  {
1149  SG_UNREF(node);
1150  node=leftchild;
1151  SG_REF(leftchild);
1152  }
1153  else
1154  {
1155  SG_UNREF(node);
1156  node=node->right();
1157  }
1158  }
1159 
1160  SG_UNREF(leftchild);
1161  }
1162 
// 'node' is now the leaf for sample i; record its predicted label.
1163  labels[i]=node->data.node_label;
1164  SG_UNREF(node);
1165  }
1166 
1167  switch(m_mode)
1168  {
1169  case PT_MULTICLASS:
1170  {
1171  CMulticlassLabels* mlabels=new CMulticlassLabels(labels);
1172  return mlabels;
1173  }
1174 
1175  case PT_REGRESSION:
1176  {
1177  CRegressionLabels* rlabels=new CRegressionLabels(labels);
1178  return rlabels;
1179  }
1180 
1181  default:
1182  SG_ERROR("mode should be either PT_MULTICLASS or PT_REGRESSION\n");
1183  }
1184 
1185  return NULL;
1186 }
1187 
// --- CCARTree::prune_by_cross_validation ---
// (Signature, original line 1188, dropped by the doxygen rendering; the member
// index below gives: prune_by_cross_validation(CDenseFeatures<float64_t>*
// data, int32_t folds).)
// V-fold cross-validated cost-complexity pruning (CART style): trains a tree
// on each training fold, prunes it into a nested subtree sequence, scores each
// subtree's alpha on the held-out fold, then selects the alpha with minimum
// cross-validated error and prunes this (already-trained) tree accordingly.
1189 {
1190  int32_t num_vecs=data->get_num_vectors();
1191 
1192  // divide data into V folds randomly
1193  SGVector<int32_t> subid(num_vecs);
1194  subid.random_vector(subid.vector,subid.vlen,0,folds-1);
1195 
1196  // for each fold subset
// NOTE(review): original lines 1197-1198 are missing from this rendering —
// presumably the declarations of the r_cv and alphak dynamic arrays used
// below; confirm against the repository source.
1199  SGVector<int32_t> num_alphak(folds);
1200  for (int32_t i=0;i<folds;i++)
1201  {
1202  // for chosen fold, create subset for training parameters
1203  CDynamicArray<int32_t>* test_indices=new CDynamicArray<int32_t>();
1204  CDynamicArray<int32_t>* train_indices=new CDynamicArray<int32_t>();
1205  for (int32_t j=0;j<num_vecs;j++)
1206  {
1207  if (subid[j]==i)
1208  test_indices->push_back(j);
1209  else
1210  train_indices->push_back(j);
1211  }
1212 
1213  if (test_indices->get_num_elements()==0 || train_indices->get_num_elements()==0)
1214  {
1215  SG_ERROR("Unfortunately you have reached the very low probability event where atleast one of "
1216  "the subsets in cross-validation is not represented at all. Please re-run.")
1217  }
1218 
// Restrict features/labels to the training fold (non-owning index view).
1219  SGVector<int32_t> subset(train_indices->get_array(),train_indices->get_num_elements(),false);
1220  data->add_subset(subset);
1221  m_labels->add_subset(subset);
1222  SGVector<float64_t> subset_weights(train_indices->get_num_elements());
1223  for (int32_t j=0;j<train_indices->get_num_elements();j++)
1224  subset_weights[j]=m_weights[train_indices->get_element(j)];
1225 
1226  // train with training subset
1227  bnode_t* root=CARTtrain(data,subset_weights,m_labels,0);
1228 
1229  // prune trained tree
// NOTE(review): original line 1230 (the declaration/creation of tmax) is
// missing from this rendering.
1231  tmax->set_root(root);
1232  CDynamicObjectArray* pruned_trees=prune_tree(tmax);
1233 
// Swap to the held-out test fold. (Original line 1235 — presumably
// m_labels->remove_subset() — is missing from this rendering.)
1234  data->remove_subset();
1236  subset=SGVector<int32_t>(test_indices->get_array(),test_indices->get_num_elements(),false);
1237  data->add_subset(subset);
1238  m_labels->add_subset(subset);
1239  subset_weights=SGVector<float64_t>(test_indices->get_num_elements());
1240  for (int32_t j=0;j<test_indices->get_num_elements();j++)
1241  subset_weights[j]=m_weights[test_indices->get_element(j)];
1242 
1243  // calculate R_CV values for each alpha_k using test subset and store them
1244  num_alphak[i]=m_alphas->get_num_elements();
1245  for (int32_t j=0;j<m_alphas->get_num_elements();j++)
1246  {
1247  alphak->push_back(m_alphas->get_element(j));
1248  CSGObject* jth_element=pruned_trees->get_element(j);
1249  bnode_t* current_root=NULL;
1250  if (jth_element!=NULL)
1251  current_root=dynamic_cast<bnode_t*>(jth_element);
1252  else
1253  SG_ERROR("%d element is NULL which should not be",j);
1254 
// Held-out error of the j-th pruned subtree on this fold.
1255  CLabels* labels=apply_from_current_node(data, current_root);
1256  float64_t error=compute_error(labels, m_labels, subset_weights);
1257  r_cv->push_back(error);
1258  SG_UNREF(labels);
1259  SG_UNREF(jth_element);
1260  }
1261 
// (Original line 1263 — presumably m_labels->remove_subset() — is missing
// from this rendering.)
1262  data->remove_subset();
1264  SG_UNREF(train_indices);
1265  SG_UNREF(test_indices);
1266  SG_UNREF(tmax);
1267  SG_UNREF(pruned_trees);
1268  }
1269 
1270  // prune the original T_max
1271  CDynamicObjectArray* pruned_trees=prune_tree(this);
1272 
1273  // find subtree with minimum R_cv
// NOTE(review): original line 1275 (initialization of min_r_cv, presumably to
// CMath::MAX_REAL_NUMBER) is missing from this rendering.
1274  int32_t min_index=-1;
1276  for (int32_t i=0;i<m_alphas->get_num_elements();i++)
1277  {
// Representative alpha for interval [alpha_i, alpha_{i+1}); the last interval
// is open-ended, hence the +1.
1278  float64_t alpha=0.;
1279  if (i==m_alphas->get_num_elements()-1)
1280  alpha=m_alphas->get_element(i)+1;
1281  else
// NOTE(review): original line 1282 (the else-branch assignment to alpha,
// presumably an interpolation between consecutive alphas) is missing from
// this rendering; as shown, alpha stays 0 on this branch — confirm upstream.
1283 
// Sum, over folds, the held-out error of the subtree whose alpha interval
// contains 'alpha'; fall back to the fold's last subtree if none matches.
1284  float64_t rv=0.;
1285  int32_t base=0;
1286  for (int32_t j=0;j<folds;j++)
1287  {
1288  bool flag=false;
1289  for (int32_t k=base;k<num_alphak[j]+base-1;k++)
1290  {
1291  if (alphak->get_element(k)<=alpha && alphak->get_element(k+1)>alpha)
1292  {
1293  rv+=r_cv->get_element(k);
1294  flag=true;
1295  break;
1296  }
1297  }
1298 
1299  if (!flag)
1300  rv+=r_cv->get_element(num_alphak[j]+base-1);
1301 
1302  base+=num_alphak[j];
1303  }
1304 
1305  if (rv<min_r_cv)
1306  {
1307  min_index=i;
1308  min_r_cv=rv;
1309  }
1310  }
1311 
// Install the best-scoring pruned subtree as this tree's root.
1312  CSGObject* element=pruned_trees->get_element(min_index);
1313  bnode_t* best_tree_root=NULL;
1314  if (element!=NULL)
1315  best_tree_root=dynamic_cast<bnode_t*>(element);
1316  else
1317  SG_ERROR("%d element is NULL which should not be",min_index);
1318 
1319  this->set_root(best_tree_root);
1320 
1321  SG_UNREF(element);
1322  SG_UNREF(pruned_trees);
1323  SG_UNREF(r_cv);
1324  SG_UNREF(alphak);
1325 }
1326 
// --- CCARTree::compute_error ---
// (Signature, original line 1327, dropped by the doxygen rendering; the member
// index below gives: float64_t compute_error(CLabels* labels,
// CLabels* reference, SGVector<float64_t> weights).)
// Weighted error of predictions 'labels' against ground truth 'reference':
// weighted misclassification rate for PT_MULTICLASS, weighted mean squared
// error for PT_REGRESSION. Both are normalized by the total weight.
1328 {
1329  REQUIRE(labels,"input labels cannot be NULL");
1330  REQUIRE(reference,"reference labels cannot be NULL")
1331 
1332  CDenseLabels* gnd_truth=dynamic_cast<CDenseLabels*>(reference);
1333  CDenseLabels* result=dynamic_cast<CDenseLabels*>(labels);
1334 
// denom = sum of all sample weights (the normalizer for both modes)
1335  float64_t denom=weights.sum(weights);
1336  float64_t numer=0.;
1337  switch (m_mode)
1338  {
1339  case PT_MULTICLASS:
1340  {
// Accumulate the weight of every misclassified sample.
1341  for (int32_t i=0;i<weights.vlen;i++)
1342  {
1343  if (gnd_truth->get_label(i)!=result->get_label(i))
1344  numer+=weights[i];
1345  }
1346 
1347  return numer/denom;
1348  }
1349 
1350  case PT_REGRESSION:
1351  {
// Accumulate weighted squared residuals.
1352  for (int32_t i=0;i<weights.vlen;i++)
1353  numer+=weights[i]*CMath::pow((gnd_truth->get_label(i)-result->get_label(i)),2);
1354 
1355  return numer/denom;
1356  }
1357 
1358  default:
1359  SG_ERROR("Case not possible\n");
1360  }
1361 
1362  return 0.;
1363 }
1364 
// --- CCARTree::prune_tree ---
// (Signature, original line 1365, dropped by the doxygen rendering; the member
// index below gives: CDynamicObjectArray* prune_tree(
// CTreeMachine<CARTreeNodeData>* tree).)
// Builds the nested sequence of cost-complexity pruned subtrees T_1 > T_2 >
// ... > {root}, recording the corresponding alpha_k in m_alphas. Returns the
// array of subtree roots; index k pairs with m_alphas element k.
1366 {
1367  REQUIRE(tree, "Tree not provided for pruning.\n");
1368 
// NOTE(review): original lines 1369 and 1371 are missing from this rendering —
// presumably the creation of the 'trees' result array and the re-allocation of
// m_alphas between the SG_UNREF/SG_REF pair; confirm against the repository.
1370  SG_UNREF(m_alphas);
1372  SG_REF(m_alphas);
1373 
1374  // base tree alpha_k=0
1375  m_alphas->push_back(0);
// (Original line 1376 — presumably t1 = tree->clone_tree() — is missing.)
1377  SG_REF(t1);
1378  node_t* t1root=t1->get_root();
1379  bnode_t* t1_root=NULL;
1380  if (t1root!=NULL)
1381  t1_root=dynamic_cast<bnode_t*>(t1root);
1382  else
1383  SG_ERROR("t1_root is NULL. This is not expected\n")
1384 
// T_1: collapse splits that give no training-error reduction.
1385  form_t1(t1_root);
1386  trees->push_back(t1_root);
1387  while(t1_root->data.num_leaves>1)
1388  {
// (Original line 1389 — presumably t2 = t1->clone_tree() — is missing.)
1390  SG_REF(t2);
1391 
1392  node_t* t2root=t2->get_root();
1393  bnode_t* t2_root=NULL;
1394  if (t2root!=NULL)
1395  t2_root=dynamic_cast<bnode_t*>(t2root);
1396  else
1397  SG_ERROR("t1_root is NULL. This is not expected\n")
1398 
// Find the smallest alpha (weakest link) and prune all links attaining it,
// producing the next tree in the nested sequence.
1399  float64_t a_k=find_weakest_alpha(t2_root);
1400  m_alphas->push_back(a_k);
1401  cut_weakest_link(t2_root,a_k);
1402  trees->push_back(t2_root);
1403 
1404  SG_UNREF(t1);
1405  SG_UNREF(t1_root);
1406  t1=t2;
1407  t1_root=t2_root;
1408  }
1409 
1410  SG_UNREF(t1);
1411  SG_UNREF(t1_root);
1412  return trees;
1413 }
1414 
// --- CCARTree::find_weakest_alpha ---
// (Signature, original line 1415, dropped by the doxygen rendering; the member
// index below gives: float64_t find_weakest_alpha(bnode_t* node).)
// Recursively computes, over all internal nodes of the subtree rooted at
// 'node', the minimum link strength
//   g(t) = (R(t) - R(T_t)) / (|leaves(T_t)| - 1)
// where R(t) is the node's own weighted error (weight_minus_node) and R(T_t)
// the error of its branch (weight_minus_branch). Leaves contribute
// MAX_REAL_NUMBER so they never win the minimum.
1416 {
1417  if (node->data.num_leaves!=1)
1418  {
1419  bnode_t* left=node->left();
1420  bnode_t* right=node->right();
1421 
1422  SGVector<float64_t> weak_links(3);
1423  weak_links[0]=find_weakest_alpha(left);
1424  weak_links[1]=find_weakest_alpha(right);
// g(t) for this node itself, normalized by total weight and branch size.
1425  weak_links[2]=(node->data.weight_minus_node-node->data.weight_minus_branch)/node->data.total_weight;
1426  weak_links[2]/=(node->data.num_leaves-1.0);
1427 
1428  SG_UNREF(left);
1429  SG_UNREF(right);
1430  return CMath::min(weak_links.vector,weak_links.vlen);
1431  }
1432 
// Leaf: infinite link strength so it is never selected as weakest.
1433  return CMath::MAX_REAL_NUMBER;
1434 }
1435 
// --- CCARTree::cut_weakest_link ---
// (Signature, original line 1436, dropped by the doxygen rendering; the member
// index below gives: void cut_weakest_link(bnode_t* node, float64_t alpha).)
// Collapses every internal node whose link strength g(t) equals 'alpha' into a
// leaf, then repairs num_leaves / weight_minus_branch bookkeeping on the way
// back up the recursion.
1437 {
1438  if (node->data.num_leaves==1)
1439  return;
1440 
// Same g(t) formula as find_weakest_alpha; 'alpha' was produced by that
// routine, so the values should be bit-identical here.
1441  float64_t g=(node->data.weight_minus_node-node->data.weight_minus_branch)/node->data.total_weight;
1442  g/=(node->data.num_leaves-1.0);
// NOTE(review): exact floating-point equality — only safe because g is
// recomputed from the same operations/operands as in find_weakest_alpha;
// any refactor of either formula would silently break the match.
1443  if (alpha==g)
1444  {
// Turn this node into a leaf: one leaf, branch error = node error, no children.
1445  node->data.num_leaves=1;
1446  node->data.weight_minus_branch=node->data.weight_minus_node;
1447  CDynamicObjectArray* children=new CDynamicObjectArray();
1448  node->set_children(children);
1449 
1450  SG_UNREF(children);
1451  }
1452  else
1453  {
// Recurse, then refresh this node's aggregates from the (possibly pruned)
// children.
1454  bnode_t* left=node->left();
1455  bnode_t* right=node->right();
1456  cut_weakest_link(left,alpha);
1457  cut_weakest_link(right,alpha);
1458  node->data.num_leaves=left->data.num_leaves+right->data.num_leaves;
1459  node->data.weight_minus_branch=left->data.weight_minus_branch+right->data.weight_minus_branch;
1460 
1461  SG_UNREF(left);
1462  SG_UNREF(right);
1463  }
1464 }
1465 
// --- CCARTree::form_t1 ---
// (Signature, original line 1466, dropped by the doxygen rendering; the member
// index below gives: void form_t1(bnode_t* node).)
// Produces T_1, the first tree of the pruning sequence: recursively collapses
// any internal node whose split yields no reduction in weighted error (node
// error equals branch error), and keeps num_leaves / weight_minus_branch
// consistent bottom-up.
1467 {
1468  if (node->data.num_leaves!=1)
1469  {
1470  bnode_t* left=node->left();
1471  bnode_t* right=node->right();
1472 
1473  form_t1(left);
1474  form_t1(right);
1475 
// Refresh aggregates from the (possibly collapsed) children first.
1476  node->data.num_leaves=left->data.num_leaves+right->data.num_leaves;
1477  node->data.weight_minus_branch=left->data.weight_minus_branch+right->data.weight_minus_branch;
// Split buys nothing (branch error == node error): make this node a leaf.
1478  if (node->data.weight_minus_node==node->data.weight_minus_branch)
1479  {
1480  node->data.num_leaves=1;
1481  CDynamicObjectArray* children=new CDynamicObjectArray();
1482  node->set_children(children);
1483 
1484  SG_UNREF(children);
1485  }
1486 
1487  SG_UNREF(left);
1488  SG_UNREF(right);
1489  }
1490 }
1491 
// --- CCARTree::init ---
// (Signature, original line 1492, dropped by the doxygen rendering; presumably
// void CCARTree::init(), the shared constructor helper.)
// Sets member defaults and registers parameters with Shogun's serialization
// framework via SG_ADD.
1493 {
// NOTE(review): original lines 1494-1496 are missing from this rendering —
// presumably the initialization of m_nominal, m_weights and m_mode; confirm
// against the repository source.
1497  m_pre_sort=false;
1498  m_types_set=false;
1499  m_weights_set=false;
1500  m_apply_cv_pruning=false;
// Default number of cross-validation folds used by CV pruning.
1501  m_folds=5;
// (Original line 1502 — presumably the allocation of m_alphas — is missing.)
1503  SG_REF(m_alphas);
// 0 means "no limit" for depth and node size; see the corresponding setters.
1504  m_max_depth=0;
1505  m_min_node_size=0;
// Tolerance used when comparing real-valued labels for uniqueness.
1506  m_label_epsilon=1e-7;
1507 
// Register every stateful member so model serialization round-trips them.
1508  SG_ADD(&m_pre_sort,"m_pre_sort","presort", MS_NOT_AVAILABLE);
1509  SG_ADD(&m_sorted_features,"m_sorted_features", "sorted feats", MS_NOT_AVAILABLE);
1510  SG_ADD(&m_sorted_indices,"m_sorted_indices", "sorted indices", MS_NOT_AVAILABLE);
1511  SG_ADD(&m_nominal,"m_nominal", "feature types", MS_NOT_AVAILABLE);
1512  SG_ADD(&m_weights,"m_weights", "weights", MS_NOT_AVAILABLE);
1513  SG_ADD(&m_weights_set,"m_weights_set", "weights set", MS_NOT_AVAILABLE);
1514  SG_ADD(&m_types_set,"m_types_set", "feature types set", MS_NOT_AVAILABLE);
1515  SG_ADD(&m_apply_cv_pruning,"m_apply_cv_pruning", "apply cross validation pruning", MS_NOT_AVAILABLE);
1516  SG_ADD(&m_folds,"m_folds","number of subsets for cross validation", MS_NOT_AVAILABLE);
1517  SG_ADD(&m_max_depth,"m_max_depth","max allowed tree depth",MS_NOT_AVAILABLE)
1518  SG_ADD(&m_min_node_size,"m_min_node_size","min allowed node size",MS_NOT_AVAILABLE)
1519  SG_ADD(&m_label_epsilon,"m_label_epsilon","epsilon for labels",MS_NOT_AVAILABLE)
1520 }
void set_cv_pruning()
Definition: CARTree.h:211
CLabels * apply_from_current_node(CDenseFeatures< float64_t > *feats, bnode_t *current)
Definition: CARTree.cpp:1103
bool m_types_set
Definition: CARTree.h:441
virtual int32_t compute_best_attribute(const SGMatrix< float64_t > &mat, const SGVector< float64_t > &weights, CLabels *labels, SGVector< float64_t > &left, SGVector< float64_t > &right, SGVector< bool > &is_left_final, int32_t &num_missing, int32_t &count_left, int32_t &count_right, int32_t subset_size=0, const SGVector< int32_t > &active_indices=SGVector< index_t >())
Definition: CARTree.cpp:531
void range_fill(T start=0)
Definition: SGVector.cpp:171
static void permute(SGVector< T > v, CRandom *rand=NULL)
Definition: Math.h:1144
The node of the tree structure forming a TreeMachine The node contains pointer to its parent and poin...
int32_t get_max_depth() const
Definition: CARTree.cpp:219
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:221
virtual ELabelType get_label_type() const =0
void set_weights(SGVector< float64_t > w)
Definition: CARTree.cpp:174
Real Labels are real-valued labels.
SGVector< bool > get_feature_types() const
Definition: CARTree.cpp:197
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
int32_t get_min_node_size() const
Definition: CARTree.cpp:230
void set_machine_problem_type(EProblemType mode)
Definition: CARTree.cpp:87
int32_t get_num_folds() const
Definition: CARTree.cpp:208
CDynamicObjectArray * prune_tree(CTreeMachine< CARTreeNodeData > *tree)
Definition: CARTree.cpp:1365
int32_t index_t
Definition: common.h:62
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
SGMatrix< index_t > m_sorted_indices
Definition: CARTree.h:435
virtual int32_t get_num_labels() const =0
real valued labels (e.g. for regression, classifier outputs)
Definition: LabelTypes.h:22
static void qsort_index(T1 *output, T2 *index, uint32_t size)
Definition: Math.h:2202
multi-class labels 0,1,...
Definition: LabelTypes.h:20
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
Definition: CARTree.cpp:102
float64_t find_weakest_alpha(bnode_t *node)
Definition: CARTree.cpp:1415
void form_t1(bnode_t *node)
Definition: CARTree.cpp:1466
virtual bool is_label_valid(CLabels *lab) const
Definition: CARTree.cpp:92
Definition: SGMatrix.h:20
T * get_array() const
Definition: DynamicArray.h:408
CLabels * m_labels
Definition: Machine.h:361
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
index_t num_cols
Definition: SGMatrix.h:376
static const float64_t EQ_DELTA
Definition: CARTree.h:419
bool m_apply_cv_pruning
Definition: CARTree.h:447
virtual ~CCARTree()
Definition: CARTree.cpp:68
CTreeMachineNode< CARTreeNodeData > * get_root()
Definition: TreeMachine.h:88
SGVector< bool > surrogate_split(SGMatrix< float64_t > data, SGVector< float64_t > weights, SGVector< bool > nm_left, int32_t attr)
Definition: CARTree.cpp:839
virtual bool train_machine(CFeatures *data=NULL)
Definition: CARTree.cpp:247
float64_t get_label(int32_t idx)
int32_t m_max_depth
Definition: CARTree.h:459
float64_t m_label_epsilon
Definition: CARTree.h:423
#define SG_REF(x)
Definition: SGObject.h:54
virtual void set_labels(CLabels *lab)
Definition: CARTree.cpp:73
index_t num_rows
Definition: SGMatrix.h:374
void set_root(CTreeMachineNode< CARTreeNodeData > *root)
Definition: TreeMachine.h:78
class to add subset support to another class. A CSubsetStackStack instance should be added and wrappe...
Definition: SubsetStack.h:37
static void qsort(T *output, int32_t size)
Definition: Math.h:1313
Multiclass Labels for multi-class classification.
static bool fequals(const T &a, const T &b, const float64_t eps, bool tolerant=false)
Definition: Math.h:335
virtual void set_children(CDynamicObjectArray *children)
CSubset * get_last_subset() const
Definition: SubsetStack.h:98
int32_t size() const
Definition: SGVector.h:113
index_t vlen
Definition: SGVector.h:494
virtual CSubsetStack * get_subset_stack()
Definition: Features.cpp:334
void clear_feature_types()
Definition: CARTree.cpp:202
float64_t gain(SGVector< float64_t > wleft, SGVector< float64_t > wright, SGVector< float64_t > wtotal, SGVector< float64_t > labels)
Definition: CARTree.cpp:1051
EProblemType
Definition: Machine.h:110
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:115
float64_t least_squares_deviation(const SGVector< float64_t > &labels, const SGVector< float64_t > &weights, float64_t &total_weight)
Definition: CARTree.cpp:1087
bool set_element(T e, int32_t idx1, int32_t idx2=0, int32_t idx3=0)
Definition: DynamicArray.h:306
virtual int32_t get_num_vectors() const
void set_min_node_size(int32_t nsize)
Definition: CARTree.cpp:235
void right(CBinaryTreeMachineNode *r)
void set_sorted_features(SGMatrix< float64_t > &sorted_feats, SGMatrix< index_t > &sorted_indices)
Definition: CARTree.cpp:290
void handle_missing_vecs_for_continuous_surrogate(SGMatrix< float64_t > m, CDynamicArray< int32_t > *missing_vecs, CDynamicArray< float64_t > *association_index, CDynamicArray< int32_t > *intersect_vecs, SGVector< bool > is_left, SGVector< float64_t > weights, float64_t p, int32_t attr)
Definition: CARTree.cpp:914
void set_num_folds(int32_t folds)
Definition: CARTree.cpp:213
float64_t gini_impurity_index(const SGVector< float64_t > &weighted_lab_classes, float64_t &total_weight)
Definition: CARTree.cpp:1077
double float64_t
Definition: common.h:50
int32_t m_min_node_size
Definition: CARTree.h:462
int32_t m_folds
Definition: CARTree.h:450
static SGVector< index_t > argsort(SGVector< T > vector)
Definition: Math.h:1599
bool m_weights_set
Definition: CARTree.h:444
virtual void remove_subset()
Definition: Labels.cpp:49
SGVector< float64_t > get_weights() const
Definition: CARTree.cpp:180
CTreeMachine * clone_tree()
Definition: TreeMachine.h:97
void clear_weights()
Definition: CARTree.cpp:185
virtual void add_subset(SGVector< index_t > subset)
Definition: Labels.cpp:39
static T sum(T *vec, int32_t len)
Return sum(vec)
Definition: SGVector.h:354
virtual EFeatureClass get_feature_class() const =0
T * get_column_vector(index_t col) const
Definition: SGMatrix.h:113
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an a...
SGMatrix< float64_t > m_sorted_features
Definition: CARTree.h:432
virtual CBinaryTreeMachineNode< CARTreeNodeData > * CARTtrain(CFeatures *data, SGVector< float64_t > weights, CLabels *labels, int32_t level)
Definition: CARTree.cpp:317
void set_max_depth(int32_t depth)
Definition: CARTree.cpp:224
void handle_missing_vecs_for_nominal_surrogate(SGMatrix< float64_t > m, CDynamicArray< int32_t > *missing_vecs, CDynamicArray< float64_t > *association_index, CDynamicArray< int32_t > *intersect_vecs, SGVector< bool > is_left, SGVector< float64_t > weights, float64_t p, int32_t attr)
Definition: CARTree.cpp:971
structure to store data of a node of CART. This can be used as a template type in TreeMachineNode cla...
void pre_sort_features(CFeatures *data, SGMatrix< float64_t > &sorted_feats, SGMatrix< index_t > &sorted_indices)
Definition: CARTree.cpp:297
#define SG_UNREF(x)
Definition: SGObject.h:55
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void remove_subset()
Definition: Features.cpp:322
void set_feature_types(SGVector< bool > ft)
Definition: CARTree.cpp:191
CBinaryTreeMachineNode< CARTreeNodeData > bnode_t
Definition: TreeMachine.h:55
The class Features is the base class of all feature objects.
Definition: Features.h:68
static T min(T a, T b)
Definition: Math.h:157
SGVector< bool > m_nominal
Definition: CARTree.h:426
float64_t compute_error(CLabels *labels, CLabels *reference, SGVector< float64_t > weights)
Definition: CARTree.cpp:1327
void cut_weakest_link(bnode_t *node, float64_t alpha)
Definition: CARTree.cpp:1436
static const float64_t MIN_SPLIT_GAIN
Definition: CARTree.h:416
SGVector< T > clone() const
Definition: SGVector.cpp:207
virtual CRegressionLabels * apply_regression(CFeatures *data=NULL)
Definition: CARTree.cpp:116
virtual bool has_subsets() const
Definition: SubsetStack.h:89
static float base
Definition: JLCoverTree.h:85
int32_t get_num_elements() const
Definition: DynamicArray.h:200
void prune_using_test_dataset(CDenseFeatures< float64_t > *feats, CLabels *gnd_truth, SGVector< float64_t > weights=SGVector< float64_t >())
Definition: CARTree.cpp:128
CSGObject * get_element(int32_t index) const
Matrix::Scalar max(Matrix m)
Definition: Redux.h:68
class TreeMachine, a base class for tree based multiclass classifiers. This class is derived from CBa...
Definition: TreeMachine.h:48
#define SG_WARNING(...)
Definition: SGIO.h:128
#define SG_ADD(...)
Definition: SGObject.h:84
SGVector< float64_t > m_weights
Definition: CARTree.h:429
static float32_t sqrt(float32_t x)
Definition: Math.h:459
Dense integer or floating point labels.
Definition: DenseLabels.h:35
CDynamicArray< float64_t > * m_alphas
Definition: CARTree.h:456
void left(CBinaryTreeMachineNode *l)
static int32_t unique(T *output, int32_t size)
Definition: SGVector.cpp:785
static int32_t pow(bool x, int32_t n)
Definition: Math.h:535
static const float64_t MAX_REAL_NUMBER
Definition: Math.h:2061
const T & get_element(int32_t idx1, int32_t idx2=0, int32_t idx3=0) const
Definition: DynamicArray.h:212
void set_const(T const_elem)
Definition: SGVector.cpp:150
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:310
SGVector< float64_t > get_unique_labels(SGVector< float64_t > labels_vec, int32_t &n_ulabels)
Definition: CARTree.cpp:507
static T abs(T a)
Definition: Math.h:179
void set_label_epsilon(float64_t epsilon)
Definition: CARTree.cpp:241
void prune_by_cross_validation(CDenseFeatures< float64_t > *data, int32_t folds)
Definition: CARTree.cpp:1188
static void random_vector(T *vec, int32_t len, T min_value, T max_value)
Definition: SGVector.cpp:563
static const float64_t MISSING
Definition: CARTree.h:413
EProblemType m_mode
Definition: CARTree.h:453

SHOGUN Machine Learning Toolbox - Documentation