SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
CHAIDTree.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2014 Parijat Mazumdar
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  */
30 
34 
35 using namespace shogun;
36 
38 
41 {
42  init();
43 }
44 
45 CCHAIDTree::CCHAIDTree(int32_t dependent_vartype)
47 {
48  init();
49  m_dependent_vartype=dependent_vartype;
50 }
51 
52 CCHAIDTree::CCHAIDTree(int32_t dependent_vartype, SGVector<int32_t> feature_types, int32_t num_breakpoints)
54 {
55  init();
56  m_dependent_vartype=dependent_vartype;
57  set_feature_types(feature_types);
58  m_num_breakpoints=num_breakpoints;
59 }
60 
62 {
63 }
64 
66 {
67  switch (m_dependent_vartype)
68  {
69  case 0:
70  return PT_MULTICLASS;
71  case 1:
72  return PT_MULTICLASS;
73  case 2:
74  return PT_REGRESSION;
75  default:
76  SG_ERROR("Invalid dependent variable type set (%d). Problem type undefined\n",m_dependent_vartype);
77  }
78 
79  return PT_MULTICLASS;
80 }
81 
83 {
84  switch (m_dependent_vartype)
85  {
86  case 0:
87  return lab->get_label_type()==LT_MULTICLASS;
88  case 1:
89  return lab->get_label_type()==LT_MULTICLASS;
90  case 2:
91  return lab->get_label_type()==LT_REGRESSION;
92  default:
93  SG_ERROR("Invalid dependent variable type set (%d). Problem type undefined\n",m_dependent_vartype);
94  }
95 
96  return false;
97 }
98 
100 {
101  REQUIRE(data, "Data required for classification in apply_multiclass\n")
102 
103  return CLabelsFactory::to_multiclass(apply_tree(data));
104 }
105 
107 {
108  REQUIRE(data, "Data required for regression in apply_regression\n")
109 
110  return CLabelsFactory::to_regression(apply_tree(data));
111 }
112 
114 {
115  m_weights=w;
116  m_weights_set=true;
117 }
118 
120 {
121  if (!m_weights_set)
122  SG_ERROR("weights not set\n");
123 
124  return m_weights;
125 }
126 
128 {
129  m_weights=SGVector<float64_t>();
130  m_weights_set=false;
131 }
132 
134 {
135  m_feature_types=ft;
136 }
137 
139 {
140  return m_feature_types;
141 }
142 
144 {
145  m_feature_types=SGVector<int32_t>();
146 }
147 
149 {
150  REQUIRE(((var==0)||(var==1)||(var==2)), "Expected 0 or 1 or 2 as argument. %d received\n",var)
151  m_dependent_vartype=var;
152 }
153 
155 {
156  REQUIRE(data, "Data required for training\n")
157  REQUIRE(data->get_feature_class()==C_DENSE,"Dense data required for training\n")
158 
160 
161  REQUIRE(m_feature_types.vlen==feats->get_num_features(),"Either feature types are not set or the number of feature types specified"
162  " (%d here) is not the same as the number of features in data matrix (%d here)\n",m_feature_types.vlen,feats->get_num_features())
163 
164  if (m_weights_set)
165  {
166  REQUIRE(m_weights.vlen==feats->get_num_vectors(),"Length of weights vector (currently %d) should be same as"
167  " the number of vectors in data (presently %d)",m_weights.vlen,feats->get_num_vectors())
168  }
169  else
170  {
171  // all weights are equal to 1
172  m_weights=SGVector<float64_t>(feats->get_num_vectors());
173  m_weights.fill_vector(m_weights.vector,m_weights.vlen,1.0);
174  }
175 
176  // continuous to ordinal conversion - NOTE: data matrix gets updated
177  bool updated=continuous_to_ordinal(feats);
178 
179  SGVector<int32_t> feature_types_cache;
180  if (updated)
181  {
182  // change m_feature_types momentarily
183  feature_types_cache=m_feature_types.clone();
184  for (int32_t i=0;i<m_feature_types.vlen;i++)
185  {
186  if (m_feature_types[i]==2)
187  m_feature_types[i]=1;
188  }
189  }
190 
191  set_root(CHAIDtrain(data,m_weights,m_labels,0));
192 
193  // restore feature types
194  if (updated)
195  m_feature_types=feature_types_cache;
196 
197  return true;
198 }
199 
200 CTreeMachineNode<CHAIDTreeNodeData>* CCHAIDTree::CHAIDtrain(CFeatures* data, SGVector<float64_t> weights, CLabels* labels, int32_t level)
201 {
202  REQUIRE(data,"data matrix cannot be empty\n");
203  REQUIRE(labels,"labels cannot be NULL\n");
204 
205  node_t* node=new node_t();
206  SGVector<float64_t> labels_vec=(dynamic_cast<CDenseLabels*>(labels))->get_labels();
208  int32_t num_feats=mat.num_rows;
209  int32_t num_vecs=mat.num_cols;
210 
211  // calculate node label
212  if (m_dependent_vartype==2)
213  {
214  // sum_of_squared_deviation
215  node->data.weight_minus_node=sum_of_squared_deviation(labels_vec,weights,node->data.node_label);
216  node->data.total_weight=weights.sum(weights);
217  }
218  else if (m_dependent_vartype==0 || m_dependent_vartype==1)
219  {
220  SGVector<float64_t> lab=labels_vec.clone();
221  CMath::qsort(lab);
222  // stores max total weight for a single label
223  int32_t max=weights[0];
224  // stores one of the indices having max total weight
225  int32_t maxi=0;
226  int32_t c=weights[0];
227  for (int32_t i=1;i<lab.vlen;i++)
228  {
229  if (lab[i]==lab[i-1])
230  {
231  c+=weights[i];
232  }
233  else if (c>max)
234  {
235  max=c;
236  maxi=i-1;
237  c=weights[i];
238  }
239  else
240  {
241  c=weights[i];
242  }
243  }
244 
245  if (c>max)
246  {
247  max=c;
248  maxi=lab.vlen-1;
249  }
250 
251  node->data.node_label=lab[maxi];
252  node->data.total_weight=weights.sum(weights);
254 
255  }
256  else
257  {
258  SG_ERROR("dependent variable type should be either 0(nominal) or 1(ordinal) or 2(continuous)\n");
259  }
260 
261  // check stopping rules
262  // case 1 : all labels same
263  SGVector<float64_t> lab=labels_vec.clone();
264  int32_t unique=lab.unique(lab.vector,lab.vlen);
265  if (unique==1)
266  return node;
267 
268  // case 2 : all non-dependent attributes (not MISSING) are same
269  bool flag=true;
270  for (int32_t v=1;v<num_vecs;v++)
271  {
272  for (int32_t f=0;f<num_feats;f++)
273  {
274  if ((mat(f,v)!=MISSING) && (mat(f,v-1)!=MISSING))
275  {
276  if (mat(f,v)!=mat(f,v-1))
277  {
278  flag=false;
279  break;
280  }
281  }
282  }
283 
284  if (!flag)
285  break;
286  }
287 
288  if (flag)
289  return node;
290 
291  // case 3 : current tree depth is equal to user specified max
292  if (m_max_tree_depth>0)
293  {
294  if (level==m_max_tree_depth)
295  return node;
296  }
297 
298  // case 4 : node size is less than user-specified min node size
299  if (m_min_node_size>1)
300  {
301  if (num_vecs<m_min_node_size)
302  return node;
303  }
304 
305  // choose best attribute for splitting
307  SGVector<int32_t> cat_min;
308  int32_t attr_min=-1;
309  for (int32_t i=0;i<num_feats;i++)
310  {
311  SGVector<float64_t> feats(num_vecs);
312  for (int32_t j=0;j<num_vecs;j++)
313  feats[j]=mat(i,j);
314 
315  float64_t pv=0;
316  SGVector<int32_t> cat;
317  if (m_feature_types[i]==0)
318  cat=merge_categories_nominal(feats,labels_vec,weights,pv);
319  else if (m_feature_types[i]==1)
320  cat=merge_categories_ordinal(feats,labels_vec,weights,pv);
321  else
322  SG_ERROR("feature type supported are 0(nominal) and 1(ordinal). m_feature_types[%d] is set %d\n",i,m_feature_types[i])
323 
324  if (pv<min_pv)
325  {
326  min_pv=pv;
327  attr_min=i;
328  cat_min=cat;
329  }
330  }
331 
332  if (min_pv>m_alpha_split)
333  return node;
334 
335  // split
336  SGVector<float64_t> ufeats_best(num_vecs);
337  for (int32_t i=0;i<num_vecs;i++)
338  ufeats_best[i]=mat(attr_min,i);
339 
340  int32_t unum=ufeats_best.unique(ufeats_best.vector,ufeats_best.vlen);
341  for (int32_t i=0;i<cat_min.vlen;i++)
342  {
343  if (cat_min[i]!=i)
344  continue;
345 
347  for (int32_t j=0;j<num_vecs;j++)
348  {
349  for (int32_t k=0;k<unum;k++)
350  {
351  if (mat(attr_min,j)==ufeats_best[k])
352  {
353  if (cat_min[k]==i)
354  feat_index->push_back(j);
355  }
356  }
357  }
358 
359  SGVector<int32_t> subset(feat_index->get_num_elements());
360  SGVector<float64_t> subweights(feat_index->get_num_elements());
361  for (int32_t j=0;j<feat_index->get_num_elements();j++)
362  {
363  subset[j]=feat_index->get_element(j);
364  subweights[j]=weights[feat_index->get_element(j)];
365  }
366 
367  data->add_subset(subset);
368  labels->add_subset(subset);
369  node_t* child=CHAIDtrain(data,subweights,labels,level+1);
370  node->add_child(child);
371 
372  node->data.attribute_id=attr_min;
373  int32_t c=0;
374  SGVector<int32_t> feat_class=cat_min.clone();
375  for (int32_t j=0;j<feat_class.vlen;j++)
376  {
377  if (feat_class[j]!=j)
378  {
379  continue;
380  }
381  else if (j==c)
382  {
383  c++;
384  continue;
385  }
386 
387  for (int32_t k=j;k<feat_class.vlen;k++)
388  {
389  if (feat_class[k]==j)
390  feat_class[k]=c;
391  }
392 
393  c++;
394  }
395 
396  node->data.feature_class=feat_class;
398  for (int32_t j=0;j<unum;j++)
399  node->data.distinct_features[j]=ufeats_best[j];
400 
401  SG_UNREF(feat_index);
402  data->remove_subset();
403  labels->remove_subset();
404  }
405 
406  return node;
407 }
408 
409 SGVector<int32_t> CCHAIDTree::merge_categories_ordinal(SGVector<float64_t> feats, SGVector<float64_t> labels,
410  SGVector<float64_t> weights, float64_t &pv)
411 {
412  SGVector<float64_t> ufeats=feats.clone();
413  int32_t inum_cat=ufeats.unique(ufeats.vector,ufeats.vlen);
414  SGVector<int32_t> cat(inum_cat);
415  cat.range_fill(0);
416 
417  if (inum_cat==1)
418  {
419  pv=1.0;
420  return cat;
421  }
422 
423  bool missing=false;
424  if (ufeats[inum_cat-1]==MISSING)
425  {
426  missing=true;
427  inum_cat--;
428  }
429 
430  int32_t fnum_cat=inum_cat;
431 
432  // if chosen attribute (MISSING excluded) has 1 category only
433  if (inum_cat==1)
434  {
435  pv=adjusted_p_value(p_value(feats,labels,weights),2,2,1,true);
436  return cat;
437  }
438 
439  while(true)
440  {
441  if (fnum_cat==2)
442  break;
443 
444  // scan all allowable pairs of categories to find most similar one
445  int32_t cat_index_max=-1;
446  float64_t max_merge_pv=CMath::MIN_REAL_NUMBER;
447  for (int32_t i=0;i<inum_cat-1;i++)
448  {
449  if (cat[i]==cat[i+1])
450  continue;
451 
452  int32_t cat_index=i;
453 
454  // compute p-value
457  for (int32_t j=0;j<feats.vlen;j++)
458  {
459  for (int32_t k=0;k<inum_cat;k++)
460  {
461  if (feats[j]==ufeats[k])
462  {
463  if (cat[k]==cat[cat_index])
464  {
465  feat_index->push_back(j);
466  feat_cat->push_back(cat[cat_index]);
467  }
468  else if (cat[k]==cat[cat_index+1])
469  {
470  feat_index->push_back(j);
471  feat_cat->push_back(cat[cat_index+1]);
472  }
473  }
474  }
475  }
476 
477  SGVector<float64_t> subfeats(feat_index->get_num_elements());
478  SGVector<float64_t> sublabels(feat_index->get_num_elements());
479  SGVector<float64_t> subweights(feat_index->get_num_elements());
480  for (int32_t j=0;j<feat_index->get_num_elements();j++)
481  {
482  subfeats[j]=feat_cat->get_element(j);
483  sublabels[j]=labels[feat_index->get_element(j)];
484  subweights[j]=weights[feat_index->get_element(j)];
485  }
486 
487  float64_t pvalue=p_value(subfeats,sublabels,subweights);
488  if (pvalue>max_merge_pv)
489  {
490  max_merge_pv=pvalue;
491  cat_index_max=cat_index;
492  }
493 
494  SG_UNREF(feat_index);
495  SG_UNREF(feat_cat);
496  }
497 
498  if (max_merge_pv>m_alpha_merge)
499  {
500  // merge
501  int32_t cat2=cat[cat_index_max+1];
502  for (int32_t i=cat_index_max+1;i<inum_cat;i++)
503  {
504  if (cat2==cat[i])
505  cat[i]=cat[cat_index_max];
506  else
507  break;
508  }
509 
510  fnum_cat--;
511  }
512  else
513  {
514  break;
515  }
516  }
517 
518  SGVector<float64_t> feats_cat(feats.vlen);
519  for (int32_t i=0;i<feats.vlen;i++)
520  {
521  if (feats[i]==MISSING)
522  {
523  feats_cat[i]=MISSING;
524  continue;
525  }
526 
527  for (int32_t j=0;j<inum_cat;j++)
528  {
529  if (feats[i]==ufeats[j])
530  feats_cat[i]=cat[j];
531  }
532  }
533 
534  if (missing)
535  {
536  bool merged=handle_missing_ordinal(cat,feats_cat,labels,weights);
537  if (!merged)
538  fnum_cat+=1;
539 
540  pv=adjusted_p_value(p_value(feats_cat,labels,weights),inum_cat+1,fnum_cat,1,true);
541  }
542  else
543  {
544  pv=adjusted_p_value(p_value(feats_cat,labels,weights),inum_cat,fnum_cat,1,false);
545  }
546 
547  return cat;
548 }
549 
550 SGVector<int32_t> CCHAIDTree::merge_categories_nominal(SGVector<float64_t> feats, SGVector<float64_t> labels,
551  SGVector<float64_t> weights, float64_t &pv)
552 {
553  SGVector<float64_t> ufeats=feats.clone();
554  int32_t inum_cat=ufeats.unique(ufeats.vector,ufeats.vlen);
555  int32_t fnum_cat=inum_cat;
556 
557  SGVector<int32_t> cat(inum_cat);
558  cat.range_fill(0);
559 
560  // if chosen attribute X(feats here) has 1 category only
561  if (inum_cat==1)
562  {
563  pv=1.0;
564  return cat;
565  }
566 
567  while(true)
568  {
569  if (fnum_cat==2)
570  break;
571 
572  // assimilate all category labels left
574  for (int32_t i=0;i<cat.vlen;i++)
575  {
576  if (cat[i]==i)
577  leftcat->push_back(i);
578  }
579 
580  // consider all pairs for merging
581  float64_t max_merge_pv=CMath::MIN_REAL_NUMBER;
582  int32_t cat1_max=-1;
583  int32_t cat2_max=-1;
584  for (int32_t i=0;i<leftcat->get_num_elements()-1;i++)
585  {
586  for (int32_t j=i+1;j<leftcat->get_num_elements();j++)
587  {
590  for (int32_t k=0;k<feats.vlen;k++)
591  {
592  for (int32_t l=0;l<inum_cat;l++)
593  {
594  if (feats[k]==ufeats[l])
595  {
596  if (cat[l]==leftcat->get_element(i))
597  {
598  feat_index->push_back(k);
599  feat_cat->push_back(leftcat->get_element(i));
600  }
601  else if (cat[l]==leftcat->get_element(j))
602  {
603  feat_index->push_back(k);
604  feat_cat->push_back(leftcat->get_element(j));
605  }
606  }
607  }
608  }
609 
610  SGVector<float64_t> subfeats(feat_index->get_num_elements());
611  SGVector<float64_t> sublabels(feat_index->get_num_elements());
612  SGVector<float64_t> subweights(feat_index->get_num_elements());
613  for (int32_t k=0;k<feat_index->get_num_elements();k++)
614  {
615  subfeats[k]=feat_cat->get_element(k);
616  sublabels[k]=labels[feat_index->get_element(k)];
617  subweights[k]=weights[feat_index->get_element(k)];
618  }
619 
620  float64_t pvalue=p_value(subfeats,sublabels,subweights);
621  if (pvalue>max_merge_pv)
622  {
623  max_merge_pv=pvalue;
624  cat1_max=leftcat->get_element(i);
625  cat2_max=leftcat->get_element(j);
626  }
627 
628  SG_UNREF(feat_index);
629  SG_UNREF(feat_cat);
630  }
631  }
632 
633  SG_UNREF(leftcat);
634 
635  if (max_merge_pv>m_alpha_merge)
636  {
637  // merge
638  for (int32_t i=0;i<cat.vlen;i++)
639  {
640  if (cat2_max==cat[i])
641  cat[i]=cat1_max;
642  }
643 
644  fnum_cat--;
645  }
646  else
647  {
648  break;
649  }
650  }
651 
652  SGVector<float64_t> feats_cat(feats.vlen);
653  for (int32_t i=0;i<feats.vlen;i++)
654  {
655  for (int32_t j=0;j<inum_cat;j++)
656  {
657  if (feats[i]==ufeats[j])
658  feats_cat[i]=cat[j];
659  }
660  }
661 
662  pv=adjusted_p_value(p_value(feats_cat,labels,weights),inum_cat,fnum_cat,0,false);
663  return cat;
664 }
665 
666 CLabels* CCHAIDTree::apply_tree(CFeatures* data)
667 {
669 
670  // modify test data matrix (continuous to ordinal)
671  if (m_cont_breakpoints.num_cols>0)
672  modify_data_matrix(feats);
673 
675  return apply_from_current_node(fmat, m_root);
676 }
677 
678 CLabels* CCHAIDTree::apply_from_current_node(SGMatrix<float64_t> fmat, node_t* current)
679 {
680  REQUIRE(current != NULL, "The tree cannot be empty.\n");
681  int32_t num_vecs=fmat.num_cols;
682 
683  SGVector<float64_t> labels(num_vecs);
684  for (int32_t i=0;i<num_vecs;i++)
685  {
686  node_t* node=current;
687  SG_REF(node);
688  CDynamicObjectArray* children=node->get_children();
689  // while leaf node not reached
690  while(children->get_num_elements()>0)
691  {
692  // find feature class (or index of child node) of chosen vector in current node
693  int32_t index=-1;
694  for (int32_t j=0;j<(node->data.distinct_features).vlen;j++)
695  {
696  if (fmat(node->data.attribute_id,i)==node->data.distinct_features[j])
697  {
698  index=j;
699  break;
700  }
701  }
702 
703  if (index==-1)
704  break;
705 
706  CSGObject* el=children->get_element(node->data.feature_class[index]);
707  if (el!=NULL)
708  {
709  SG_UNREF(node);
710  node=dynamic_cast<node_t*>(el);
711  }
712  else
713  SG_ERROR("%d child is expected to be present. But it is NULL\n",index)
714 
715  SG_UNREF(children);
716  children=node->get_children();
717  }
718 
719  labels[i]=node->data.node_label;
720  SG_UNREF(node);
721  SG_UNREF(children);
722  }
723 
724  switch (get_machine_problem_type())
725  {
726  case PT_MULTICLASS:
727  return new CMulticlassLabels(labels);
728  case PT_REGRESSION:
729  return new CRegressionLabels(labels);
730  default:
731  SG_ERROR("Undefined problem type\n")
732  }
733 
734  return new CMulticlassLabels();
735 }
736 
737 bool CCHAIDTree::handle_missing_ordinal(SGVector<int32_t> cat, SGVector<float64_t> feats, SGVector<float64_t> labels,
738  SGVector<float64_t> weights)
739 {
740  // assimilate category indices other than missing (last cell of cat vector stores category index for missing)
741  // sanity check
742  REQUIRE(cat[cat.vlen-1]==cat.vlen-1,"last category is expected to be stored for MISSING. Hence it is expected to be un-merged\n")
744  for (int32_t i=0;i<cat.vlen-1;i++)
745  {
746  if (cat[i]==i)
747  cat_ind->push_back(i);
748  }
749 
750  // find most similar category to MISSING
751  float64_t max_pv_pair=CMath::MIN_REAL_NUMBER;
752  int32_t cindex_max=-1;
753  for (int32_t i=0;i<cat_ind->get_num_elements();i++)
754  {
756  for (int32_t j=0;j<feats.vlen;j++)
757  {
758  if ((feats[j]==cat_ind->get_element(i)) || feats[j]==MISSING)
759  feat_index->push_back(j);
760  }
761 
762  SGVector<float64_t> subfeats(feat_index->get_num_elements());
763  SGVector<float64_t> sublabels(feat_index->get_num_elements());
764  SGVector<float64_t> subweights(feat_index->get_num_elements());
765  for (int32_t j=0;j<feat_index->get_num_elements();j++)
766  {
767  subfeats[j]=feats[feat_index->get_element(j)];
768  sublabels[j]=labels[feat_index->get_element(j)];
769  subweights[j]=weights[feat_index->get_element(j)];
770  }
771 
772  float64_t pvalue=p_value(subfeats,sublabels,subweights);
773  if (pvalue>max_pv_pair)
774  {
775  max_pv_pair=pvalue;
776  cindex_max=cat_ind->get_element(i);
777  }
778 
779  SG_UNREF(feat_index);
780  }
781 
782  // compare if MISSING being merged is better than not being merged
783  SGVector<float64_t> feats_copy(feats.vlen);
784  for (int32_t i=0;i<feats.vlen;i++)
785  {
786  if (feats[i]==MISSING)
787  feats_copy[i]=cindex_max;
788  else
789  feats_copy[i]=feats[i];
790  }
791 
792  float64_t pv_merged=p_value(feats_copy, labels, weights);
793  float64_t pv_unmerged=p_value(feats, labels, weights);
794  if (pv_merged>pv_unmerged)
795  {
796  cat[cat.vlen-1]=cindex_max;
797  for (int32_t i=0;i<feats.vlen;i++)
798  {
799  if (feats[i]==MISSING)
800  feats[i]=cindex_max;
801  }
802 
803  return true;
804  }
805 
806  return false;
807 }
808 
809 float64_t CCHAIDTree::adjusted_p_value(float64_t up_value, int32_t inum_cat, int32_t fnum_cat, int32_t ft, bool is_missing)
810 {
811 
812  if (inum_cat==fnum_cat)
813  return up_value;
814 
815  switch (ft)
816  {
817  case 0:
818  {
819  float64_t sum=0.;
820  for (int32_t v=0;v<fnum_cat;v++)
821  {
822  float64_t lterm=inum_cat*CMath::log(fnum_cat-v);
823  for (int32_t j=1;j<=v;j++)
824  lterm-=CMath::log(j);
825 
826  for (int32_t j=1;j<=fnum_cat-v;j++)
827  lterm-=CMath::log(j);
828 
829  if (v%2==0)
830  sum+=CMath::exp(lterm);
831  else
832  sum-=CMath::exp(lterm);
833  }
834 
835  return sum*up_value;
836  }
837  case 1:
838  {
839  if (!is_missing)
840  return CMath::nchoosek(inum_cat-1,fnum_cat-1)*up_value;
841  else
842  return up_value*(CMath::nchoosek(inum_cat-2,fnum_cat-2)+fnum_cat*CMath::nchoosek(inum_cat-2,fnum_cat-1));
843  }
844  default:
845  SG_ERROR("Feature type must be either 0 (nominal) or 1 (ordinal). It is currently set as %d\n",ft)
846  }
847 
848  return 0.0;
849 }
850 
851 float64_t CCHAIDTree::p_value(SGVector<float64_t> feat, SGVector<float64_t> labels, SGVector<float64_t> weights)
852 {
853  switch (m_dependent_vartype)
854  {
855  case 0:
856  {
857  int32_t r=0;
858  int32_t c=0;
859  float64_t x2=pchi2_statistic(feat,labels,weights,r,c);
860 
861  // chi2 is not defined for second argument to be 0
862  // (which might happen here if x2 is 0)
863  if (x2 == 0)
864  return 1;
865  else
866  return 1-CStatistics::chi2_cdf(x2,(r-1)*(c-1));
867  }
868  case 1:
869  {
870  int32_t r=0;
871  int32_t c=0;
872  float64_t h2=likelihood_ratio_statistic(feat,labels,weights,r,c);
873  return 1-CStatistics::chi2_cdf(h2,(r-1));
874  }
875  case 2:
876  {
877  int32_t nf=feat.vlen;
878  int32_t num_cat=0;
879  float64_t f=anova_f_statistic(feat,labels,weights,num_cat);
880 
881  if (nf==num_cat)
882  return 1.0;
883 
884  return 1-CStatistics::fdistribution_cdf(f,num_cat-1,nf-num_cat);
885  }
886  default:
887  SG_ERROR("Dependent variable type must be either 0 or 1 or 2. It is currently set as %d\n",m_dependent_vartype)
888  }
889 
890  return -1.0;
891 }
892 
893 float64_t CCHAIDTree::anova_f_statistic(SGVector<float64_t> feat, SGVector<float64_t> labels, SGVector<float64_t> weights, int32_t &r)
894 {
895  // compute y_bar
896  float64_t y_bar=0.;
897  for (int32_t i=0;i<labels.vlen;i++)
898  y_bar+=labels[i]*weights[i];
899 
900  y_bar/=weights.sum(weights);
901 
902  SGVector<float64_t> ufeat=feat.clone();
903  r=ufeat.unique(ufeat.vector,ufeat.vlen);
904 
905  // compute y_i_bar
906  SGVector<float64_t> numer(r);
907  SGVector<float64_t> denom(r);
908  numer.zero();
909  denom.zero();
910  for (int32_t n=0;n<feat.vlen;n++)
911  {
912  for (int32_t i=0;i<r;i++)
913  {
914  if (feat[n]==ufeat[i])
915  {
916  numer[i]+=weights[n]*labels[n];
917  denom[i]+=weights[n];
918  break;
919  }
920  }
921  }
922 
923  // compute f statistic
924  float64_t nu=0.;
925  float64_t de=0.;
926  for (int32_t i=0;i<r;i++)
927  {
928  for (int32_t n=0;n<feat.vlen;n++)
929  {
930  if (feat[n]==ufeat[i])
931  {
932  nu+=weights[n]*CMath::pow(((numer[i]/denom[i])-y_bar),2);
933  de+=weights[n]*CMath::pow((labels[n]-(numer[i]/denom[i])),2);
934  }
935  }
936  }
937 
938  nu/=(r-1.0);
939  if (de==0)
940  return nu;
941 
942  de/=(feat.vlen-r-0.f);
943  return nu/de;
944 }
945 
946 float64_t CCHAIDTree::likelihood_ratio_statistic(SGVector<float64_t> feat, SGVector<float64_t> labels,
947  SGVector<float64_t> weights, int32_t &r, int32_t &c)
948 {
949  SGVector<float64_t> ufeat=feat.clone();
950  SGVector<float64_t> ulabels=labels.clone();
951  r=ufeat.unique(ufeat.vector,ufeat.vlen);
952  c=ulabels.unique(ulabels.vector,ulabels.vlen);
953 
954  // contingency table, weight table
955  SGMatrix<int32_t> ct(r,c);
956  ct.zero();
957  SGMatrix<float64_t> wt(r,c);
958  wt.zero();
959  for (int32_t i=0;i<feat.vlen;i++)
960  {
961  // calculate row
962  int32_t row=-1;
963  for (int32_t j=0;j<r;j++)
964  {
965  if (feat[i]==ufeat[j])
966  {
967  row=j;
968  break;
969  }
970  }
971 
972  // calculate col
973  int32_t col=-1;
974  for (int32_t j=0;j<c;j++)
975  {
976  if (labels[i]==ulabels[j])
977  {
978  col=j;
979  break;
980  }
981  }
982 
983  ct(row,col)++;
984  wt(row,col)+=weights[i];
985  }
986 
987  SGMatrix<float64_t> expmat_indep=expected_cf_indep_model(ct,wt);
988 
989  SGVector<float64_t> score(c);
990  score.range_fill(1.0);
991  SGMatrix<float64_t> expmat_row_effects=expected_cf_row_effects_model(ct,wt,score);
992 
993  float64_t ret=0.;
994  for (int32_t i=0;i<r;i++)
995  {
996  for (int32_t j=0;j<c;j++)
997  ret+=expmat_row_effects(i,j)*CMath::log(expmat_row_effects(i,j)/expmat_indep(i,j));
998  }
999 
1000  return 2*ret;
1001 }
1002 
1003 float64_t CCHAIDTree::pchi2_statistic(SGVector<float64_t> feat, SGVector<float64_t> labels, SGVector<float64_t> weights,
1004  int32_t &r, int32_t &c)
1005 {
1006  SGVector<float64_t> ufeat=feat.clone();
1007  SGVector<float64_t> ulabels=labels.clone();
1008  r=ufeat.unique(ufeat.vector,ufeat.vlen);
1009  c=ulabels.unique(ulabels.vector,ulabels.vlen);
1010 
1011  // contingency table, weight table
1012  SGMatrix<int32_t> ct(r,c);
1013  ct.zero();
1014  SGMatrix<float64_t> wt(r,c);
1015  wt.zero();
1016  for (int32_t i=0;i<feat.vlen;i++)
1017  {
1018  // calculate row
1019  int32_t row=-1;
1020  for (int32_t j=0;j<r;j++)
1021  {
1022  if (feat[i]==ufeat[j])
1023  {
1024  row=j;
1025  break;
1026  }
1027  }
1028 
1029  // calculate col
1030  int32_t col=-1;
1031  for (int32_t j=0;j<c;j++)
1032  {
1033  if (labels[i]==ulabels[j])
1034  {
1035  col=j;
1036  break;
1037  }
1038  }
1039 
1040  ct(row,col)++;
1041  wt(row,col)+=weights[i];
1042  }
1043 
1044  SGMatrix<float64_t> expected_cf=expected_cf_indep_model(ct,wt);
1045 
1046  float64_t ret=0.;
1047  for (int32_t i=0;i<r;i++)
1048  {
1049  for (int32_t j=0;j<c;j++)
1050  ret+=CMath::pow((ct(i,j)-expected_cf(i,j)),2)/expected_cf(i,j);
1051  }
1052 
1053  return ret;
1054 }
1055 
1056 SGMatrix<float64_t> CCHAIDTree::expected_cf_row_effects_model(SGMatrix<int32_t> ct, SGMatrix<float64_t> wt, SGVector<float64_t> score)
1057 {
1058  int32_t r=ct.num_rows;
1059  int32_t c=ct.num_cols;
1060 
1061  // compute row sum(n_i.'s) and column sum(n_.j's)
1062  SGVector<int32_t> row_sum(r);
1063  SGVector<int32_t> col_sum(c);
1064  for (int32_t i=0;i<r;i++)
1065  {
1066  int32_t sum=0;
1067  for (int32_t j=0;j<c;j++)
1068  sum+=ct(i,j);
1069 
1070  row_sum[i]=sum;
1071  }
1072  for (int32_t i=0;i<c;i++)
1073  {
1074  int32_t sum=0;
1075  for (int32_t j=0;j<r;j++)
1076  sum+=ct(j,i);
1077 
1078  col_sum[i]=sum;
1079  }
1080 
1081  // compute s_bar
1082  float64_t numer=0.;
1083  float64_t denom=0.;
1084  for (int32_t j=0;j<c;j++)
1085  {
1086  float64_t w_j=0.;
1087  for (int32_t i=0;i<r;i++)
1088  w_j+=wt(i,j);
1089 
1090  denom+=w_j;
1091  numer+=w_j*score[j];
1092  }
1093 
1094  float64_t s_bar=numer/denom;
1095 
1096  // element-wise normalize and invert weight matrix w_ij(new)=n_ij/w_ij(old)
1097  for (int32_t i=0;i<r;i++)
1098  {
1099  for (int32_t j=0;j<c;j++)
1100  wt(i,j)=(ct(i,j)-0.f)/wt(i,j);
1101  }
1102 
1103  SGMatrix<float64_t> m_k=wt.clone();
1104  SGVector<float64_t> alpha(r);
1105  SGVector<float64_t> beta(c);
1106  SGVector<float64_t> gamma(r);
1107  alpha.fill_vector(alpha.vector,alpha.vlen,1.0);
1108  beta.fill_vector(beta.vector,beta.vlen,1.0);
1109  gamma.fill_vector(gamma.vector,gamma.vlen,1.0);
1110  float64_t epsilon=1e-8;
1111  while(true)
1112  {
1113  // update alpha
1114  for (int32_t i=0;i<r;i++)
1115  {
1116  float64_t sum=0.;
1117  for (int32_t j=0;j<c;j++)
1118  sum+=m_k(i,j);
1119 
1120  alpha[i]*=(row_sum[i]-0.f)/sum;
1121  }
1122 
1123  // update beta
1124  for (int32_t j=0;j<c;j++)
1125  {
1126  float64_t sum=0.;
1127  for (int32_t i=0;i<r;i++)
1128  sum+=wt(i,j)*alpha[i]*CMath::pow(gamma[i],(score[j]-s_bar));
1129 
1130  beta[j]=(col_sum[j]-0.f)/sum;
1131  }
1132 
1133  // compute g_i for updating gamma
1134  SGVector<float64_t> g(r);
1135  SGMatrix<float64_t> m_star(r,c);
1136  for (int32_t i=0;i<r;i++)
1137  {
1138  for (int32_t j=0;j<c;j++)
1139  m_star(i,j)=wt(i,j)*alpha[i]*beta[j]*CMath::pow(gamma[i],score[j]-s_bar);
1140  }
1141 
1142  for (int32_t i=0;i<r;i++)
1143  {
1144  numer=0.;
1145  denom=0.;
1146  for (int32_t j=0;j<c;j++)
1147  {
1148  numer+=(score[j]-s_bar)*(ct(i,j)-m_star(i,j));
1149  denom+=CMath::pow((score[j]-s_bar),2)*m_star(i,j);
1150  }
1151 
1152  g[i]=1+numer/denom;
1153  }
1154 
1155  // update gamma
1156  for (int32_t i=0;i<r;i++)
1157  gamma[i]=(g[i]>0)?gamma[i]*g[i]:gamma[i];
1158 
1159  // update m_k
1160  SGMatrix<float64_t> m_kplus(r,c);
1161  float64_t max_diff=0.;
1162  for (int32_t i=0;i<r;i++)
1163  {
1164  for (int32_t j=0;j<c;j++)
1165  {
1166  m_kplus(i,j)=wt(i,j)*alpha[i]*beta[j]*CMath::pow(gamma[i],(score[j]-s_bar));
1167  float64_t abs_diff=CMath::abs(m_kplus(i,j)-m_k(i,j));
1168  if (abs_diff>max_diff)
1169  max_diff=abs_diff;
1170  }
1171  }
1172 
1173  m_k=m_kplus;
1174  if (max_diff<epsilon)
1175  break;
1176  }
1177 
1178  return m_k;
1179 }
1180 
1181 SGMatrix<float64_t> CCHAIDTree::expected_cf_indep_model(SGMatrix<int32_t> ct, SGMatrix<float64_t> wt)
1182 {
1183  int32_t r=ct.num_rows;
1184  int32_t c=ct.num_cols;
1185 
1186  // compute row sum(n_i.'s) and column sum(n_.j's)
1187  SGVector<int32_t> row_sum(r);
1188  SGVector<int32_t> col_sum(c);
1189  for (int32_t i=0;i<r;i++)
1190  {
1191  int32_t sum=0;
1192  for (int32_t j=0;j<c;j++)
1193  sum+=ct(i,j);
1194 
1195  row_sum[i]=sum;
1196  }
1197  for (int32_t i=0;i<c;i++)
1198  {
1199  int32_t sum=0;
1200  for (int32_t j=0;j<r;j++)
1201  sum+=ct(j,i);
1202 
1203  col_sum[i]=sum;
1204  }
1205 
1206  SGMatrix<float64_t> ret(r,c);
1207 
1208  // if all weights are 1 - m_ij=n_i.*n_.j/n..
1209  if (!m_weights_set)
1210  {
1211  int32_t total_sum=(r<=c)?row_sum.sum(row_sum):col_sum.sum(col_sum);
1212 
1213  for (int32_t i=0;i<r;i++)
1214  {
1215  for (int32_t j=0;j<c;j++)
1216  ret(i,j)=(row_sum[i]*col_sum[j]-0.f)/(total_sum-0.f);
1217  }
1218  }
1219  else
1220  {
1221  // element-wise normalize and invert weight matrix w_ij(new)=n_ij/w_ij(old)
1222  for (int32_t i=0;i<r;i++)
1223  {
1224  for (int32_t j=0;j<c;j++)
1225  wt(i,j)=(ct(i,j)-0.f)/wt(i,j);
1226  }
1227 
1228  // iteratively estimate mij
1229  SGMatrix<float64_t> m_k=wt.clone();
1230  SGVector<float64_t> alpha(r);
1231  SGVector<float64_t> beta(c);
1232  alpha.fill_vector(alpha.vector,alpha.vlen,1.0);
1233  beta.fill_vector(beta.vector,beta.vlen,1.0);
1234  float64_t epsilon=1e-8;
1235  while (true)
1236  {
1237  // update alpha
1238  for (int32_t i=0;i<r;i++)
1239  {
1240  float64_t sum=0.;
1241  for (int32_t j=0;j<c;j++)
1242  sum+=m_k(i,j);
1243 
1244  alpha[i]*=(row_sum[i]-0.f)/sum;
1245  }
1246 
1247  // update beta
1248  for (int32_t j=0;j<c;j++)
1249  {
1250  float64_t sum=0.;
1251  for (int32_t i=0;i<r;i++)
1252  sum+=wt(i,j)*alpha[i];
1253 
1254  beta[j]=(col_sum[j]-0.f)/sum;
1255  }
1256 
1257  // update m_k
1258  SGMatrix<float64_t> m_kplus(r,c);
1259  float64_t max_diff=0.0;
1260  for (int32_t i=0;i<r;i++)
1261  {
1262  for (int32_t j=0;j<c;j++)
1263  {
1264  m_kplus(i,j)=wt(i,j)*alpha[i]*beta[j];
1265  float64_t abs_diff=CMath::abs(m_kplus(i,j)-m_k(i,j));
1266  if (abs_diff>max_diff)
1267  max_diff=abs_diff;
1268  }
1269  }
1270 
1271  m_k=m_kplus;
1272 
1273  if (max_diff<epsilon)
1274  break;
1275  }
1276 
1277  ret=m_k;
1278  }
1279 
1280  return ret;
1281 }
1282 
1283 float64_t CCHAIDTree::sum_of_squared_deviation(SGVector<float64_t> lab, SGVector<float64_t> weights, float64_t &mean)
1284 {
1285  mean=0;
1286  float64_t total_weight=0;
1287  for (int32_t i=0;i<lab.vlen;i++)
1288  {
1289  mean+=lab[i]*weights[i];
1290  total_weight+=weights[i];
1291  }
1292 
1293  mean/=total_weight;
1294  float64_t dev=0;
1295  for (int32_t i=0;i<lab.vlen;i++)
1296  dev+=weights[i]*(lab[i]-mean)*(lab[i]-mean);
1297 
1298  return dev;
1299 }
1300 
1301 bool CCHAIDTree::continuous_to_ordinal(CDenseFeatures<float64_t>* feats)
1302 {
1303  // assimilate continuous breakpoints
1304  int32_t count_cont=0;
1305  for (int32_t i=0;i<feats->get_num_features();i++)
1306  {
1307  if (m_feature_types[i]==2)
1308  count_cont++;
1309  }
1310 
1311  if (count_cont==0)
1312  return false;
1313 
1314  REQUIRE(m_num_breakpoints>0,"Number of breakpoints for continuous to ordinal conversion not set.\n")
1315 
1316  SGVector<int32_t> cont_ind(count_cont);
1317  int32_t ci=0;
1318  for (int32_t i=0;i<feats->get_num_features();i++)
1319  {
1320  if (m_feature_types[i]==2)
1321  cont_ind[ci++]=i;
1322  }
1323 
1324  // form breakpoints matrix
1325  m_cont_breakpoints=SGMatrix<float64_t>(m_num_breakpoints,count_cont);
1326  int32_t bin_size=feats->get_num_vectors()/m_num_breakpoints;
1327  for (int32_t i=0;i<count_cont;i++)
1328  {
1329  int32_t left=feats->get_num_vectors()%m_num_breakpoints;
1330  int32_t end_pt=-1;
1331 
1332  SGVector<float64_t> values(feats->get_num_vectors());
1333  for (int32_t j=0;j<values.vlen;j++)
1334  values[j]=feats->get_feature_vector(j)[cont_ind[i]];
1335 
1336  CMath::qsort(values);
1337 
1338  for (int32_t j=0;j<m_num_breakpoints;j++)
1339  {
1340  if (left>0)
1341  {
1342  left--;
1343  end_pt+=bin_size+1;
1344  m_cont_breakpoints(j,i)=values[end_pt];
1345  }
1346  else
1347  {
1348  end_pt+=bin_size;
1349  m_cont_breakpoints(j,i)=values[end_pt];
1350  }
1351  }
1352  }
1353 
1354  // update data matrix
1355  modify_data_matrix(feats);
1356 
1357  return true;
1358 }
1359 
1360 void CCHAIDTree::modify_data_matrix(CDenseFeatures<float64_t>* feats)
1361 {
1362  int32_t c=0;
1363  for (int32_t i=0;i<feats->get_num_features();i++)
1364  {
1365  if (m_feature_types[i]!=2)
1366  continue;
1367 
1368  // continuous to ordinal conversion
1369  for (int32_t j=0;j<feats->get_num_vectors();j++)
1370  {
1371  for (int32_t k=0;k<m_num_breakpoints;k++)
1372  {
1373  if (feats->get_feature_vector(j)[i]<=m_cont_breakpoints(k,c))
1374  {
1375  feats->get_feature_vector(j)[i]=m_cont_breakpoints(k,c);
1376  break;
1377  }
1378  }
1379  }
1380 
1381  c++;
1382  }
1383 }
1384 
1385 void CCHAIDTree::init()
1386 {
1387  m_feature_types=SGVector<int32_t>();
1388  m_weights=SGVector<float64_t>();
1389  m_dependent_vartype=0;
1390  m_weights_set=false;
1391  m_max_tree_depth=0;
1392  m_min_node_size=0;
1393  m_alpha_merge=0.05;
1394  m_alpha_split=0.05;
1395  m_cont_breakpoints=SGMatrix<float64_t>();
1396  m_num_breakpoints=0;
1397 
1398  SG_ADD(&m_weights,"m_weights", "weights", MS_NOT_AVAILABLE);
1399  SG_ADD(&m_weights_set,"m_weights_set", "weights set", MS_NOT_AVAILABLE);
1400  SG_ADD(&m_feature_types,"m_feature_types", "feature types", MS_NOT_AVAILABLE);
1401  SG_ADD(&m_dependent_vartype,"m_dependent_vartype", "dependent variable type", MS_NOT_AVAILABLE);
1402  SG_ADD(&m_max_tree_depth,"m_max_tree_depth", "max tree depth", MS_NOT_AVAILABLE);
1403  SG_ADD(&m_min_node_size,"m_min_node_size", "min node size", MS_NOT_AVAILABLE);
1404  SG_ADD(&m_alpha_merge,"m_alpha_merge", "alpha-merge", MS_NOT_AVAILABLE);
1405  SG_ADD(&m_alpha_split,"m_alpha_split", "alpha-split", MS_NOT_AVAILABLE);
1406  SG_ADD(&m_cont_breakpoints,"m_cont_breakpoints", "breakpoints in continuous attributes", MS_NOT_AVAILABLE);
1407  SG_ADD(&m_num_breakpoints,"m_num_breakpoints", "number of breakpoints", MS_NOT_AVAILABLE);
1408 }
CTreeMachineNode< CHAIDTreeNodeData > node_t
Definition: TreeMachine.h:52
void range_fill(T start=0)
Definition: SGVector.cpp:171
virtual ~CCHAIDTree()
Definition: CHAIDTree.cpp:61
static CRegressionLabels * to_regression(CLabels *base_labels)
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:221
structure to store data of a node of CHAID. This can be used as a template type in TreeMachineNode cl...
virtual ELabelType get_label_type() const =0
SGVector< int32_t > feature_class
Real Labels are real-valued labels.
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
int32_t get_num_features() const
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
Definition: CHAIDTree.cpp:99
SGVector< float64_t > distinct_features
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
static float64_t fdistribution_cdf(float64_t x, float64_t d1, float64_t d2)
Definition: Statistics.cpp:537
real valued labels (e.g. for regression, classifier outputs)
Definition: LabelTypes.h:22
multi-class labels 0,1,...
Definition: LabelTypes.h:20
virtual bool train_machine(CFeatures *data=NULL)
Definition: CHAIDTree.cpp:154
SGMatrix< ST > get_feature_matrix()
SGMatrix< T > clone()
Definition: SGMatrix.cpp:256
static const float64_t MIN_REAL_NUMBER
Definition: Math.h:2062
CLabels * m_labels
Definition: Machine.h:361
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
index_t num_cols
Definition: SGMatrix.h:376
virtual void add_child(CTreeMachineNode *child)
CTreeMachineNode< CHAIDTreeNodeData > * m_root
Definition: TreeMachine.h:156
#define SG_REF(x)
Definition: SGObject.h:54
void set_dependent_vartype(int32_t var)
Definition: CHAIDTree.cpp:148
index_t num_rows
Definition: SGMatrix.h:374
void set_root(CTreeMachineNode< CHAIDTreeNodeData > *root)
Definition: TreeMachine.h:78
static void qsort(T *output, int32_t size)
Definition: Math.h:1313
Multiclass Labels for multi-class classification.
index_t vlen
Definition: SGVector.h:494
EProblemType
Definition: Machine.h:110
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:115
virtual int32_t get_num_vectors() const
virtual EProblemType get_machine_problem_type() const
Definition: CHAIDTree.cpp:65
double float64_t
Definition: common.h:50
virtual void remove_subset()
Definition: Labels.cpp:49
virtual void add_subset(SGVector< index_t > subset)
Definition: Labels.cpp:39
static T sum(T *vec, int32_t len)
Return sum(vec)
Definition: SGVector.h:354
virtual EFeatureClass get_feature_class() const =0
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an a...
virtual CDynamicObjectArray * get_children()
SGVector< int32_t > get_feature_types() const
Definition: CHAIDTree.cpp:138
virtual bool is_label_valid(CLabels *lab) const
Definition: CHAIDTree.cpp:82
void clear_feature_types()
Definition: CHAIDTree.cpp:143
static int64_t nchoosek(int32_t n, int32_t k)
Definition: Math.h:1191
#define SG_UNREF(x)
Definition: SGObject.h:55
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void remove_subset()
Definition: Features.cpp:322
The class Features is the base class of all feature objects.
Definition: Features.h:68
static float64_t exp(float64_t x)
Definition: Math.h:621
void set_feature_types(SGVector< int32_t > ft)
Definition: CHAIDTree.cpp:133
static float64_t log(float64_t v)
Definition: Math.h:922
static CDenseFeatures * obtain_from_generic(CFeatures *const base_features)
SGVector< T > clone() const
Definition: SGVector.cpp:207
int32_t get_num_elements() const
Definition: DynamicArray.h:200
SGVector< float64_t > get_weights() const
Definition: CHAIDTree.cpp:119
CSGObject * get_element(int32_t index) const
static const float64_t MISSING
Definition: CHAIDTree.h:393
Matrix::Scalar max(Matrix m)
Definition: Redux.h:68
class TreeMachine, a base class for tree based multiclass classifiers. This class is derived from CBa...
Definition: TreeMachine.h:48
#define SG_ADD(...)
Definition: SGObject.h:84
static float64_t chi2_cdf(float64_t x, float64_t k)
Definition: Statistics.cpp:374
Dense integer or floating point labels.
Definition: DenseLabels.h:35
virtual CRegressionLabels * apply_regression(CFeatures *data=NULL)
Definition: CHAIDTree.cpp:106
static CMulticlassLabels * to_multiclass(CLabels *base_labels)
static int32_t unique(T *output, int32_t size)
Definition: SGVector.cpp:785
static int32_t pow(bool x, int32_t n)
Definition: Math.h:535
static const float64_t MAX_REAL_NUMBER
Definition: Math.h:2061
const T & get_element(int32_t idx1, int32_t idx2=0, int32_t idx3=0) const
Definition: DynamicArray.h:212
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:310
void set_weights(SGVector< float64_t > w)
Definition: CHAIDTree.cpp:113
static T abs(T a)
Definition: Math.h:179

SHOGUN Machine Learning Toolbox - Documentation