36 using namespace Eigen;
39 const float64_t CCARTree::MISSING=CMath::MAX_REAL_NUMBER;
41 const float64_t CCARTree::MIN_SPLIT_GAIN=1e-7;
80 SG_ERROR(
"label type supplied is not supported\n")
104 REQUIRE(data,
"Data required for classification in apply_multiclass\n")
109 REQUIRE(current,
"Tree machine not yet trained.\n");
118 REQUIRE(data,
"Data required for classification in apply_multiclass\n")
145 root=
dynamic_cast<bnode_t*
>(element);
164 root=
dynamic_cast<bnode_t*
>(element);
166 SG_ERROR(
"%d element is NULL\n",min_index);
215 REQUIRE(folds>1,
"Number of folds is expected to be greater than 1. Supplied value is %d\n",folds)
226 REQUIRE(depth>0,
"Max allowed tree depth should be greater than 0. Supplied value is %d\n",depth)
237 REQUIRE(nsize>0,
"Min allowed node size should be greater than 0. Supplied value is %d\n",nsize)
243 REQUIRE(ep>=0,
"Input epsilon value is expected to be greater than or equal to 0\n")
249 REQUIRE(data,
"Data required for training\n")
258 " number of vectors in data (presently %d)",
m_weights.
vlen,num_vectors)
270 "be same as number of features in data (presently %d)",
m_nominal.
vlen,num_features)
274 SG_WARNING(
"Feature types are not specified. All features are considered as continuous in training")
302 for(int32_t i=0; i<sorted_indices.
num_cols; i++)
303 for(int32_t j=0; j<sorted_indices.
num_rows; j++)
304 sorted_indices(j,i)=j;
309 map_sorted_feats=map_data.transpose();
311 #pragma omp parallel for
312 for(int32_t i=0; i<sorted_feats.
num_cols; i++)
319 REQUIRE(labels,
"labels have to be supplied\n");
320 REQUIRE(data,
"data matrix has to be supplied\n");
334 for (int32_t i=0;i<labels_vec.
vlen;i++)
335 sum+=labels_vec[i]*weights[i];
340 node->
data.node_label=sum/tot;
341 node->
data.total_weight=tot;
350 int32_t
max=weights[0];
353 int32_t c=weights[0];
354 for (int32_t i=1;i<lab.
vlen;i++)
356 if (lab[i]==lab[i-1])
378 node->
data.node_label=lab[maxi];
381 node->
data.total_weight=weights.
sum(weights);
382 node->
data.weight_minus_node=node->
data.total_weight-
max;
386 SG_ERROR(
"mode should be either PT_MULTICLASS or PT_REGRESSION\n");
393 node->
data.num_leaves=1;
394 node->
data.weight_minus_branch=node->
data.weight_minus_node;
401 node->
data.num_leaves=1;
402 node->
data.weight_minus_branch=node->
data.weight_minus_node;
413 int32_t num_missing_final=0;
416 int32_t best_attribute;
427 best_attribute=
compute_best_attribute(
m_sorted_features,weights,labels,left,right,left_final,num_missing_final,c_left,c_right,0,indices);
430 best_attribute=
compute_best_attribute(mat,weights,labels,left,right,left_final,num_missing_final,c_left,c_right);
432 if (best_attribute==-1)
434 node->
data.num_leaves=1;
435 node->
data.weight_minus_branch=node->
data.weight_minus_node;
444 if (num_missing_final>0)
448 for (int32_t i=0;i<num_vecs;i++)
450 if (mat(best_attribute,i)!=
MISSING)
451 is_left_final[ilf++]=left_final[i];
457 int32_t count_left=0;
458 for (int32_t c=0;c<num_vecs;c++)
459 count_left=(left_final[c])?count_left+1:count_left;
467 for (int32_t c=0;c<num_vecs;c++)
472 weightsl[l++]=weights[c];
477 weightsr[r++]=weights[c];
496 node->
data.attribute_id=best_attribute;
497 node->
left(left_child);
498 node->
right(right_child);
499 left_child->
data.transit_into_values=left_transit;
500 right_child->
data.transit_into_values=right_transit;
501 node->
data.num_leaves=left_child->
data.num_leaves+right_child->
data.num_leaves;
502 node->
data.weight_minus_branch=left_child->
data.weight_minus_branch+right_child->
data.weight_minus_branch;
515 ulabels[0]=labels_vec[sidx[0]];
518 for (int32_t i=1;i<sidx.vlen;i++)
520 if (labels_vec[sidx[i]]<=labels_vec[sidx[start]]+delta)
524 ulabels[n_ulabels]=labels_vec[sidx[i]];
533 int32_t &count_right, int32_t subset_size,
const SGVector<index_t>& active_indices)
555 total_wclasses.
zero();
558 for (int32_t i=0;i<num_vecs;i++)
560 for (int32_t j=0;j<n_ulabels;j++)
562 if (
CMath::abs(labels_vec[i]-ulabels[j])<=delta)
565 total_wclasses[j]+=weights[i];
575 num_feats=subset_size;
580 int32_t best_attribute=-1;
585 count_indices.
zero();
592 for(int32_t j=0;j<active_indices.
size();j++)
594 if (indices_mask[active_indices[j]]>=0)
595 dupes[indices_mask[active_indices[j]]]=j;
597 indices_mask[active_indices[j]]=j;
598 count_indices[active_indices[j]]++;
602 for (int32_t i=0;i<num_feats;i++)
614 if (indices_mask[sorted_indices[j]]>=0)
616 int32_t count_index = count_indices[sorted_indices[j]];
619 feats[count]=temp_col[j];
620 sorted_args[count]=indices_mask[sorted_indices[j]];
631 for (int32_t j=0;j<num_vecs;j++)
632 feats[j]=mat(idx[i],j);
638 int32_t n_nm_vecs=feats.
vlen;
640 while (feats[n_nm_vecs-1]==
MISSING)
642 total_wclasses[simple_labels[sorted_args[n_nm_vecs-1]]]-=weights[sorted_args[n_nm_vecs-1]];
647 if (feats[n_nm_vecs-1]<=feats[0]+
EQ_DELTA)
658 for (int32_t j=1;j<n_nm_vecs;j++)
660 if (feats[j]==feats[j-1])
663 simple_feats[j]=(++c);
669 for (int32_t j=1;j<n_nm_vecs;j++)
671 if (feats[j]==feats[j-1])
674 ufeats[++u]=feats[j];
679 for (int32_t k=1;k<num_cases;k++)
694 for (int32_t p=0;p<c+1;p++)
698 for (int32_t j=0;j<n_nm_vecs;j++)
700 is_left[sorted_args[j]]=feats_left[simple_feats[j]];
701 if (is_left[sorted_args[j]])
702 wleft[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
704 wright[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
706 for (int32_t j=n_nm_vecs-1;j>=0;j--)
709 is_left[j]=is_left[dupes[j]];
714 g=
gain(wleft,wright,total_wclasses);
716 g=
gain(wleft,wright,total_wclasses,ulabels);
718 SG_ERROR(
"Undefined problem statement\n");
722 best_attribute=idx[i];
725 num_missing_final=num_vecs-n_nm_vecs;
728 for (int32_t l=0;l<c+1;l++)
729 count_left=(feats_left[l])?count_left+1:count_left;
731 count_right=c+1-count_left;
735 for (int32_t w=0;w<c+1;w++)
740 right[r++]=ufeats[w];
750 left_wclasses.
zero();
755 right_wclasses[simple_labels[sorted_args[0]]]-=weights[sorted_args[0]];
756 left_wclasses[simple_labels[sorted_args[0]]]+=weights[sorted_args[0]];
757 for (int32_t j=1;j<n_nm_vecs;j++)
761 right_wclasses[simple_labels[sorted_args[j]]]-=weights[sorted_args[j]];
762 left_wclasses[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
768 g=
gain(left_wclasses,right_wclasses,total_wclasses);
770 g=
gain(left_wclasses,right_wclasses,total_wclasses,ulabels);
772 SG_ERROR(
"Undefined problem statement\n");
777 best_attribute=idx[i];
779 num_missing_final=num_vecs-n_nm_vecs;
785 right_wclasses[simple_labels[sorted_args[j]]]-=weights[sorted_args[j]];
786 left_wclasses[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
791 while (n_nm_vecs<feats.
vlen)
793 total_wclasses[simple_labels[sorted_args[n_nm_vecs-1]]]+=weights[sorted_args[n_nm_vecs-1]];
798 if (best_attribute==-1)
803 left[0]=best_threshold;
804 right[0]=best_threshold;
814 if (indices_mask[sorted_indices[i]]>=0)
816 is_left_final[indices_mask[sorted_indices[i]]]=(temp_vec[i]<=best_threshold);
822 for (int32_t i=num_vecs-1;i>=0;i--)
825 is_left_final[i]=is_left_final[dupes[i]];
831 for (int32_t i=0;i<num_vecs;i++)
832 is_left_final[i]=(mat(best_attribute,i)<=best_threshold);
836 return best_attribute;
928 int32_t num_unique=feats.
unique(feats.vector,feats.vlen);
932 for (int32_t j=0;j<num_unique-1;j++)
953 lambda=(p-(1-numer/denom))/p;
955 lambda=(p-(1-numerc/denom))/p;
985 int32_t num_unique=feats.
unique(feats.vector,feats.vlen);
988 int32_t num_cases=
CMath::pow(2,(num_unique-1));
989 for (int32_t j=1;j<num_cases;j++)
992 for (int32_t k=0;k<num_unique;k++)
998 for (int32_t q=0;q<num_unique;q++)
1000 if (feats[q]==m(attr,intersect_vecs->
get_element(k)))
1002 intersect_vecs_left[k]=feats_left[q];
1013 if (intersect_vecs_left[k]==is_left[intersect_vecs->
get_element(k)])
1022 lambda=(p-(1-numer/denom))/p;
1024 lambda=(p-(1-numerc/denom))/p;
1034 for (int32_t q=0;q<num_unique;q++)
1036 if (feats[q]==m(attr,missing_vecs->
get_element(k)))
1039 is_left[missing_vecs->
get_element(k)]=feats_left[q];
1041 is_left[missing_vecs->
get_element(k)]=!feats_left[q];
1062 return lsd_n-(lsd_l*(total_lweight/total_weight))-(lsd_r*(total_rweight/total_weight));
1074 return gini_n-(gini_l*(total_lweight/total_weight))-(gini_r*(total_rweight/total_weight));
1080 total_weight=map_weighted_lab_classes.sum();
1081 float64_t gini=map_weighted_lab_classes.dot(map_weighted_lab_classes);
1083 gini=1.0-(gini/(total_weight*total_weight));
1092 float64_t mean=map_weights.dot(map_feats);
1093 total_weight=map_weights.sum();
1097 for (int32_t i=0;i<weights.
vlen;i++)
1098 dev+=weights[i]*(feats[i]-mean)*(feats[i]-mean);
1100 return dev/total_weight;
1106 REQUIRE(num_vecs>0,
"No data provided in apply\n");
1109 for (int32_t i=0;i<num_vecs;i++)
1116 while(node->
data.num_leaves!=1)
1124 for (int32_t k=0;k<comp.
vlen;k++)
1126 if (comp[k]==sample[node->
data.attribute_id])
1147 if (sample[node->
data.attribute_id]<=leftchild->
data.transit_into_values[0])
1163 labels[i]=node->
data.node_label;
1182 SG_ERROR(
"mode should be either PT_MULTICLASS or PT_REGRESSION\n");
1200 for (int32_t i=0;i<folds;i++)
1205 for (int32_t j=0;j<num_vecs;j++)
1215 SG_ERROR(
"Unfortunately you have reached the very low probability event where atleast one of "
1216 "the subsets in cross-validation is not represented at all. Please re-run.")
1250 if (jth_element!=NULL)
1251 current_root=
dynamic_cast<bnode_t*
>(jth_element);
1253 SG_ERROR(
"%d element is NULL which should not be",j);
1274 int32_t min_index=-1;
1286 for (int32_t j=0;j<folds;j++)
1289 for (int32_t k=base;k<num_alphak[j]+base-1;k++)
1302 base+=num_alphak[j];
1315 best_tree_root=
dynamic_cast<bnode_t*
>(element);
1317 SG_ERROR(
"%d element is NULL which should not be",min_index);
1329 REQUIRE(labels,
"input labels cannot be NULL");
1330 REQUIRE(reference,
"reference labels cannot be NULL")
1341 for (int32_t i=0;i<weights.
vlen;i++)
1352 for (int32_t i=0;i<weights.
vlen;i++)
1367 REQUIRE(tree,
"Tree not provided for pruning.\n");
1381 t1_root=
dynamic_cast<bnode_t*
>(t1root);
1383 SG_ERROR(
"t1_root is NULL. This is not expected\n")
1387 while(t1_root->
data.num_leaves>1)
1395 t2_root=
dynamic_cast<bnode_t*
>(t2root);
1397 SG_ERROR(
"t1_root is NULL. This is not expected\n")
1417 if (node->
data.num_leaves!=1)
1425 weak_links[2]=(node->
data.weight_minus_node-node->
data.weight_minus_branch)/node->
data.total_weight;
1426 weak_links[2]/=(node->
data.num_leaves-1.0);
1438 if (node->
data.num_leaves==1)
1442 g/=(node->
data.num_leaves-1.0);
1445 node->
data.num_leaves=1;
1446 node->
data.weight_minus_branch=node->
data.weight_minus_node;
1458 node->
data.num_leaves=left->
data.num_leaves+right->
data.num_leaves;
1459 node->
data.weight_minus_branch=left->
data.weight_minus_branch+right->
data.weight_minus_branch;
1468 if (node->
data.num_leaves!=1)
1476 node->
data.num_leaves=left->
data.num_leaves+right->
data.num_leaves;
1477 node->
data.weight_minus_branch=left->
data.weight_minus_branch+right->
data.weight_minus_branch;
1478 if (node->
data.weight_minus_node==node->
data.weight_minus_branch)
1480 node->
data.num_leaves=1;
CLabels * apply_from_current_node(CDenseFeatures< float64_t > *feats, bnode_t *current)
virtual int32_t compute_best_attribute(const SGMatrix< float64_t > &mat, const SGVector< float64_t > &weights, CLabels *labels, SGVector< float64_t > &left, SGVector< float64_t > &right, SGVector< bool > &is_left_final, int32_t &num_missing, int32_t &count_left, int32_t &count_right, int32_t subset_size=0, const SGVector< int32_t > &active_indices=SGVector< index_t >())
void range_fill(T start=0)
static void permute(SGVector< T > v, CRandom *rand=NULL)
The node of the tree structure forming a TreeMachine The node contains pointer to its parent and poin...
int32_t get_max_depth() const
static void fill_vector(T *vec, int32_t len, T value)
virtual ELabelType get_label_type() const =0
void set_weights(SGVector< float64_t > w)
Real Labels are real-valued labels.
SGVector< bool > get_feature_types() const
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
int32_t get_min_node_size() const
void set_machine_problem_type(EProblemType mode)
int32_t get_num_folds() const
CDynamicObjectArray * prune_tree(CTreeMachine< CARTreeNodeData > *tree)
The class Labels models labels, i.e. class assignments of objects.
SGMatrix< index_t > m_sorted_indices
virtual int32_t get_num_labels() const =0
real valued labels (e.g. for regression, classifier outputs)
static void qsort_index(T1 *output, T2 *index, uint32_t size)
multi-class labels 0,1,...
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
float64_t find_weakest_alpha(bnode_t *node)
void form_t1(bnode_t *node)
virtual bool is_label_valid(CLabels *lab) const
static const float64_t EQ_DELTA
CTreeMachineNode< CARTreeNodeData > * get_root()
SGVector< bool > surrogate_split(SGMatrix< float64_t > data, SGVector< float64_t > weights, SGVector< bool > nm_left, int32_t attr)
virtual bool train_machine(CFeatures *data=NULL)
float64_t get_label(int32_t idx)
float64_t m_label_epsilon
virtual void set_labels(CLabels *lab)
void set_root(CTreeMachineNode< CARTreeNodeData > *root)
class to add subset support to another class. A CSubsetStackStack instance should be added and wrappe...
static void qsort(T *output, int32_t size)
Multiclass Labels for multi-class classification.
static bool fequals(const T &a, const T &b, const float64_t eps, bool tolerant=false)
virtual void set_children(CDynamicObjectArray *children)
CSubset * get_last_subset() const
virtual CSubsetStack * get_subset_stack()
void clear_feature_types()
float64_t gain(SGVector< float64_t > wleft, SGVector< float64_t > wright, SGVector< float64_t > wtotal, SGVector< float64_t > labels)
Class SGObject is the base class of all shogun objects.
float64_t least_squares_deviation(const SGVector< float64_t > &labels, const SGVector< float64_t > &weights, float64_t &total_weight)
bool set_element(T e, int32_t idx1, int32_t idx2=0, int32_t idx3=0)
virtual int32_t get_num_vectors() const
void set_min_node_size(int32_t nsize)
void right(CBinaryTreeMachineNode *r)
void set_sorted_features(SGMatrix< float64_t > &sorted_feats, SGMatrix< index_t > &sorted_indices)
void handle_missing_vecs_for_continuous_surrogate(SGMatrix< float64_t > m, CDynamicArray< int32_t > *missing_vecs, CDynamicArray< float64_t > *association_index, CDynamicArray< int32_t > *intersect_vecs, SGVector< bool > is_left, SGVector< float64_t > weights, float64_t p, int32_t attr)
void set_num_folds(int32_t folds)
float64_t gini_impurity_index(const SGVector< float64_t > &weighted_lab_classes, float64_t &total_weight)
static SGVector< index_t > argsort(SGVector< T > vector)
virtual void remove_subset()
SGVector< float64_t > get_weights() const
CTreeMachine * clone_tree()
virtual void add_subset(SGVector< index_t > subset)
static T sum(T *vec, int32_t len)
Return sum(vec)
virtual EFeatureClass get_feature_class() const =0
T * get_column_vector(index_t col) const
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an a...
SGMatrix< float64_t > m_sorted_features
virtual CBinaryTreeMachineNode< CARTreeNodeData > * CARTtrain(CFeatures *data, SGVector< float64_t > weights, CLabels *labels, int32_t level)
void set_max_depth(int32_t depth)
void handle_missing_vecs_for_nominal_surrogate(SGMatrix< float64_t > m, CDynamicArray< int32_t > *missing_vecs, CDynamicArray< float64_t > *association_index, CDynamicArray< int32_t > *intersect_vecs, SGVector< bool > is_left, SGVector< float64_t > weights, float64_t p, int32_t attr)
structure to store data of a node of CART. This can be used as a template type in TreeMachineNode cla...
void pre_sort_features(CFeatures *data, SGMatrix< float64_t > &sorted_feats, SGMatrix< index_t > &sorted_indices)
all of classes and functions are contained in the shogun namespace
virtual void remove_subset()
void set_feature_types(SGVector< bool > ft)
CBinaryTreeMachineNode< CARTreeNodeData > bnode_t
The class Features is the base class of all feature objects.
SGVector< bool > m_nominal
float64_t compute_error(CLabels *labels, CLabels *reference, SGVector< float64_t > weights)
void cut_weakest_link(bnode_t *node, float64_t alpha)
static const float64_t MIN_SPLIT_GAIN
SGVector< T > clone() const
virtual CRegressionLabels * apply_regression(CFeatures *data=NULL)
virtual bool has_subsets() const
int32_t get_num_elements() const
void prune_using_test_dataset(CDenseFeatures< float64_t > *feats, CLabels *gnd_truth, SGVector< float64_t > weights=SGVector< float64_t >())
CSGObject * get_element(int32_t index) const
Matrix::Scalar max(Matrix m)
void push_back(CSGObject *e)
class TreeMachine, a base class for tree based multiclass classifiers. This class is derived from CBa...
SGVector< float64_t > m_weights
static float32_t sqrt(float32_t x)
Dense integer or floating point labels.
CDynamicArray< float64_t > * m_alphas
void left(CBinaryTreeMachineNode *l)
static int32_t unique(T *output, int32_t size)
static int32_t pow(bool x, int32_t n)
static const float64_t MAX_REAL_NUMBER
const T & get_element(int32_t idx1, int32_t idx2=0, int32_t idx3=0) const
void set_const(T const_elem)
virtual void add_subset(SGVector< index_t > subset)
SGVector< float64_t > get_unique_labels(SGVector< float64_t > labels_vec, int32_t &n_ulabels)
void set_label_epsilon(float64_t epsilon)
void prune_by_cross_validation(CDenseFeatures< float64_t > *data, int32_t folds)
static void random_vector(T *vec, int32_t len, T min_value, T max_value)
static const float64_t MISSING