77 SG_ERROR(
"label type supplied is not supported\n")
101 REQUIRE(data,
"Data required for classification in apply_multiclass\n")
113 REQUIRE(data,
"Data required for classification in apply_multiclass\n")
140 root=
dynamic_cast<bnode_t*
>(element);
159 root=
dynamic_cast<bnode_t*
>(element);
161 SG_ERROR(
"%d element is NULL\n",min_index);
210 REQUIRE(folds>1,
"Number of folds is expected to be greater than 1. Supplied value is %d\n",folds)
221 REQUIRE(depth>0,
"Max allowed tree depth should be greater than 0. Supplied value is %d\n",depth)
232 REQUIRE(nsize>0,
"Min allowed node size should be greater than 0. Supplied value is %d\n",nsize)
238 REQUIRE(ep>=0,
"Input epsilon value is expected to be greater than or equal to 0\n")
244 REQUIRE(data,
"Data required for training\n")
253 " number of vectors in data (presently %d)",
m_weights.
vlen,num_vectors)
265 "be same as number of features in data (presently %d)",
m_nominal.
vlen,num_features)
269 SG_WARNING(
"Feature types are not specified. All features are considered as continuous in training")
287 REQUIRE(labels,
"labels have to be supplied\n");
288 REQUIRE(data,
"data matrix has to be supplied\n");
302 for (int32_t i=0;i<labels_vec.
vlen;i++)
303 sum+=labels_vec[i]*weights[i];
308 node->
data.node_label=sum/tot;
309 node->
data.total_weight=tot;
318 int32_t
max=weights[0];
321 int32_t c=weights[0];
322 for (int32_t i=1;i<lab.
vlen;i++)
324 if (lab[i]==lab[i-1])
346 node->
data.node_label=lab[maxi];
349 node->
data.total_weight=weights.
sum(weights);
350 node->
data.weight_minus_node=node->
data.total_weight-
max;
354 SG_ERROR(
"mode should be either PT_MULTICLASS or PT_REGRESSION\n");
361 node->
data.num_leaves=1;
362 node->
data.weight_minus_branch=node->
data.weight_minus_node;
369 node->
data.num_leaves=1;
370 node->
data.weight_minus_branch=node->
data.weight_minus_node;
381 int32_t num_missing_final=0;
385 int32_t best_attribute=
compute_best_attribute(mat,weights,labels_vec,left,right,left_final,num_missing_final,c_left,c_right);
387 if (best_attribute==-1)
389 node->
data.num_leaves=1;
390 node->
data.weight_minus_branch=node->
data.weight_minus_node;
399 if (num_missing_final>0)
403 for (int32_t i=0;i<num_vecs;i++)
405 if (mat(best_attribute,i)!=
MISSING)
406 is_left_final[ilf++]=left_final[i];
412 int32_t count_left=0;
413 for (int32_t c=0;c<num_vecs;c++)
414 count_left=(left_final[c])?count_left+1:count_left;
422 for (int32_t c=0;c<num_vecs;c++)
427 weightsl[l++]=weights[c];
432 weightsr[r++]=weights[c];
451 node->
data.attribute_id=best_attribute;
452 node->
left(left_child);
453 node->
right(right_child);
454 left_child->
data.transit_into_values=left_transit;
455 right_child->
data.transit_into_values=right_transit;
456 node->
data.num_leaves=left_child->
data.num_leaves+right_child->
data.num_leaves;
457 node->
data.weight_minus_branch=left_child->
data.weight_minus_branch+right_child->
data.weight_minus_branch;
470 ulabels[0]=labels_vec[sidx[0]];
473 for (int32_t i=1;i<sidx.vlen;i++)
475 if (labels_vec[sidx[i]]<=labels_vec[sidx[start]]+delta)
479 ulabels[n_ulabels]=labels_vec[sidx[i]];
488 int32_t &count_right)
505 total_wclasses.
zero();
508 for (int32_t i=0;i<num_vecs;i++)
510 for (int32_t j=0;j<n_ulabels;j++)
512 if (
CMath::abs(labels_vec[i]-ulabels[j])<=delta)
515 total_wclasses[j]+=weights[i];
522 int32_t best_attribute=-1;
524 for (int32_t i=0;i<num_feats;i++)
527 for (int32_t j=0;j<num_vecs;j++)
534 int32_t n_nm_vecs=feats.
vlen;
535 while (feats[sorted_args[n_nm_vecs-1]]==
MISSING)
537 total_wclasses[simple_labels[sorted_args[n_nm_vecs-1]]]-=weights[sorted_args[n_nm_vecs-1]];
542 if (feats[sorted_args[n_nm_vecs-1]]<=feats[sorted_args[0]]+
EQ_DELTA)
551 simple_feats[sorted_args[0]]=0;
553 for (int32_t j=1;j<n_nm_vecs;j++)
555 if (feats[sorted_args[j]]==feats[sorted_args[j-1]])
556 simple_feats[sorted_args[j]]=c;
558 simple_feats[sorted_args[j]]=(++c);
562 ufeats[0]=feats[sorted_args[0]];
564 for (int32_t j=1;j<n_nm_vecs;j++)
566 if (feats[sorted_args[j]]==feats[sorted_args[j-1]])
569 ufeats[++u]=feats[sorted_args[j]];
574 for (int32_t k=1;k<num_cases;k++)
589 for (int32_t p=0;p<c+1;p++)
593 for (int32_t j=0;j<n_nm_vecs;j++)
595 is_left[sorted_args[j]]=feats_left[simple_feats[sorted_args[j]]];
596 if (is_left[sorted_args[j]])
597 wleft[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
599 wright[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
604 g=
gain(wleft,wright,total_wclasses);
606 g=
gain(wleft,wright,total_wclasses,ulabels);
608 SG_ERROR(
"Undefined problem statement\n");
615 num_missing_final=num_vecs-n_nm_vecs;
618 for (int32_t l=0;l<c+1;l++)
619 count_left=(feats_left[l])?count_left+1:count_left;
621 count_right=c+1-count_left;
625 for (int32_t w=0;w<c+1;w++)
630 right[r++]=ufeats[w];
640 left_wclasses.
zero();
645 right_wclasses[simple_labels[sorted_args[0]]]-=weights[sorted_args[0]];
646 left_wclasses[simple_labels[sorted_args[0]]]+=weights[sorted_args[0]];
647 for (int32_t j=1;j<n_nm_vecs;j++)
649 if (feats[sorted_args[j]]<=z+
EQ_DELTA)
651 right_wclasses[simple_labels[sorted_args[j]]]-=weights[sorted_args[j]];
652 left_wclasses[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
659 g=
gain(left_wclasses,right_wclasses,total_wclasses);
661 g=
gain(left_wclasses,right_wclasses,total_wclasses,ulabels);
663 SG_ERROR(
"Undefined problem statement\n");
670 num_missing_final=num_vecs-n_nm_vecs;
673 z=feats[sorted_args[j]];
674 if (feats[sorted_args[n_nm_vecs-1]]<=z+
EQ_DELTA)
677 right_wclasses[simple_labels[sorted_args[j]]]-=weights[sorted_args[j]];
678 left_wclasses[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
683 while (n_nm_vecs<feats.
vlen)
685 total_wclasses[simple_labels[sorted_args[n_nm_vecs-1]]]+=weights[sorted_args[n_nm_vecs-1]];
690 if (best_attribute==-1)
695 left[0]=best_threshold;
696 right[0]=best_threshold;
699 for (int32_t i=0;i<num_vecs;i++)
700 is_left_final[i]=(mat(best_attribute,i)<=best_threshold);
703 return best_attribute;
795 int32_t num_unique=feats.
unique(feats.vector,feats.vlen);
799 for (int32_t j=0;j<num_unique-1;j++)
820 lambda=(p-(1-numer/denom))/p;
822 lambda=(p-(1-numerc/denom))/p;
852 int32_t num_unique=feats.
unique(feats.vector,feats.vlen);
855 int32_t num_cases=
CMath::pow(2,(num_unique-1));
856 for (int32_t j=1;j<num_cases;j++)
859 for (int32_t k=0;k<num_unique;k++)
865 for (int32_t q=0;q<num_unique;q++)
867 if (feats[q]==m(attr,intersect_vecs->
get_element(k)))
869 intersect_vecs_left[k]=feats_left[q];
880 if (intersect_vecs_left[k]==is_left[intersect_vecs->
get_element(k)])
889 lambda=(p-(1-numer/denom))/p;
891 lambda=(p-(1-numerc/denom))/p;
901 for (int32_t q=0;q<num_unique;q++)
903 if (feats[q]==m(attr,missing_vecs->
get_element(k)))
906 is_left[missing_vecs->
get_element(k)]=feats_left[q];
908 is_left[missing_vecs->
get_element(k)]=!feats_left[q];
929 return lsd_n-(lsd_l*(total_lweight/total_weight))-(lsd_r*(total_rweight/total_weight));
941 return gini_n-(gini_l*(total_lweight/total_weight))-(gini_r*(total_rweight/total_weight));
948 for (int32_t i=0;i<weighted_lab_classes.
vlen;i++)
950 total_weight+=weighted_lab_classes[i];
951 gini+=weighted_lab_classes[i]*weighted_lab_classes[i];
954 gini=1.0-(gini/(total_weight*total_weight));
962 for (int32_t i=0;i<weights.
vlen;i++)
964 mean+=feats[i]*weights[i];
965 total_weight+=weights[i];
970 for (int32_t i=0;i<weights.
vlen;i++)
971 dev+=weights[i]*(feats[i]-mean)*(feats[i]-mean);
973 return dev/total_weight;
980 for (int32_t i=0;i<num_vecs;i++)
987 while(node->
data.num_leaves!=1)
995 for (int32_t k=0;k<comp.
vlen;k++)
997 if (comp[k]==sample[node->
data.attribute_id])
1018 if (sample[node->
data.attribute_id]<=leftchild->
data.transit_into_values[0])
1034 labels[i]=node->
data.node_label;
1053 SG_ERROR(
"mode should be either PT_MULTICLASS or PT_REGRESSION\n");
1071 for (int32_t i=0;i<folds;i++)
1076 for (int32_t j=0;j<num_vecs;j++)
1086 SG_ERROR(
"Unfortunately you have reached the very low probability event where atleast one of "
1087 "the subsets in cross-validation is not represented at all. Please re-run.")
1121 if (jth_element!=NULL)
1122 current_root=
dynamic_cast<bnode_t*
>(jth_element);
1124 SG_ERROR(
"%d element is NULL which should not be",j);
1145 int32_t min_index=-1;
1157 for (int32_t j=0;j<folds;j++)
1160 for (int32_t k=base;k<num_alphak[j]+base-1;k++)
1173 base+=num_alphak[j];
1186 best_tree_root=
dynamic_cast<bnode_t*
>(element);
1188 SG_ERROR(
"%d element is NULL which should not be",min_index);
1200 REQUIRE(labels,
"input labels cannot be NULL");
1201 REQUIRE(reference,
"reference labels cannot be NULL")
1212 for (int32_t i=0;i<weights.
vlen;i++)
1223 for (int32_t i=0;i<weights.
vlen;i++)
1250 t1_root=
dynamic_cast<bnode_t*
>(t1root);
1252 SG_ERROR(
"t1_root is NULL. This is not expected\n")
1256 while(t1_root->
data.num_leaves>1)
1264 t2_root=
dynamic_cast<bnode_t*
>(t2root);
1266 SG_ERROR(
"t1_root is NULL. This is not expected\n")
1286 if (node->
data.num_leaves!=1)
1294 weak_links[2]=(node->
data.weight_minus_node-node->
data.weight_minus_branch)/node->
data.total_weight;
1295 weak_links[2]/=(node->
data.num_leaves-1.0);
1307 if (node->
data.num_leaves==1)
1311 g/=(node->
data.num_leaves-1.0);
1314 node->
data.num_leaves=1;
1315 node->
data.weight_minus_branch=node->
data.weight_minus_node;
1327 node->
data.num_leaves=left->
data.num_leaves+right->
data.num_leaves;
1328 node->
data.weight_minus_branch=left->
data.weight_minus_branch+right->
data.weight_minus_branch;
1337 if (node->
data.num_leaves!=1)
1345 node->
data.num_leaves=left->
data.num_leaves+right->
data.num_leaves;
1346 node->
data.weight_minus_branch=left->
data.weight_minus_branch+right->
data.weight_minus_branch;
1347 if (node->
data.weight_minus_node==node->
data.weight_minus_branch)
1349 node->
data.num_leaves=1;
CLabels * apply_from_current_node(CDenseFeatures< float64_t > *feats, bnode_t *current)
The node of the tree structure forming a TreeMachine The node contains pointer to its parent and poin...
int32_t get_max_depth() const
static void fill_vector(T *vec, int32_t len, T value)
virtual ELabelType get_label_type() const =0
void set_weights(SGVector< float64_t > w)
Real Labels are real-valued labels.
SGVector< bool > get_feature_types() const
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
int32_t get_min_node_size() const
void set_machine_problem_type(EProblemType mode)
int32_t get_num_folds() const
CDynamicObjectArray * prune_tree(CTreeMachine< CARTreeNodeData > *tree)
The class Labels models labels, i.e. class assignments of objects.
real valued labels (e.g. for regression, classifier outputs)
multi-class labels 0,1,...
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
float64_t find_weakest_alpha(bnode_t *node)
float64_t least_squares_deviation(SGVector< float64_t > labels, SGVector< float64_t > weights, float64_t &total_weight)
void form_t1(bnode_t *node)
virtual bool is_label_valid(CLabels *lab) const
static const float64_t EQ_DELTA
CTreeMachineNode< CARTreeNodeData > * get_root()
SGVector< bool > surrogate_split(SGMatrix< float64_t > data, SGVector< float64_t > weights, SGVector< bool > nm_left, int32_t attr)
virtual bool train_machine(CFeatures *data=NULL)
float64_t get_label(int32_t idx)
float64_t m_label_epsilon
virtual void set_labels(CLabels *lab)
void set_root(CTreeMachineNode< CARTreeNodeData > *root)
static void qsort(T *output, int32_t size)
Multiclass Labels for multi-class classification.
static bool fequals(const T &a, const T &b, const float64_t eps, bool tolerant=false)
virtual void set_children(CDynamicObjectArray *children)
float64_t gini_impurity_index(SGVector< float64_t > weighted_lab_classes, float64_t &total_weight)
void clear_feature_types()
float64_t gain(SGVector< float64_t > wleft, SGVector< float64_t > wright, SGVector< float64_t > wtotal, SGVector< float64_t > labels)
Class SGObject is the base class of all shogun objects.
bool set_element(T e, int32_t idx1, int32_t idx2=0, int32_t idx3=0)
virtual int32_t get_num_vectors() const
void set_min_node_size(int32_t nsize)
void right(CBinaryTreeMachineNode *r)
void handle_missing_vecs_for_continuous_surrogate(SGMatrix< float64_t > m, CDynamicArray< int32_t > *missing_vecs, CDynamicArray< float64_t > *association_index, CDynamicArray< int32_t > *intersect_vecs, SGVector< bool > is_left, SGVector< float64_t > weights, float64_t p, int32_t attr)
void set_num_folds(int32_t folds)
static SGVector< index_t > argsort(SGVector< T > vector)
virtual void remove_subset()
SGVector< float64_t > get_weights() const
CTreeMachine * clone_tree()
virtual void add_subset(SGVector< index_t > subset)
static T sum(T *vec, int32_t len)
Return sum(vec)
virtual EFeatureClass get_feature_class() const =0
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an a...
virtual CBinaryTreeMachineNode< CARTreeNodeData > * CARTtrain(CFeatures *data, SGVector< float64_t > weights, CLabels *labels, int32_t level)
void set_max_depth(int32_t depth)
void handle_missing_vecs_for_nominal_surrogate(SGMatrix< float64_t > m, CDynamicArray< int32_t > *missing_vecs, CDynamicArray< float64_t > *association_index, CDynamicArray< int32_t > *intersect_vecs, SGVector< bool > is_left, SGVector< float64_t > weights, float64_t p, int32_t attr)
structure to store data of a node of CART. This can be used as a template type in TreeMachineNode cla...
all of classes and functions are contained in the shogun namespace
virtual void remove_subset()
void set_feature_types(SGVector< bool > ft)
CBinaryTreeMachineNode< CARTreeNodeData > bnode_t
virtual int32_t compute_best_attribute(SGMatrix< float64_t > mat, SGVector< float64_t > weights, SGVector< float64_t > labels_vec, SGVector< float64_t > left, SGVector< float64_t > right, SGVector< bool > is_left_final, int32_t &num_missing, int32_t &count_left, int32_t &count_right)
The class Features is the base class of all feature objects.
SGVector< bool > m_nominal
float64_t compute_error(CLabels *labels, CLabels *reference, SGVector< float64_t > weights)
void cut_weakest_link(bnode_t *node, float64_t alpha)
static const float64_t MIN_SPLIT_GAIN
SGVector< T > clone() const
virtual CRegressionLabels * apply_regression(CFeatures *data=NULL)
int32_t get_num_elements() const
void prune_using_test_dataset(CDenseFeatures< float64_t > *feats, CLabels *gnd_truth, SGVector< float64_t > weights=SGVector< float64_t >())
CSGObject * get_element(int32_t index) const
Matrix::Scalar max(Matrix m)
void push_back(CSGObject *e)
class TreeMachine, a base class for tree based multiclass classifiers. This class is derived from CBa...
SGVector< float64_t > m_weights
static float32_t sqrt(float32_t x)
Dense integer or floating point labels.
CDynamicArray< float64_t > * m_alphas
void left(CBinaryTreeMachineNode *l)
static int32_t unique(T *output, int32_t size)
static int32_t pow(bool x, int32_t n)
static const float64_t MAX_REAL_NUMBER
const T & get_element(int32_t idx1, int32_t idx2=0, int32_t idx3=0) const
virtual void add_subset(SGVector< index_t > subset)
SGVector< float64_t > get_unique_labels(SGVector< float64_t > labels_vec, int32_t &n_ulabels)
void set_label_epsilon(float64_t epsilon)
void prune_by_cross_validation(CDenseFeatures< float64_t > *data, int32_t folds)
static void random_vector(T *vec, int32_t len, T min_value, T max_value)
static const float64_t MISSING