52 REQUIRE(data,
"Data required for classification in apply_multiclass\n")
65 prune_tree_from_current_node(validation_data,validation_labels,current,epsilon);
111 REQUIRE(data,
"Data required for training\n")
119 REQUIRE(m_weights.
vlen==num_vectors,
"Length of weights vector (currently %d) should be same as"
120 " number of vectors in data (presently %d)",m_weights.
vlen,num_vectors)
131 REQUIRE(m_nominal.
vlen==num_features,
"Length of m_nominal vector (currently %d) should "
132 "be same as number of features in data (presently %d)",m_nominal.
vlen,num_features)
136 SG_WARNING(
"Feature types are not specified. All features are considered as continuous in training")
144 set_root(C45train(data, m_weights, dynamic_cast<CMulticlassLabels*>(
m_labels), feature_ids, 0));
152 REQUIRE(data,
"data matrix cannot be NULL\n");
153 REQUIRE(class_labels,
"class labels cannot be NULL\n");
162 int32_t most_label=labels[0];
163 int32_t most_weight=weights[0];
164 int32_t weight=weights[0];
166 for (int32_t i=1; i<labels.
vlen; i++)
168 if (labels[i]==labels[i-1])
172 else if (weight>most_weight)
175 most_label=labels[i-1];
184 if (weight>most_weight)
187 most_label=labels[labels.
vlen-1];
190 node->
data.class_label=most_label;
191 node->data.total_weight=weights.
sum(weights.
vector,weights.
vlen);
192 node->data.weight_minus=0.0;
193 for (int32_t i=0;i<labels.
vlen;i++)
195 if (class_labels->
get_label(i)!=most_label)
196 node->data.weight_minus+=weights[i];
204 if (feature_id_vector.
vlen==0)
209 for (int32_t i=1;i<num_vecs;i++)
229 int32_t best_feature_index=-1;
233 if (m_nominal[feature_id_vector[i]])
235 float64_t gain=informational_gain_attribute(i,feats,weights,class_labels);
239 best_feature_index=i;
246 for (int32_t k=0; k<num_vecs; k++)
251 max_value=feature_values[k];
254 for (int32_t k=0;k<num_vecs;k++)
261 for (int32_t l=0;l<num_vecs;l++)
265 else if (feature_values[l]<=z)
266 temp_feat_mat(0,l)=0.;
268 temp_feat_mat(0,l)=1.;
272 float64_t gain=informational_gain_attribute(0,temp_feats,weights,class_labels);
277 best_feature_index=i;
290 if (!m_nominal[feature_id_vector[best_feature_index]])
293 for(int32_t p=0;p<num_vecs;p++)
299 if (feature_cache[p]<=threshold)
308 for (int32_t i=0; i<num_vecs; i++)
312 int32_t num_missing=0;
314 for (int32_t j=0;j<num_vecs;j++)
319 weight_missing+=weights[j];
325 for (int32_t j=0;j<num_vecs;j++)
328 best_features_unique[index++]=best_feature_values[j];
331 int32_t uniques_num=best_features_unique.unique(best_features_unique.vector,best_features_unique.vlen);
334 for (int32_t i=0; i<uniques_num; i++)
338 float64_t active_feature_value=best_features_unique[i];
340 for (int32_t j=0; j<num_vecs; j++)
342 if (active_feature_value==best_feature_values[j] ||
CMath::fequals(best_feature_values[j],
MISSING,0))
352 for (int32_t j=0; j<num_vecs; j++)
355 if (active_feature_value==sample[best_feature_index] ||
CMath::fequals(sample[best_feature_index],
MISSING,0))
358 for (int32_t k=0; k<sample.
size(); k++)
360 if (k!=best_feature_index)
361 mat(++idx, cnt) = sample[k];
364 new_labels_vector[cnt]=class_labels->
get_labels()[j];
366 new_weights[cnt]=weights[j];
375 float64_t numer=new_weights.
sum(new_weights.vector,new_weights.vlen);
376 float64_t rec_weight=numer/(node->data.total_weight-weight_missing);
378 for (int32_t j=0;j<num_vecs;j++)
381 new_weights[cnt++]=rec_weight;
382 else if (best_feature_values[j]==active_feature_value)
389 for (int32_t j=0;j<feature_id_vector.
vlen;j++)
391 if (j!=best_feature_index)
392 new_feature_id_vector[++cnt]=feature_id_vector[j];
400 node_t* child=C45train(new_data,new_weights,new_class_labels,new_feature_id_vector,level+1);
401 node->data.attribute_id=feature_id_vector[best_feature_index];
402 if (m_nominal[feature_id_vector[best_feature_index]])
403 child->data.transit_if_feature_value=active_feature_value;
405 child->data.transit_if_feature_value=threshold;
407 node->add_child(child);
414 if (!m_nominal[feature_id_vector[best_feature_index]])
417 for(int32_t p=0;p<num_vecs;p++)
428 if (current->data.attribute_id==-1)
434 if (m_nominal[current->data.attribute_id])
442 for (int32_t j=0; j<feature_matrix.
num_cols; j++)
446 if (child_transit==feature_matrix(current->data.attribute_id,j))
457 for (int32_t j=0; j<feature_matrix.
num_cols;j++)
459 float64_t child_transit=child->
data.transit_if_feature_value;
461 if (child_transit==feature_matrix(current->data.attribute_id,j))
472 prune_tree_from_current_node(feats,gnd_truth,child,epsilon);
482 REQUIRE(children->
get_num_elements()==2,
"The chosen attribute in current node is continuous. Expected number of"
483 " children is 2 but current node has %d children.",children->
get_num_elements())
488 int32_t count_left=0;
489 for (int32_t k=0;k<feature_matrix.
num_cols;k++)
491 if (feature_matrix(current->data.attribute_id,k)<=left_child->
data.transit_if_feature_value)
499 for (int32_t k=0;k<feature_matrix.
num_cols;k++)
501 if (feature_matrix(current->data.attribute_id,k)<=left_child->
data.transit_if_feature_value)
513 prune_tree_from_current_node(feats,gnd_truth,left_child,epsilon);
519 if (count_left<feature_matrix.
num_cols)
524 prune_tree_from_current_node(feats,gnd_truth,right_child,epsilon);
535 CMulticlassLabels* predicted_unpruned=apply_multiclass_from_current_node(feats, current);
537 for (int32_t i=0; i<feature_matrix.
num_cols; i++)
538 pruned_labels[i]=current->
data.class_label;
546 if (unpruned_accuracy<pruned_accuracy+epsilon)
549 current->set_children(null_children);
558 float64_t CC45ClassifierTree::informational_gain_attribute(int32_t attr_no,
CFeatures* data,
561 REQUIRE(data,
"Data required for information gain calculation\n")
563 "Dense data required for information gain calculation\n")
567 int32_t num_vecs=feats->get_num_vectors();
572 int32_t num_missing=0;
573 for (int32_t i=0;i<num_vecs;i++)
582 for (int32_t i=0; i<num_vecs; i++)
591 for (int32_t i=0; i<num_vecs; i++)
596 gain_weights[index]=weights[i];
597 label_vector[index++]=class_labels->get_label(i);
601 num_vecs-=num_missing;
608 int32_t uniques_num=attr_val_unique.
unique(attr_val_unique.
vector,attr_val_unique.
vlen);
610 for (int32_t i=0; i<uniques_num; i++)
613 int32_t attr_count=0;
616 for (int32_t j=0; j<num_vecs; j++)
618 if (gain_attribute_values[j]==attr_val_unique[i])
620 weight_count+=gain_weights[j];
629 for (int32_t j=0; j<num_vecs; j++)
631 if (gain_attribute_values[j]==attr_val_unique[i])
633 sub_weights[count]=gain_weights[j];
634 sub_class[count++]=gain_labels->get_label(j);
639 float64_t sub_entropy=entropy(sub_labels,sub_weights);
640 gain += sub_entropy*weight_count/total_weight;
645 float64_t data_entropy=entropy(gain_labels,gain_weights);
646 gain = data_entropy-gain;
650 gain*=(num_vecs-0.f)/(num_vecs+num_missing-0.f);
670 weight_count+=weights[j];
675 log_ratios[i]=weight_count/total_weight;
683 node_t* current,
bool set_certainty)
685 REQUIRE(feats,
"Features should not be NULL")
686 REQUIRE(current, "Current node should not be NULL")
688 int32_t num_vecs=feats->get_num_vectors();
694 for (int32_t i=0; i<num_vecs; i++)
707 if (m_nominal[node->data.attribute_id])
714 child=
dynamic_cast<node_t*
>(el);
716 SG_ERROR(
"%d element of children is NULL\n",j);
718 if (child->data.transit_if_feature_value==sample[node->data.attribute_id])
726 children=node->get_children();
743 left_child=
dynamic_cast<node_t*
>(el);
747 el=children->get_element(1);
750 right_child=dynamic_cast<
node_t*>(el);
754 if (left_child->data.transit_if_feature_value>=sample[node->data.attribute_id])
761 children=node->get_children();
770 children=node->get_children();
779 labels[i]=node->data.class_label;
782 m_certainty[i]=(node->data.total_weight-node->data.weight_minus)/node->data.total_weight;
virtual ~CC45ClassifierTree()
CTreeMachineNode< C45TreeNodeData > node_t
SGVector< float64_t > get_weights() const
void range_fill(T start=0)
void set_feature_types(SGVector< bool > ft)
static void fill_vector(T *vec, int32_t len, T value)
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
int32_t get_num_features() const
virtual int32_t get_num_labels() const
virtual float64_t evaluate(CLabels *predicted, CLabels *ground_truth)
virtual bool train_machine(CFeatures *data=NULL)
SGMatrix< ST > get_feature_matrix()
SGVector< float64_t > get_unique_labels()
static const float64_t MIN_REAL_NUMBER
The class MulticlassAccuracy used to compute accuracy of multiclass classification.
CTreeMachineNode< C45TreeNodeData > * get_root()
void clear_feature_types()
int32_t get_num_elements() const
float64_t get_label(int32_t idx)
SGVector< float64_t > get_labels_copy()
void set_root(CTreeMachineNode< C45TreeNodeData > *root)
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
SGVector< float64_t > get_labels()
static void qsort(T *output, int32_t size)
Multiclass Labels for multi-class classification.
static bool fequals(const T &a, const T &b, const float64_t eps, bool tolerant=false)
float64_t transit_if_feature_value
Class C45ClassifierTree implements the C4.5 algorithm for decision tree learning. The algorithm steps...
Class SGObject is the base class of all shogun objects.
virtual int32_t get_num_vectors() const
static float64_t entropy(float64_t *p, int32_t len)
virtual void remove_subset()
virtual void add_subset(SGVector< index_t > subset)
static T sum(T *vec, int32_t len)
Return sum(vec)
virtual EFeatureClass get_feature_class() const =0
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an a...
void set_weights(SGVector< float64_t > w)
static const float64_t MISSING
all of classes and functions are contained in the shogun namespace
SGVector< bool > get_feature_types() const
virtual void remove_subset()
SGVector< float64_t > get_certainty_vector() const
The class Features is the base class of all feature objects.
structure to store data of a node of C4.5 tree. This can be used as a template type in TreeMachineNod...
static float64_t log(float64_t v)
SGVector< T > clone() const
CSGObject * get_element(int32_t index) const
Matrix::Scalar max(Matrix m)
class TreeMachine, a base class for tree based multiclass classifiers. This class is derived from CBa...
void prune_tree(CDenseFeatures< float64_t > *validation_data, CMulticlassLabels *validation_labels, float64_t epsilon=0.f)
static int32_t unique(T *output, int32_t size)
static const float64_t NOT_A_NUMBER
not a number
virtual void add_subset(SGVector< index_t > subset)