51 REQUIRE(data,
"Data required for classification in apply_multiclass\n")
64 prune_tree_machine(validation_data, validation_labels, current, epsilon);
72 REQUIRE(data,
"Data required for training\n")
79 set_root(id3train(data, dynamic_cast<CMulticlassLabels*>(
m_labels), feature_ids, 0));
95 int32_t most_label = labels[0];
99 for (int32_t i=1; i<labels.
vlen; i++)
101 if (labels[i] == labels[i-1])
105 else if (count>most_num)
108 most_label = labels[i-1];
117 node->
data.class_label = most_label;
120 if (most_num == labels.
vlen)
124 if (feature_id_vector.
vlen == 0)
129 int32_t best_feature_index = -1;
132 float64_t gain = informational_gain_attribute(i,feats,class_labels);
137 best_feature_index = i;
143 for (int32_t i=0; i<num_vecs; i++)
149 for (int32_t i=0; i<best_labels_unique.
vlen; i++)
152 int32_t num_cols = 0;
153 float64_t active_feature_value = best_labels_unique[i];
155 for (int32_t j=0; j<num_vecs; j++)
157 if ( active_feature_value == best_feature_values[j])
166 for (int32_t j=0; j<num_vecs; j++)
169 if (active_feature_value == sample[best_feature_index])
172 for (int32_t k=0; k<sample.
size(); k++)
174 if (k != best_feature_index)
175 mat(++idx, cnt) = sample[k];
178 new_labels_vector[cnt] = class_labels->
get_labels()[j];
186 for (int32_t j=0;j<feature_id_vector.
vlen;j++)
188 if (j!=best_feature_index)
189 new_feature_id_vector[++cnt] = feature_id_vector[j];
195 node_t* child = id3train(new_data, new_class_labels, new_feature_id_vector, level+1);
196 child->data.transit_if_feature_value = active_feature_value;
197 node->data.attribute_id = feature_id_vector[best_feature_index];
198 node->add_child(child);
209 float64_t CID3ClassifierTree::informational_gain_attribute(int32_t attr_no,
CFeatures* data,
212 REQUIRE(data,
"Data required for information gain calculation\n")
214 "Dense data required for information gain calculation\n")
218 int32_t num_vecs = feats->get_num_vectors();
223 for (int32_t i=0; i<num_vecs; i++)
224 attribute_values[i] = (feats->get_feature_vector(i))[attr_no];
229 for (int32_t i=0; i<attr_val_unique.vlen; i++)
232 int32_t attr_count=0;
234 for (int32_t j=0; j<num_vecs; j++)
236 if (attribute_values[j] == attr_val_unique[i])
243 for (int32_t j=0; j<num_vecs; j++)
245 if (attribute_values[j] == attr_val_unique[i])
246 sub_class[count++] = class_labels->
get_label(j);
250 float64_t sub_entropy = entropy(sub_labels);
251 gain += sub_entropy*(attr_count-0.f)/(num_vecs-0.f);
256 float64_t data_entropy = entropy(class_labels);
257 gain = data_entropy-gain;
281 if (log_ratios[i] != 0)
300 for (int32_t j=0; j<feature_matrix.
num_cols; j++)
304 if (child_transit == feature_matrix(current->data.attribute_id,j))
312 for (int32_t j=0; j<feature_matrix.
num_cols;j++)
314 float child_transit = child->
data.transit_if_feature_value;
316 if (child_transit == feature_matrix(current->data.attribute_id,j))
327 prune_tree_machine(feats, gnd_truth, child, epsilon);
337 CMulticlassLabels* predicted_unpruned = apply_multiclass_from_current_node(feats, current);
339 for (int32_t i=0; i<feature_matrix.
num_cols; i++)
340 pruned_labels[i] = current->
data.class_label;
348 if (unpruned_accuracy<pruned_accuracy+epsilon)
351 current->set_children(null_children);
363 REQUIRE(feats,
"Features should not be NULL")
364 REQUIRE(current, "Current node should not be NULL")
366 int32_t num_vecs = feats->get_num_vectors();
370 for (int32_t i=0; i<num_vecs; i++)
385 if (child->data.transit_if_feature_value
386 == sample[node->data.attribute_id])
394 children = node->get_children();
407 labels[i] = node->data.class_label;
CTreeMachineNode< id3TreeNodeData > node_t
void range_fill(T start=0)
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
int32_t get_num_features() const
virtual int32_t get_num_labels() const
virtual float64_t evaluate(CLabels *predicted, CLabels *ground_truth)
virtual ~CID3ClassifierTree()
SGMatrix< ST > get_feature_matrix()
float64_t transit_if_feature_value
SGVector< float64_t > get_unique_labels()
The class MulticlassAccuracy used to compute accuracy of multiclass classification.
CTreeMachineNode< id3TreeNodeData > * get_root()
int32_t get_num_elements() const
float64_t get_label(int32_t idx)
structure to store data of a node of id3 tree. This can be used as a template type in TreeMachineNode...
SGVector< float64_t > get_labels_copy()
void set_root(CTreeMachineNode< id3TreeNodeData > *root)
SGVector< float64_t > get_labels()
static void qsort(T *output, int32_t size)
Multiclass Labels for multi-class classification.
virtual int32_t get_num_vectors() const
static float64_t entropy(float64_t *p, int32_t len)
virtual void remove_subset()
virtual void add_subset(SGVector< index_t > subset)
virtual EFeatureClass get_feature_class() const =0
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an a...
all of classes and functions are contained in the shogun namespace
virtual void remove_subset()
virtual bool train_machine(CFeatures *data=NULL)
The class Features is the base class of all feature objects.
static float64_t log(float64_t v)
CSGObject * get_element(int32_t index) const
Matrix::Scalar max(Matrix m)
class TreeMachine, a base class for tree based multiclass classifiers. This class is derived from CBa...
bool prune_tree(CDenseFeatures< float64_t > *validation_data, CMulticlassLabels *validation_labels, float64_t epsilon=0.f)
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
virtual void add_subset(SGVector< index_t > subset)