In [1]:

```
import os
# root of the Shogun data directory; can be overridden through the SHOGUN_DATA_DIR environment variable
SHOGUN_DATA_DIR = os.getenv('SHOGUN_DATA_DIR', '../../../../data')

# training data: one list per attribute, entry i of every list belongs to training sample i
train_income = ['Low','Medium','Low','High','Low','High','Medium','Medium','High','Low','Medium',
                'Medium','High','Low','Medium']
train_age = ['Old','Young','Old','Young','Old','Young','Young','Old','Old','Old','Young','Old',
             'Old','Old','Young']
train_education = ['University','College','University','University','University','College','College',
                   'High School','University','High School','College','High School','University','High School','College']
train_marital = ['Married','Single','Married','Single','Married','Single','Married','Single','Single',
                 'Married','Married','Single','Single','Married','Married']
train_usage = ['Low','Medium','Low','High','Low','Medium','Medium','Low','High','Low','Medium','Low',
               'High','Low','Medium']

# print data - print() with a single argument gives the same output on Python 2 and 3
# and matches the print(...) calls used by the later cells
print('Training Data Table : \n')
print('Income \t\t Age \t\t Education \t\t Marital Status \t Usage')
for row in zip(train_income, train_age, train_education, train_marital, train_usage):
    print(' \t\t '.join(row))
```

In [2]:

```
from modshogun import ID3ClassifierTree, RealFeatures, MulticlassLabels
from numpy import array, concatenate

# encoding dictionary: category name -> numeric code handed to Shogun
income = {'Low' : 1.0, 'Medium' : 2.0, 'High' : 3.0}
age = {'Young' : 1.0, 'Old' : 2.0}
education = {'High School' : 1.0, 'College' : 2.0, 'University' : 3.0}
marital_status = {'Married' : 1.0, 'Single' : 2.0}
usage = {'Low' : 1.0, 'Medium' : 2.0, 'High' : 3.0}

def _encode_in_place(values, mapping):
    """Replace every category name in `values` by its numeric code from `mapping`.

    The list is modified in place. Entries that already are codes are left
    untouched, so running this cell a second time no longer fails with a
    KeyError. An unknown category name still raises KeyError.
    """
    codes = set(mapping.values())
    values[:] = [v if v in codes else mapping[v] for v in values]

# encode training data
_encode_in_place(train_income, income)
_encode_in_place(train_age, age)
_encode_in_place(train_education, education)
_encode_in_place(train_marital, marital_status)
_encode_in_place(train_usage, usage)

# form Shogun feature matrix: one row per attribute, one column per training sample
train_data = array([train_income, train_age, train_education, train_marital])
train_feats = RealFeatures(train_data)
# form Shogun multiclass labels
labels = MulticlassLabels(array(train_usage))
```

Next, we learn our decision tree using the features and labels created.

In [3]:

```
# create an (untrained) ID3ClassifierTree object
id3 = ID3ClassifierTree()
# attach the encoded usage labels the tree has to learn
id3.set_labels(labels)
# learn the tree from the encoded training features; the value returned by train() is kept in is_successful
is_successful = id3.train(train_feats)
```

In [4]:

```
# test data: one list per attribute, entry i of every list belongs to test sample i
test_income = ['Medium','Medium','Low','High','High']
test_age = ['Old','Young','Old','Young','Old']
test_education = ['University','College','High School','University','College']
test_marital = ['Married','Single','Married','Single','Married']
# usage values written down for the test samples; the table below prints '?' instead of them
test_usage = ['Low','Medium','Low','High','High']

# tabulate test data - print() with a single argument gives the same output on Python 2 and 3
print('Test Data Table : \n')
print('Income \t\t Age \t\t Education \t\t Marital Status \t Usage')
for row in zip(test_income, test_age, test_education, test_marital):
    print(' \t\t '.join(row) + ' \t\t ?')
```

In [5]:

```
# encode test data in place (the decoding cell further down expects these lists to hold codes).
# Entries that already are codes are kept, so re-running this cell no longer raises KeyError.
for values, mapping in ((test_income, income),
                        (test_age, age),
                        (test_education, education),
                        (test_marital, marital_status)):
    codes = set(mapping.values())
    values[:] = [v if v in codes else mapping[v] for v in values]

# bind to shogun features: one row per attribute, one column per test sample
test_data = array([test_income, test_age, test_education, test_marital])
test_feats = RealFeatures(test_data)
# apply decision tree classification
test_labels = id3.apply_multiclass(test_feats)
```

Finally let us tabulate the results obtained and compare them with our intuitive predictions.

In [6]:

```
output = test_labels.get_labels()

def _decode(values, mapping):
    """Return `values` with numeric codes translated back to category names.

    `mapping` is one of the name -> code encoding dictionaries; it is inverted
    once instead of searching keys()/values() by position, which only works on
    Python 2 lists. Entries that are not codes (e.g. names that were already
    decoded) are passed through unchanged, so the cell can be re-run.
    """
    names = dict((code, name) for name, code in mapping.items())
    return [names.get(value, value) for value in values]

# decode back test data for printing (in place, as the lists were encoded in place)
test_income[:] = _decode(test_income, income)
test_age[:] = _decode(test_age, age)
test_education[:] = _decode(test_education, education)
test_marital[:] = _decode(test_marital, marital_status)
# predicted usage category for every test sample
output_labels = _decode(output, usage)

# print output data - print() with a single argument gives the same output on Python 2 and 3
print('Final Test Data Table : \n')
print('Income \t Age \t Education \t Marital Status \t Usage(predicted)')
for row in zip(test_income, test_age, test_education, test_marital, output_labels):
    print(' \t '.join(row[:4]) + ' \t\t ' + row[4])
```

In [7]:

```
# numeric codes of the class attribute
evaluation = dict(unacc=1.0, acc=2.0, good=3.0, vgood=4.0)

# numeric codes of the six non-class attributes
buying = dict(vhigh=1.0, high=2.0, med=3.0, low=4.0)
maint = dict(vhigh=1.0, high=2.0, med=3.0, low=4.0)
# the next two use literal syntax because keys such as '2' are not valid keyword names
doors = {'2': 1.0, '3': 2.0, '4': 3.0, '5more': 4.0}
persons = {'2': 1.0, '4': 2.0, 'more': 3.0}
lug_boot = dict(small=1.0, med=2.0, big=3.0)
safety = dict(low=1.0, med=2.0, high=3.0)
```

Next, let us read the file and form Shogun features and labels.

In [8]:

```
features = []
labels = []
# one encoder per column, in the order the columns appear on each line of car.data
column_encoders = [buying, maint, doors, persons, lug_boot, safety, evaluation]

# read data from file and encode; the with-block closes the file even if a line fails to parse
with open(os.path.join(SHOGUN_DATA_DIR, 'uci/car/car.data'), 'r') as f:
    for line in f:
        words = line.rstrip().split(',')
        if words == ['']:
            # blank line (e.g. at the end of the file) - nothing to encode
            continue
        encoded = [encoder[word] for encoder, word in zip(column_encoders, words)]
        # first six columns are the attributes, the seventh is the class
        features.append(encoded[0:6])
        labels.append(encoded[6])
```

From the entire dataset, let us choose some test vectors to form our test dataset.

In [9]:

```
from numpy import random, delete
features = array(features)
labels = array(labels)
# number of test vectors
num_test_vectors = 200
# draw the test indices WITHOUT replacement: random.randint could pick the same row several
# times, which put duplicate vectors in the test set and removed fewer than
# num_test_vectors rows from the training set
test_indices = random.permutation(features.shape[0])[:num_test_vectors]
test_features = features[test_indices]
test_labels = labels[test_indices]
# remove test vectors from training set
# NOTE: this rebinds features/labels, so re-running the cell shrinks the training set again
features = delete(features,test_indices,0)
labels = delete(labels,test_indices,0)
```

In [10]:

```
# wrap the held-out car vectors for Shogun; the transpose turns each vector into a column
test_feats = RealFeatures(test_features.T)
test_labels = MulticlassLabels(test_labels)

def ID3_routine(features, labels):
    """Train an ID3 tree on the given data and classify the test set with it.

    features -- training matrix with one row per vector (it is transposed here)
    labels   -- vector of numeric class labels, one per training vector
    Returns the labels object produced by apply_multiclass() for the
    module-level `test_feats` (the test set is not a parameter).
    """
    tree = ID3ClassifierTree()
    tree.set_labels(MulticlassLabels(labels))
    tree.train(RealFeatures(features.T))
    return tree.apply_multiclass(test_feats)

output = ID3_routine(features, labels)
```

In [11]:

```
from modshogun import MulticlassAccuracy
# Shogun object for calculating multiclass accuracy
accuracy = MulticlassAccuracy()
# print() call form, as used by the later cells; same output on Python 2 and 3
print('Accuracy : ' + str(accuracy.evaluate(output, test_labels)))
```

In [12]:

```
# list of error rates for all training dataset sizes
error_rate = []
# number of error rate readings taken for each value of dataset size
num_repetitions = 3
# loop over training dataset size
for i in range(500,1600,200):
indices = random.randint(features.shape[0], size = i)
train_features = features[indices]
train_labels = labels[indices]
average_error = 0
for i in xrange(num_repetitions):
output = ID3_routine(train_features, train_labels)
average_error = average_error + (1-accuracy.evaluate(output, test_labels))
error_rate.append(average_error/num_repetitions)
# plot the error rates
import matplotlib.pyplot as pyplot
% matplotlib inline
from scipy.interpolate import interp1d
from numpy import linspace, arange
fig,axis = pyplot.subplots(1,1)
x = arange(500,1600,200)
f = interp1d(x, error_rate)
xnew = linspace(500,1500,100)
pyplot.plot(x,error_rate,'o',xnew,f(xnew),'-')
pyplot.xlim([400,1600])
pyplot.xlabel('training dataset size')
pyplot.ylabel('Classification Error')
pyplot.title('Decision Tree Performance')
pyplot.show()
```

In [13]:

```
import matplotlib.pyplot as plt
from numpy import ones, zeros, random, concatenate
from modshogun import RealFeatures, MulticlassLabels
% matplotlib inline
def create_toy_classification_dataset(ncat,do_plot):
# create attribute values and labels for class 1
x = ones((1,ncat))
y = 1+random.rand(1,ncat)*4
lab = zeros(ncat)
# add attribute values and labels for class 2
x = concatenate((x,ones((1,ncat))),1)
y = concatenate((y,5+random.rand(1,ncat)*4),1)
lab = concatenate((lab,ones(ncat)))
# add attribute values and labels for class 3
x = concatenate((x,2*ones((1,ncat))),1)
y = concatenate((y,1+random.rand(1,ncat)*8),1)
lab = concatenate((lab,2*ones(ncat)))
# create test data
ntest = 20
x_t = concatenate((ones((1,3*ntest/4)),2*ones((1,ntest/4))),1)
y_t = 1+random.rand(1,ntest)*8
if do_plot:
# plot training data
c = ['r','g','b']
for i in range(3):
plt.scatter(x[0,lab==i],y[0,lab==i],color=c[i],marker='x',s=50)
# plot test data
plt.scatter(x_t[0,:],y_t[0,:],color='k',s=10,alpha=0.8)
plt.xlabel('attribute X')
plt.ylabel('attribute Y')
plt.show()
# form training feature matrix
train_feats = RealFeatures(concatenate((x,y),0))
# from training labels
train_labels = MulticlassLabels(lab)
# from test feature matrix
test_feats = RealFeatures(concatenate((x_t,y_t),0))
return (train_feats,train_labels,test_feats);
train_feats,train_labels,test_feats = create_toy_classification_dataset(20,True)
```

In [14]:

```
from numpy import array
from modshogun import C45ClassifierTree

def train_tree(feats,types,labels):
    """Bundle the steps of C4.5 training and return the trained tree.

    feats  -- training features
    types  -- boolean array, one entry per attribute (True = categorical,
              False = continuous, as used by the callers in this notebook)
    labels -- training labels
    """
    c45 = C45ClassifierTree()
    c45.set_labels(labels)
    c45.set_feature_types(types)
    c45.train(feats)
    return c45

# attribute X is categorical (True), attribute Y is continuous (False)
feat_types = array([True,False])
# train on the toy dataset and keep the resulting tree
C45Tree = train_tree(feats=train_feats, types=feat_types, labels=train_labels)
```

Now that we have trained the decision tree, we can use it to classify our test vectors.

In [15]:

```
def classify_data(tree,data):
    """Classify `data` with a trained tree.

    Returns a pair: the labels object from apply_multiclass() and the
    certainty vector the tree reports via get_certainty_vector() afterwards.
    """
    predicted = tree.apply_multiclass(data)
    return predicted, tree.get_certainty_vector()

out_labels,out_certainty = classify_data(C45Tree,test_feats)
```

In [16]:

```
from numpy import int32

def plot_toy_classification_results(train_feats,train_labels,test_feats,test_labels):
    """Scatter-plot classified test points on top of the training data.

    Test points are drawn as filled markers coloured by their label in
    `test_labels`; training points are drawn as smaller, semi-transparent
    crosses coloured by class. Labels 0, 1, 2 map to red, green, blue.
    All four arguments are Shogun feature / label objects.
    """
    colours = ['r','g','b']
    train_matrix = train_feats.get_feature_matrix()
    train_classes = train_labels.get_labels()
    test_matrix = test_feats.get_feature_matrix()
    predicted = test_labels.get_labels()

    # one scatter call per test point, coloured by its predicted class
    for column, label in enumerate(predicted):
        plt.scatter(test_matrix[0,column],test_matrix[1,column],color=colours[int32(label)],s=50)

    # training dataset, class by class, for visual comparison
    for cls, colour in enumerate(colours):
        members = train_classes==cls
        plt.scatter(train_matrix[0,members],train_matrix[1,members],color=colour,marker='x',s=30,alpha=0.7)

    plt.show()

plot_toy_classification_results(train_feats,train_labels,test_feats,out_labels)
```

In [17]:

```
import csv
from numpy import array

# dictionary to encode class names to class labels
to_label = {'Iris-setosa' : 0.0, 'Iris-versicolor' : 1.0, 'Iris-virginica' : 2.0}

# read csv file and separate out labels and features
lab = []
feat = []
with open( os.path.join(SHOGUN_DATA_DIR, 'uci/iris/iris.data')) as csvfile:
    for row in csv.reader(csvfile,delimiter=','):
        if not row:
            # csv.reader yields an empty list for a blank line; skip it instead of
            # failing with IndexError on row[4]
            continue
        # first four columns are the numeric attributes, the fifth is the class name
        feat.append([float(value) for value in row[0:4]])
        lab.append(to_label[row[4]])

lab = array(lab)
# transpose so that every column is one vector
feat = array(feat).T
```

In [18]:

```
from numpy import int32, random

# no.of vectors in test dataset
ntest = 25
# total number of vectors, taken from the loaded matrix (one column per vector)
# instead of hard-coding 150
nvectors = feat.shape[1]
# no. of vectors in train dataset
ntrain = nvectors-ntest

# randomize the order of vectors
subset = int32(random.permutation(nvectors))

# choose 1st ntrain from randomized set as training vectors
feats_train = feat[:,subset[0:ntrain]]
# form training labels correspondingly
train_labels = lab[subset[0:ntrain]]

# the remaining ntest vectors form the test features and labels (for accuracy evaluations)
feats_test = feat[:,subset[ntrain:ntrain+ntest]]
test_labels = lab[subset[ntrain:ntrain+ntest]]
```

In [19]:

```
import matplotlib.pyplot as plt
% matplotlib inline
# plot training features
c = ['r', 'g', 'b']
for i in range(3):
plt.scatter(feats_train[2,train_labels==i],feats_train[3,train_labels==i],color=c[i],marker='x')
# plot test data points in black
plt.scatter(feats_test[2,:],feats_test[3,:],color='k',marker='o')
plt.show()
```

First, let us create Shogun features and labels from the given data.

In [20]:

```
from modshogun import RealFeatures, MulticlassLabels
# training data: wrap the numpy training matrix and label vector in Shogun objects.
# NOTE: the names are rebound from arrays to Shogun objects, so the split cell has to be
# run again before this cell is re-run.
feats_train = RealFeatures(feats_train)
train_labels = MulticlassLabels(train_labels)
# test data: same wrapping for the held-out vectors
feats_test = RealFeatures(feats_test)
test_labels = MulticlassLabels(test_labels)
```

In [21]:

```
# shuffled int32 indices into the ntrain training vectors
subset = int32(random.permutation(ntrain))
# size of the validation subset
nvalidation = 45

# the leading indices are used for growing the tree, the trailing nvalidation for validation
train_subset = subset[:ntrain-nvalidation]
validation_subset = subset[ntrain-nvalidation:]
```

In [22]:

```
# set attribute types - all continuous (False = continuous, one entry per attribute)
feature_types = array([False, False, False, False])
# remove validation subset before training the tree:
# restrict features and labels to the training subset (both must use the same indices)
feats_train.add_subset(train_subset)
train_labels.add_subset(train_subset)
# train tree on the restricted data
C45Tree = train_tree(feats_train,feature_types,train_labels)
# bring back validation subset by lifting the restriction again
feats_train.remove_subset()
train_labels.remove_subset()
# remove data belonging to training subset: restrict to the validation subset instead
feats_train.add_subset(validation_subset)
train_labels.add_subset(validation_subset)
# prune the tree using the validation data only
C45Tree.prune_tree(feats_train,train_labels)
# bring back training subset so that later cells see the full training data
feats_train.remove_subset()
train_labels.remove_subset()
# get results: predicted labels and certainties for the test vectors
output, output_certainty = classify_data(C45Tree,feats_test)
```

In [23]:

```
from modshogun import MulticlassAccuracy
# Shogun object for calculating multiclass accuracy
accuracy = MulticlassAccuracy()
# print() call form, as used by the later cells; same output on Python 2 and 3
print('Accuracy : ' + str(accuracy.evaluate(output, test_labels)))
```

In [24]:

```
# unwrap the Shogun label objects into plain label vectors
# (the names are rebound, so this cell cannot simply be run twice)
output = output.get_labels()
test_labels = test_labels.get_labels()
train_labels = train_labels.get_labels()

# unwrap the Shogun feature objects into matrices
feats_test = feats_test.get_feature_matrix()
feats_train = feats_train.get_feature_matrix()

# ground truth: large crosses, one colour per class
for cls in range(3):
    members = test_labels==cls
    plt.scatter(feats_test[2,members],feats_test[3,members],color=c[cls],marker='x',s=100)

# predictions: dots coloured by predicted class, marker area scaled by the certainty
for column, label in enumerate(output):
    plt.scatter(feats_test[2,column],feats_test[3,column],color=c[int32(label)],marker='o',s=30*output_certainty[column])

plt.show()
```

In [25]:

```
# generate (and plot) a fresh random toy classification dataset with 20 training vectors per class
train_feats,train_labels,test_feats=create_toy_classification_dataset(20,True)
```

Next, we supply the necessary parameters to the CART algorithm and use it to train our decision tree.

In [26]:

```
from modshogun import PT_MULTICLASS, CARTree
from numpy import array

def train_carttree(feat_types,problem_type,num_folds,use_cv_pruning,labels,features):
    """Create a CART tree with the given settings, train it and return it.

    feat_types     -- boolean array, one entry per attribute
    problem_type   -- problem type constant (e.g. PT_MULTICLASS)
    num_folds      -- number of cross-validation folds handed to CARTree
    use_cv_pruning -- whether cross-validation pruning is switched on
    labels         -- training labels
    features       -- training features
    """
    tree = CARTree(feat_types,problem_type,num_folds,use_cv_pruning)
    tree.set_labels(labels)
    tree.train(features)
    return tree

# attribute X is nominal (True), attribute Y is ordinal/continuous (False)
ft = array([True, False])
# 5-fold cross-validation pruning on the toy classification data
cart = train_carttree(feat_types=ft, problem_type=PT_MULTICLASS, num_folds=5,
                      use_cv_pruning=True, labels=train_labels, features=train_feats)
```

In the above code snippet, we see four parameters being supplied to the CART tree object. `feat_types`

supplies knowledge of attribute types of training data to the CART algorithm and `problem_type`

specifies whether it is a multiclass classification problem (`PT_MULTICLASS`

) or a regression problem (`PT_REGRESSION`

). The boolean parameter `use_cv_pruning`

switches on cross-validation pruning of the trained tree and `num_folds`

specifies the number of folds of cross-validation to be applied while pruning. At this point, let us divert ourselves briefly towards understanding what kind of pruning strategy is employed by Shogun's CART implementation. The CART algorithm uses the cost-complexity pruning strategy. Cost-complexity pruning yields a list of subtrees of varying depths using the complexity-normalized resubstitution error, $R_\alpha(T)$. The resubstitution error, $R(T)$, measures how well a decision tree fits the training data. But this measure favours larger trees over smaller ones. Hence the complexity-normalized resubstitution error metric is used, which adds a penalty for increased complexity and in turn counters overfitting.

$R_\alpha(T)=R(T)+\alpha \times (numleaves)$

The best subtree among the list of subtrees can be chosen using cross validation or using the best-fit metric in the validation dataset. Setting `use_cv_pruning`

in the above code snippet basically tells the CART object to use cross-validation to choose the best among the subtrees generated by cost-complexity pruning.

Let us now get back on track and use the trained tree to classify our test data.

In [27]:

```
from numpy import int32
# get output labels: classify the toy test vectors with the trained CART tree
output_labels = cart.apply_multiclass(test_feats)
# draw the classified test points over the training data
plot_toy_classification_results(train_feats,train_labels,test_feats,output_labels)
```

In [28]:

```
from modshogun import RegressionLabels, RealFeatures
from numpy import random, sin, linspace
import matplotlib.pyplot as plt
% matplotlib inline
def create_toy_regression_dataset(nsamples,noise_var):
# randomly choose positions in X axis between 0 to 16
samples_x = random.rand(1,nsamples)*16
# find out y (=sin(x)) values for the sampled x positions and add noise to it
samples_y = sin(samples_x)+(random.rand(1,nsamples)-0.5)*noise_var
# plot the samples
plt.scatter(samples_x,samples_y,color='b',marker='x')
# create training features
train_feats = RealFeatures(samples_x)
# training labels
train_labels = RegressionLabels(samples_y[0,:])
return (train_feats,train_labels)
# plot the reference sinusoid
def plot_ref_sinusoid():
plot_x = linspace(-2,18,100)
plt.plot(plot_x,sin(plot_x),color='y',linewidth=1.5)
plt.xlabel('Feature values')
plt.ylabel('Labels')
plt.xlim([-3,19])
plt.ylim([-1.5,1.5])
# number of samples is 300, noise variance is 0.5
train_feats,train_labels = create_toy_regression_dataset(300,0.5)
plot_ref_sinusoid()
plt.show()
```

Next, we train our CART-tree.

In [29]:

```
from modshogun import PT_REGRESSION
from numpy import array
# feature type - continuous (a single attribute, hence a one-element array)
feat_type = array([False])
# get back trained tree: regression problem, 5 folds, cross-validation pruning switched on
cart = train_carttree(feat_type, PT_REGRESSION, 5, True, train_labels, train_feats)
```

In [30]:

```
def plot_predicted_sinusoid(cart):
    """Plot the regression output of a trained tree against the reference sinusoid.

    cart -- trained tree object providing apply_regression()
    The tree is evaluated at 100 evenly spaced points in [0, 16].
    """
    grid = array([linspace(0,16,100)])
    predictions = cart.apply_regression(RealFeatures(grid)).get_labels()

    plt.plot(grid[0,:],predictions,linewidth=2.0)
    plot_ref_sinusoid()
    plt.show()

plot_predicted_sinusoid(cart)
```

`n`

subsets where `n`

is a user controlled parameter. We perform `n`

iterations of training and testing in which, at each iteration, we choose one of the `n`

subsets as our test dataset and the remaining `n-1`

subsets as our training dataset. The performance of the model is usually taken as the average of the performances in various iterations. Shogun's cross validation class makes it really easy to apply cross-validation to any model of our choice. Let us realize this by applying cross-validation to CART-tree trained over Iris dataset. We start by reading the data.

In [31]:

```
import csv
from numpy import array
import matplotlib.pylab as plt
% matplotlib inline
# dictionary to encode class names to class labels
to_label = {'Iris-setosa' : 0.0, 'Iris-versicolor' : 1.0, 'Iris-virginica' : 2.0}
# read csv file and separate out labels and features
lab = []
feat = []
with open( os.path.join(SHOGUN_DATA_DIR, 'uci/iris/iris.data')) as csvfile:
csvread = csv.reader(csvfile,delimiter=',')
for row in csvread:
feat.append([float(i) for i in row[0:4]])
lab.append(to_label[row[4]])
lab = array(lab)
feat = array(feat).T
# plot the dataset using two highly correlated attributes
c = ['r', 'g', 'b']
for i in range(3):
plt.scatter(feat[2,lab==i],feat[3,lab==i],color=c[i],marker='x')
plt.show()
```

Next, we setup the model which is CART-tree in this case.

In [32]:

```
from modshogun import CARTree, PT_MULTICLASS
# set attribute types - all continuous (one False per attribute)
feature_types = array([False, False, False, False])
# setup CART-tree with cross validation pruning switched off
# (third argument is the number of folds, fourth the cross-validation pruning switch)
cart = CARTree(feature_types,PT_MULTICLASS,5,False)
```

Finally we can use Shogun's cross-validation class to get performance.

In [33]:

```
from modshogun import RealFeatures, MulticlassLabels
from modshogun import CrossValidation, MulticlassAccuracy, CrossValidationSplitting, CrossValidationResult
# training features: the whole dataset, cross-validation does the splitting
feats_train = RealFeatures(feat)
# training labels
labels_train = MulticlassLabels(lab)
# set evaluation criteria - multiclass accuracy
accuracy = MulticlassAccuracy()
# set splitting criteria - 10 fold cross-validation
split = CrossValidationSplitting(labels_train,10)
# set cross-validation parameters
# NOTE(review): the meaning of the trailing False is not visible here - confirm against the CrossValidation docs
cross_val = CrossValidation(cart,feats_train,labels_train,split,accuracy,False)
# run cross-validation multiple times - to get better estimate of accuracy
cross_val.set_num_runs(10)
# get cross validation result
result = cross_val.evaluate()
# print result: the mean of the evaluation criterion over all runs
print('Mean Accuracy : ' + str(CrossValidationResult.obtain_from_generic(result).mean))
```

In [34]:

```
# imported here as well so that the cell does not depend on an earlier cell's import
import csv
from numpy import array

# dictionary to convert string features to integer values
to_int = {'A' : 1, 'B' : 2, 'C' : 3, 'D' : 4, 'E' : 5}

# read csv file and separate out labels and features
lab = []
feat = []
with open( os.path.join(SHOGUN_DATA_DIR, 'uci/servo/servo.data')) as csvfile:
    for row in csv.reader(csvfile,delimiter=','):
        if not row:
            # csv.reader yields an empty list for a blank line; skip it instead of
            # failing with IndexError
            continue
        # columns 0-1 are letter codes, columns 2-3 numeric attributes, column 4 the target
        feat.append([to_int[row[0]], to_int[row[1]], float(row[2]), float(row[3])])
        lab.append(float(row[4]))

lab = array(lab)
# transpose so that every column is one vector
feat = array(feat).T
```

In [35]:

```
from modshogun import CARTree, RegressionLabels, PT_REGRESSION, MeanSquaredError
from modshogun import CrossValidation, CrossValidationSplitting, CrossValidationResult

# Shogun wrappers around the feature matrix / target vector loaded above
feats_train = RealFeatures(feat)
labels_train = RegressionLabels(lab)

def get_cv_error(max_depth):
    """Return the cross-validated mean squared error of a depth-limited CART regression tree.

    max_depth -- maximum depth the tree is allowed to reach
    Uses the module-level feats_train / labels_train, 10-fold splitting and
    10 cross-validation runs; CART's own cross-validation pruning is off.
    """
    # attribute types - 2 nominal followed by 2 ordinal
    tree = CARTree(array([True, True, False, False]),PT_REGRESSION,5,False)
    tree.set_max_depth(max_depth)

    cross_val = CrossValidation(tree,feats_train,labels_train,
                                CrossValidationSplitting(labels_train,10),
                                MeanSquaredError(),False)
    cross_val.set_num_runs(10)

    result = cross_val.evaluate()
    return CrossValidationResult.obtain_from_generic(result).mean
```

`max_depth`

values to the above method and plot the returned cross-validated errors.

In [36]:

```
import matplotlib.pyplot as plt

# cross-validated error for every maximum depth from 1 to 14
cv_errors = list(map(get_cv_error, range(1,15)))

# blue dots for the individual readings, black line joining them
plt.plot(range(1,15),cv_errors,'bo')
plt.plot(range(1,15),cv_errors,'k')
plt.xlabel('max_allowed_depth')
plt.ylabel('cross-validated error')
plt.ylim(0,1.2)
plt.show()
```

`average_error/range_of_labels`

comes out to be ~30%.

CHAID is an algorithm for decision tree learning proposed by Kass (1980). It is similar in functionality to CART in the sense that both can be used for classification as well as regression. But unlike CART, CHAID internally handles only categorical features. The continuous features are first converted into ordinal categorical features for the CHAID algorithm to be able to use them. This conversion is done by binning of feature values. The number of bins (K) has to be supplied by the user. Given K, a predictor is split in such a way that all the bins get the same number (more or less) of distinct predictor values. The maximum feature value in each bin is used as a breakpoint.

An important parameter in the CHAID tree growing process is the p-value. The p-value is the metric that is used for deciding which categories of predictor values to merge during merging as well as for deciding the best attribute during splitting. The p-value is calculated using different hypothesis testing methods depending on the type of dependent variable (nominal, ordinal or continuous). A more detailed discussion on the CHAID algorithm can be found in the documentation of the `CCHAIDTree`

class in Shogun. Let us move on to a more interesting topic which is learning to use CHAID using Shogun's python API.

In [37]:

```
# generate (and plot) a fresh random toy classification dataset with 20 training vectors per class
train_feats,train_labels,test_feats = create_toy_classification_dataset(20,True)
```

Now, we set up our CHAID-tree with appropriate parameters and train over given data.

In [38]:

```
from modshogun import PT_MULTICLASS, CHAIDTree
from numpy import array, dtype, int32

def train_chaidtree(dependent_var_type,feature_types,num_bins,features,labels):
    """Create a CHAID tree with the given settings, train it and return it.

    dependent_var_type -- integer type code of the dependent variable
    feature_types      -- int32 array of type codes, one per attribute
    num_bins           -- number of bins used when binning continuous attributes
    features           -- training features (the callers' comments note that
                          training replaces continuous values in this object)
    labels             -- training labels
    """
    chaid_tree = CHAIDTree(dependent_var_type,feature_types,num_bins)
    chaid_tree.set_labels(labels)
    chaid_tree.train(features)
    return chaid_tree

# type codes: 0 for nominal (attribute X), 2 for continuous (attribute Y)
ft = array([0, 2],dtype=int32)

# keep a copy of the training matrix before training touches it
train_feats_cache=RealFeatures(train_feats.get_feature_matrix())

# nominal dependent variable (0), 10 bins for the continuous attribute
chaid = train_chaidtree(dependent_var_type=0,feature_types=ft,num_bins=10,
                        features=train_feats,labels=train_labels)

# show the matrix as left behind by training next to the cached original
print('updated_matrix')
print(train_feats.get_feature_matrix())
print('')
print('original_matrix')
print(train_feats_cache.get_feature_matrix())
```

An important point to be noted in the above code snippet is that CHAID training modifies the training data. The actual continuous feature values are replaced by the discrete ordinal values obtained during continuous to ordinal conversion. Notice the difference between the original feature matrix and the updated matrix. The updated matrix contains only 10 distinct values denoting all values of the original matrix for feature dimension at row index 1.

With a CHAID-trained decision tree at our disposal, it's time to apply it to colour our test points.

In [39]:

```
# get output labels: classify the toy test vectors with the trained CHAID tree
output_labels = chaid.apply_multiclass(test_feats)
# plot against the cached (unmodified) training matrix rather than train_feats
plot_toy_classification_results(train_feats_cache,train_labels,test_feats,output_labels)
```

In [40]:

```
# generate and scatter-plot a fresh noisy sinusoid dataset (300 samples, noise scale 0.5)
train_feats,train_labels = create_toy_regression_dataset(300,0.5)
# overlay the noise-free reference sinusoid and display the figure
plot_ref_sinusoid()
plt.show()
```

As usual, we start by setting up our decision tree and training it.

In [41]:

```
from numpy import dtype, int32, array
# feature type - continuous (type code 2, single attribute)
feat_type = array([2],dtype=int32)
# get back trained tree: dependent variable type code 2 (presumably continuous as well - confirm
# against the CHAIDTree docs), 50 bins for binning the attribute
chaid = train_chaidtree(2,feat_type, 50, train_feats, train_labels)
```

Next, we use the trained decision tree to follow the reference sinusoid.

In [42]:

```
# plot the CHAID tree's regression output together with the reference sinusoid
plot_predicted_sinusoid(chaid)
```

In [43]:

```
from modshogun import CSVFile, RealFeatures, MulticlassLabels
# load the wine feature matrix and class labels from the data directory via Shogun's CSV reader
train_feats=RealFeatures(CSVFile( os.path.join(SHOGUN_DATA_DIR, 'uci/wine/fm_wine.dat')))
train_labels=MulticlassLabels(CSVFile( os.path.join(SHOGUN_DATA_DIR, 'uci/wine/label_wine.dat')))
```

In [44]:

```
from modshogun import CHAIDTree, MulticlassLabels

# all 13 attributes carry type code 2 (continuous)
feature_types = array([2]*13,dtype=int32)

# CHAID tree with a nominal dependent variable (0) and 20 bins
chaid = CHAIDTree(0,feature_types,20)
```

In [45]:

```
# set up cross validation class
from modshogun import CrossValidation, CrossValidationSplitting, CrossValidationResult, MulticlassAccuracy
# set evaluation criteria - multiclass accuracy
accuracy = MulticlassAccuracy()
# set splitting criteria - 10 fold cross-validation
split = CrossValidationSplitting(train_labels,10)
# set cross-validation parameters
cross_val = CrossValidation(chaid,train_feats,train_labels,split,accuracy,False)
# run cross-validation multiple times
cross_val.set_num_runs(10)
print('Mean classification accuracy : '+str(CrossValidationResult.obtain_from_generic(cross_val.evaluate()).mean*100)+' %')
```

In [46]:

```
from modshogun import CSVFile, RealFeatures, RegressionLabels
from numpy import ptp
# load the housing feature matrix and regression targets from the data directory via Shogun's CSV reader
train_feats=RealFeatures(CSVFile( os.path.join(SHOGUN_DATA_DIR, 'uci/housing/fm_housing.dat')))
train_labels=RegressionLabels(CSVFile( os.path.join(SHOGUN_DATA_DIR, 'uci/housing/housing_label.dat')))
# print range of regression labels - this is useful for calculating relative deviation later
# (ptp = "peak to peak", i.e. maximum minus minimum)
print('labels range : '+str(ptp(train_labels.get_labels())))
```

Next, we set up the parameters for the CHAID tree as well as the cross-validation class.

In [47]:

```
from modshogun import CHAIDTree, MeanSquaredError
from modshogun import CrossValidation, CrossValidationSplitting, CrossValidationResult
from numpy import array, dtype, int32

def get_cv_error(max_depth):
    """Return the cross-validated mean squared error of a depth-limited CHAID regression tree.

    max_depth -- maximum depth the tree is allowed to reach
    Uses the module-level train_feats / train_labels, 5-fold splitting and
    3 cross-validation runs. This definition replaces the CART-based
    get_cv_error defined earlier in the notebook.
    """
    # type codes per column: continuous (2) by default, column 3 is nominal (0),
    # columns 8 and 9 get code 1 (presumably ordinal - confirm against the CHAIDTree docs)
    feature_types = array([2]*13,dtype=int32)
    feature_types[3] = 0
    feature_types[[8, 9]] = 1

    # dependent variable type code 2, 10 bins
    tree = CHAIDTree(2,feature_types,10)
    tree.set_max_tree_depth(max_depth)

    cross_val = CrossValidation(tree,train_feats,train_labels,
                                CrossValidationSplitting(train_labels,5),
                                MeanSquaredError(),False)
    cross_val.set_num_runs(3)

    result = cross_val.evaluate()
    return CrossValidationResult.obtain_from_generic(result).mean
```

In [48]:

```
import matplotlib.pyplot as plt
% matplotlib inline
cv_errors = [get_cv_error(i) for i in range(1,10)]
plt.plot(range(1,10),cv_errors,'bo',range(1,10),cv_errors,'k')
plt.xlabel('max_allowed_depth')
plt.ylabel('cross-validated error')
plt.show()
```

[1] Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science

[2] Quinlan, J. R. 1986. Induction of Decision Trees. Mach. Learn. 1: 1 (Mar. 1986), 81-106