SHOGUN 4.2.0

This page lists ready-to-run Shogun examples for the Python Modular interface.
To run an example, issue

python name_of_example.py
# In this example the Averaged Perceptron is used to classify toy data.
#!/usr/bin/env python
from numpy import *
parameter_list = [[100, 2, 5,1.,1000,1,1], [100, 2, 5,1.,1000,1,2]]
def classifier_averaged_perceptron_modular (n=100, dim=2, distance=5,learn_rate=1.,max_iter=1000,num_threads=1,seed=1):
from modshogun import RealFeatures, BinaryLabels
from modshogun import AveragedPerceptron
random.seed(seed)
# produce some (probably) linearly separable training data by hand
# Two Gaussians at a far enough distance
X=array(random.randn(dim,n))+distance
Y=array(random.randn(dim,n))-distance
X_test=array(random.randn(dim,n))+distance
Y_test=array(random.randn(dim,n))-distance
label_train_twoclass=hstack((ones(n), -ones(n)))
#plot(X[0,:], X[1,:], 'x', Y[0,:], Y[1,:], 'o')
fm_train_real=hstack((X,Y))
fm_test_real=hstack((X_test,Y_test))
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
labels=BinaryLabels(label_train_twoclass)
perceptron=AveragedPerceptron(feats_train, labels)
perceptron.set_learn_rate(learn_rate)
perceptron.set_max_iter(max_iter)
# only guaranteed to converge for separable data
perceptron.train()
perceptron.set_features(feats_test)
out_labels = perceptron.apply().get_labels()
return perceptron, out_labels
if __name__=='__main__':
print('AveragedPerceptron')
classifier_averaged_perceptron_modular(*parameter_list[0])
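# Supplementary sketch (plain numpy, not part of the Shogun API): the
# "averaged" perceptron keeps a running average of all intermediate weight
# vectors, which often generalizes better than the final weights alone.
# Names below are illustrative.
import numpy as np

def averaged_perceptron_sketch(X, y, learn_rate=1.0, max_iter=1000):
    # X: (dim, n) matrix with one example per column; y: (n,) labels in {-1, +1}
    dim, n = X.shape
    w, b = np.zeros(dim), 0.0
    w_sum, b_sum, count = np.zeros(dim), 0.0, 0
    for _ in range(max_iter):
        errors = 0
        for i in range(n):
            if y[i] * (w.dot(X[:, i]) + b) <= 0:  # misclassified: update
                w += learn_rate * y[i] * X[:, i]
                b += learn_rate * y[i]
                errors += 1
            # accumulate every intermediate hypothesis for the average
            w_sum, b_sum, count = w_sum + w, b_sum + b, count + 1
        if errors == 0:
            break
    return w_sum / count, b_sum / count  # averaged parameters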
# This example shows how to use a custom defined kernel function for training a
# two class Support Vector Machine (SVM) classifier on randomly generated
# examples. The SVM regularization constant is set to C=1.
#!/usr/bin/env python
parameter_list = [[1,7],[2,8]]
def classifier_custom_kernel_modular (C=1,dim=7):
from modshogun import RealFeatures, BinaryLabels, CustomKernel, LibSVM
from numpy import diag,ones,sign
from numpy.random import rand,seed
seed((C,dim))
lab=sign(2*rand(dim) - 1)
data=rand(dim, dim)
symdata=data.dot(data.T) + diag(ones(dim))
kernel=CustomKernel()
kernel.set_full_kernel_matrix_from_full(symdata)
labels=BinaryLabels(lab)
svm=LibSVM(C, kernel, labels)
svm.train()
predictions = svm.apply()
out = predictions.get_labels()
return svm,out
if __name__=='__main__':
print('custom_kernel')
classifier_custom_kernel_modular(*parameter_list[0])
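# Supplementary check (plain numpy): a CustomKernel matrix is only a valid
# Gram matrix if it is symmetric positive semi-definite, which is what the
# symdata construction above aims for. A quick sanity check one can run on
# any candidate matrix:
import numpy as np

def is_valid_gram_matrix(K, tol=1e-10):
    # symmetric, and all eigenvalues non-negative up to a tolerance
    return np.allclose(K, K.T) and np.linalg.eigvalsh(K).min() >= -tol

R = np.random.rand(7, 7)
K = R.dot(R.T) + np.eye(7)      # same recipe as symdata above
print(is_valid_gram_matrix(K))  # True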
# In this example we demonstrate how to use SVMs in a domain adaptation
# scenario. Here, we assume that we have two problem domains, one with
# an abundance of training data (source domain) and one with only a few
# training examples (target domain). These domains are assumed to be
# different but related enough to transfer information between them.
# Thus, we first train an SVM on the source domain and then subsequently
# pass this previously trained SVM object to the DASVM, that we train
# on the target domain. The DASVM internally computes a custom linear term
# (for the underlying quadratic program of the dual formulation of the SVM)
# based on the support vectors of the source SVM and the training examples
# of the target SVM. Finally, it can be used for prediction just as any other
# SVM object.
#
#!/usr/bin/env python
import numpy
from modshogun import StringCharFeatures, BinaryLabels, DNA
from modshogun import WeightedDegreeStringKernel
from modshogun import MSG_DEBUG
try:
from modshogun import DomainAdaptationSVM
except ImportError:
print("DomainAdaptationSVM not available")
exit(0)
try:
from modshogun import SVMLight
except ImportError:
print("SVMLight not available")
exit(0)
traindna = ['CGCACGTACGTAGCTCGAT',
'CGACGTAGTCGTAGTCGTA',
'CGACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACCACAGTTATATAGTA',
'CGACGTAGTCGTAGTCGTA',
'CGACGTAGTTTTTTTCGTA',
'CGACGTAGTCGTAGCCCCA',
'CAAAAAAAAAAAAAAAATA',
'CGACGGGGGGGGGGGCGTA']
label_traindna = numpy.array(5*[-1.0] + 5*[1.0])
testdna = ['AGCACGTACGTAGCTCGAT',
'AGACGTAGTCGTAGTCGTA',
'CAACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGAACACAGTTATATAGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACGTGGGGTTTTTCGTA',
'CGACGTAGTCCCAGCCCCA',
'CAAAAAAAAAAAACCAATA',
'CGACGGCCGGGGGGGCGTA']
label_testdna = numpy.array(5*[-1.0] + 5*[1.0])
traindna2 = ['AGACAGTCAGTCGATAGCT',
'AGCAGTCGTAGTCGTAGTC',
'AGCAGGGGGGGGGGTAGTC',
'AGCAATCGTAGTCGTAGTC',
'AGCAACACGTTCTCTCGTC',
'AGCAGTCGTAGTCGTAGTC',
'AGCAGTCGTTTTTTTAGTC',
'AGCAGTCGTAGTCGAAAAC',
'ACCCCCCCCCCCCCCCCTC',
'AGCAGGGGGGGGGGGAGTC']
label_traindna2 = numpy.array(5*[-1.0] + 5*[1.0])
testdna2 = ['CGACAGTCAGTCGATAGCT',
'CGCAGTCGTAGTCGTAGTC',
'ACCAGGGGGGGGGGTAGTC',
'AGCAATCGTAGTCGTAGTC',
'AGCCACACGTTCTCTCGTC',
'AGCAATCGTAGTCGTAGTC',
'AGCAGTGGGGTTTTTAGTC',
'AGCAGTCGTAAACGAAAAC',
'ACCCCCCCCCCCCAACCTC',
'AGCAGGAAGGGGGGGAGTC']
label_testdna2 = numpy.array(5*[-1.0] + 5*[1.0])
parameter_list = [[traindna,testdna,label_traindna,label_testdna,traindna2,label_traindna2, \
testdna2,label_testdna2,1,3],[traindna,testdna,label_traindna,label_testdna,traindna2,label_traindna2, \
testdna2,label_testdna2,2,5]]
def classifier_domainadaptationsvm_modular (fm_train_dna=traindna,fm_test_dna=testdna, \
label_train_dna=label_traindna, \
label_test_dna=label_testdna,fm_train_dna2=traindna2,fm_test_dna2=testdna2, \
label_train_dna2=label_traindna2,label_test_dna2=label_testdna2,C=1,degree=3):
feats_train = StringCharFeatures(fm_train_dna, DNA)
feats_test = StringCharFeatures(fm_test_dna, DNA)
kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels = BinaryLabels(label_train_dna)
svm = SVMLight(C, kernel, labels)
svm.train()
#svm.io.set_loglevel(MSG_DEBUG)
#####################################
#print("obtaining DA SVM from previously trained SVM")
feats_train2 = StringCharFeatures(fm_train_dna2, DNA)
feats_test2 = StringCharFeatures(fm_test_dna2, DNA)
kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
labels2 = BinaryLabels(label_train_dna2)
# we regularize against the previously obtained solution
dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
dasvm.train()
out = dasvm.apply_binary(feats_test2)
return out #,dasvm TODO
if __name__=='__main__':
print('SVMLight')
classifier_domainadaptationsvm_modular(*parameter_list[0])
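# Supplementary sketch (plain numpy, not Shogun internals): the idea behind
# the DASVM linear term described above. In the dual QP of a standard SVM the
# linear term is the constant 1 for every example; in the domain-adaptation
# setting it is shifted by the weighted decision values of the source SVM on
# the target training examples, so that examples the source model already
# classifies confidently exert less pull. The exact term Shogun computes may
# differ; B is an illustrative trade-off weight.
import numpy as np

def da_linear_term_sketch(f_src, y_target, B=1.0):
    # f_src: source-SVM decision values on the target training examples
    # y_target: target labels in {-1, +1}
    return 1.0 - B * y_target * f_src

f_src = np.array([0.8, -0.3, 1.2, -0.9])
y_t = np.array([1.0, -1.0, 1.0, -1.0])
print(da_linear_term_sketch(f_src, y_t))  # smaller where the source is confidently correct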
#!/usr/bin/env python
from numpy import array,hstack
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def classifier_featureblock_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
from modshogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup
try:
from modshogun import FeatureBlockLogisticRegression
except ImportError:
print("FeatureBlockLogisticRegression not available")
exit(0)
features = RealFeatures(hstack((fm_train,fm_train)))
labels = BinaryLabels(hstack((label_train,label_train)))
n_features = features.get_num_features()
block_one = IndexBlock(0,n_features//2)
block_two = IndexBlock(n_features//2,n_features)
block_group = IndexBlockGroup()
block_group.add_block(block_one)
block_group.add_block(block_two)
mtlr = FeatureBlockLogisticRegression(0.1,features,labels,block_group)
mtlr.set_regularization(1) # use regularization ratio
mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
mtlr.train()
out = mtlr.apply().get_labels()
return out
if __name__=='__main__':
print('FeatureBlockLogisticRegression')
classifier_featureblock_logistic_regression(*parameter_list[0])
# In this example a multi-class support vector machine is trained on a toy data
# set and the trained classifier is then used to predict labels of test
# examples. The training algorithm is based on the BSVM formulation (L2-soft margin
# and the bias added to the objective function) which is solved by the Improved
# Mitchell-Demyanov-Malozemov algorithm. The training algorithm uses the Gaussian
# kernel of width 2.1 and the regularization constant C=1. The solver stops if the
# relative duality gap falls below 1e-5.
#
# For more details on the SVM solver used, see
# V.Franc: Optimization Algorithms for Kernel Methods. Research report.
# CTU-CMP-2005-22. CTU FEL Prague. 2005.
# ftp://cmp.felk.cvut.cz/pub/cmp/articles/franc/Franc-PhD.pdf .
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]
def classifier_gmnpsvm_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, MulticlassLabels
from modshogun import GaussianKernel, GMNPSVM, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=MulticlassLabels(CSVFile(label_fname))
kernel=GaussianKernel(feats_train, feats_train, width)
svm=GMNPSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train(feats_train)
out=svm.apply(feats_test).get_labels()
return out,kernel
if __name__=='__main__':
print('GMNPSVM')
classifier_gmnpsvm_modular(*parameter_list[0])
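# Supplementary sketch (plain numpy): what the width parameter means,
# assuming Shogun's GaussianKernel convention k(x,y) = exp(-||x-y||^2 / width).
import numpy as np

def gaussian_kernel_matrix(A, B, width=2.1):
    # A: (dim, n) and B: (dim, m), one example per column, as in RealFeatures
    sq = (A**2).sum(0)[:, None] + (B**2).sum(0)[None, :] - 2.0 * A.T.dot(B)
    return np.exp(-sq / width)

A = np.random.randn(2, 5)
K = gaussian_kernel_matrix(A, A)
print(K.shape, np.allclose(np.diag(K), 1.0))  # (5, 5) True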
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is then used to predict labels of test
# examples. As training algorithm Gradient Projection Decomposition Technique
# (GPDT) is used with SVM regularization parameter C=1 and a Gaussian
# kernel of width 2.1. The solver returns an epsilon-precise (epsilon=1e-5) solution.
#
# For more details on GPDT solver see http://dm.unife.it/gpdt .
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]
def classifier_gpbtsvm_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, BinaryLabels
from modshogun import GaussianKernel
from modshogun import CSVFile
try:
from modshogun import GPBTSVM
except ImportError:
print("GPBTSVM not available")
exit(0)
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
kernel=GaussianKernel(feats_train, feats_train, width)
svm=GPBTSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('GPBTSVM')
classifier_gpbtsvm_modular(*parameter_list[0])
# In this example a multi-class support vector machine classifier is trained on a
# toy data set and the trained classifier is then used to predict labels of test
# examples. As training algorithm the LaRank algorithm is used with SVM
# regularization parameter C=1 and a Gaussian kernel of width 2.1 and a precision
# set to epsilon=1e-5.
#
# For more details on LaRank see
# Bordes, A. and Bottou, L. and Gallinari, P. and Weston, J.
# Solving MultiClass Support Vector Machines with LaRank. ICML 2007.
#
#!/usr/bin/env python
from numpy import *
parameter_list = [[10,3,15,0.9,1,2000,1],[20,4,15,0.9,1,5000,2]]
def classifier_larank_modular (num_vec,num_class,distance,C=0.9,num_threads=1,num_iter=5,seed=1):
from modshogun import RealFeatures, MulticlassLabels
from modshogun import GaussianKernel
from modshogun import LaRank
from modshogun import Math_init_random
# reproducible results
Math_init_random(seed)
random.seed(seed)
# generate some training data where each class pair is linearly separable
label_train=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
label_test=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
fm_train=array(random.randn(num_class,num_vec))
fm_test=array(random.randn(num_class,num_vec))
for i in range(len(label_train)):
fm_train[int(label_train[i]),i]+=distance
fm_test[int(label_test[i]),i]+=distance
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
epsilon=1e-5
labels=MulticlassLabels(label_train)
svm=LaRank(C, kernel, labels)
#svm.set_tau(1e-3)
svm.set_batch_mode(False)
#svm.io.enable_progress()
svm.set_epsilon(epsilon)
svm.train()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('LaRank')
[predictions, svm, labels] = classifier_larank_modular(*parameter_list[0])
# In this example a two-class linear classifier based on the Linear Discriminant
# Analysis (LDA) is trained on a toy data set and then the trained classifier is
# used to predict test examples. The regularization parameter, which corresponds
# to the weight of an identity matrix added to the covariance matrix, is set to
# gamma=3.
#
# For more details on the LDA see e.g.
# http://en.wikipedia.org/wiki/Linear_discriminant_analysis
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,3,1],[traindat,testdat,label_traindat,4,1]]
def classifier_lda_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,gamma=3,num_threads=1):
from modshogun import RealFeatures, BinaryLabels, LDA, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
lda=LDA(gamma, feats_train, labels)
lda.train()
bias=lda.get_bias()
w=lda.get_w()
predictions = lda.apply(feats_test).get_labels()
return lda,predictions
if __name__=='__main__':
print('LDA')
classifier_lda_modular(*parameter_list[0])
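# Supplementary sketch (plain numpy, illustrative rather than Shogun's exact
# implementation): two-class LDA where gamma scales an identity matrix added
# to the pooled within-class covariance before inversion.
import numpy as np

def lda_sketch(X, y, gamma=3.0):
    # X: (dim, n) with one example per column; y: (n,) labels in {-1, +1}
    Xp, Xn = X[:, y > 0], X[:, y < 0]
    mu_p, mu_n = Xp.mean(1), Xn.mean(1)
    # pooled within-class covariance, regularized by gamma * I
    scatter = np.cov(Xp) * (Xp.shape[1] - 1) + np.cov(Xn) * (Xn.shape[1] - 1)
    cov = scatter / (X.shape[1] - 2) + gamma * np.eye(X.shape[0])
    w = np.linalg.solve(cov, mu_p - mu_n)
    b = -0.5 * w.dot(mu_p + mu_n)
    return w, b  # predict with sign(w.x + b)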
# In this example a one-class support vector machine classifier is trained on a
# toy data set. The training algorithm finds a hyperplane in the RKHS which
# separates the training data from the origin. The one-class classifier is
# typically used to estimate the support of a high-dimensional distribution.
# For more details see e.g.
# B. Schoelkopf et al. Estimating the support of a high-dimensional
# distribution. Neural Computation, 13, 2001, 1443-1471.
#
# In the example, the one-class SVM is trained by the LIBSVM solver with the
# regularization parameter C=1 and the Gaussian kernel of width 2.1 and the
# precision parameter epsilon=1e-5.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat,2.2,1,1e-7],[traindat,testdat,2.1,1,1e-5]]
def classifier_libsvmoneclass_modular (train_fname=traindat,test_fname=testdat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, GaussianKernel, LibSVMOneClass, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=GaussianKernel(feats_train, feats_train, width)
svm=LibSVMOneClass(C, kernel)
svm.set_epsilon(epsilon)
svm.train()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('LibSVMOneClass')
classifier_libsvmoneclass_modular(*parameter_list[0])
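# Supplementary sketch (plain numpy): the decision rule of a trained one-class
# SVM is sign(sum_i alpha_i k(x_i, x) - rho); points with a positive value lie
# inside the estimated support. Names here are illustrative.
import numpy as np

def oneclass_decision(K_test, alphas, rho):
    # K_test: (n_sv, m) kernel values between support vectors and test points
    # alphas: (n_sv,) support-vector coefficients; rho: learned offset
    return K_test.T.dot(alphas) - rho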
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Minimal Primal Dual SVM is used with SVM
# regularization parameter C=1 and a Gaussian kernel of width 1.2 and the
# precision parameter 1e-5.
#
# For more details on the MPD solver see
# Kienzle, W. and B. Schölkopf: Training Support Vector Machines with Multiple
# Equality Constraints. Machine Learning: ECML 2005, 182-193. (Eds.) Carbonell,
# J. G., J. Siekmann, Springer, Berlin, Germany (11 2005)
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,1,1e-5],[traindat,testdat,label_traindat,0.9,1e-5]]
def classifier_mpdsvm_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=1,epsilon=1e-5):
from modshogun import RealFeatures, BinaryLabels
from modshogun import GaussianKernel
from modshogun import MPDSVM, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
svm=MPDSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('MPDSVM')
classifier_mpdsvm_modular(*parameter_list[0])
#!/usr/bin/env python
import re
import time
from tools.multiclass_shared import prepare_data
# run with toy data
[traindat, label_traindat, testdat, label_testdat] = prepare_data()
# run with opt-digits if available
#[traindat, label_traindat, testdat, label_testdat] = prepare_data(False)
parameter_list = [[traindat,testdat,label_traindat,label_testdat,2.1,1,1e-5]]
def classifier_multiclass_ecoc (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,lawidth=2.1,C=1,epsilon=1e-5):
import modshogun
from modshogun import ECOCStrategy, LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
from modshogun import MulticlassAccuracy
from modshogun import RealFeatures, MulticlassLabels
def nonabstract_class(name):
try:
getattr(modshogun, name)()
except TypeError:
return False
return True
encoders = [x for x in dir(modshogun)
if re.match(r'ECOC.+Encoder', x) and nonabstract_class(x)]
decoders = [x for x in dir(modshogun)
if re.match(r'ECOC.+Decoder', x) and nonabstract_class(x)]
fea_train = RealFeatures(fm_train_real)
fea_test = RealFeatures(fm_test_real)
gnd_train = MulticlassLabels(label_train_multiclass)
if label_test_multiclass is None:
gnd_test = None
else:
gnd_test = MulticlassLabels(label_test_multiclass)
base_classifier = LibLinear(L2R_L2LOSS_SVC)
base_classifier.set_bias_enabled(True)
#print('Testing with %d encoders and %d decoders' % (len(encoders), len(decoders)))
#print('-' * 70)
#format_str = '%%15s + %%-10s %%-10%s %%-10%s %%-10%s'
#print((format_str % ('s', 's', 's')) % ('encoder', 'decoder', 'codelen', 'time', 'accuracy'))
def run_ecoc(ier, idr):
encoder = getattr(modshogun, encoders[ier])()
decoder = getattr(modshogun, decoders[idr])()
# whether encoder is data dependent
if hasattr(encoder, 'set_labels'):
encoder.set_labels(gnd_train)
encoder.set_features(fea_train)
strategy = ECOCStrategy(encoder, decoder)
classifier = LinearMulticlassMachine(strategy, fea_train, base_classifier, gnd_train)
classifier.train()
label_pred = classifier.apply(fea_test)
if gnd_test is not None:
evaluator = MulticlassAccuracy()
acc = evaluator.evaluate(label_pred, gnd_test)
else:
acc = None
return (classifier.get_num_machines(), acc)
for ier in range(len(encoders)):
for idr in range(len(decoders)):
t_begin = time.clock()
(codelen, acc) = run_ecoc(ier, idr)
if acc is None:
acc_fmt = 's'
acc = 'N/A'
else:
acc_fmt = '.4f'
t_elapse = time.clock() - t_begin
#print((format_str % ('d', '.3f', acc_fmt)) %
# (encoders[ier][4:-7], decoders[idr][4:-7], codelen, t_elapse, acc))
if __name__=='__main__':
print('MulticlassECOC')
classifier_multiclass_ecoc(*parameter_list[0])
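# Supplementary sketch (plain numpy, independent of Shogun's encoder/decoder
# classes): ECOC reduces a multiclass problem to several binary problems via
# a code matrix, and decodes by picking the class whose codeword best matches
# the binary outputs. Shown with a one-vs-rest code and sign agreement.
import numpy as np

def ovr_code_matrix(num_classes):
    # rows = classes, columns = binary problems; +1 for the class, -1 otherwise
    return 2.0 * np.eye(num_classes) - 1.0

def ecoc_decode(binary_outputs, code_matrix):
    agreement = code_matrix.dot(np.sign(binary_outputs))
    return int(np.argmax(agreement))

codes = ovr_code_matrix(3)
print(ecoc_decode(np.array([-0.4, 1.7, -0.2]), codes))  # 1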
#!/usr/bin/env python
from tools.multiclass_shared import prepare_data
[traindat, label_traindat, testdat, label_testdat] = prepare_data(False)
parameter_list = [[traindat,testdat,label_traindat,label_testdat,2.1,1,1e-5],[traindat,testdat,label_traindat,label_testdat,2.2,1,1e-5]]
def classifier_multiclassliblinear_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, MulticlassLabels
from modshogun import MulticlassLibLinear
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
labels=MulticlassLabels(label_train_multiclass)
classifier = MulticlassLibLinear(C,feats_train,labels)
classifier.train()
label_pred = classifier.apply(feats_test)
out = label_pred.get_labels()
if label_test_multiclass is not None:
from modshogun import MulticlassAccuracy
labels_test = MulticlassLabels(label_test_multiclass)
evaluator = MulticlassAccuracy()
acc = evaluator.evaluate(label_pred, labels_test)
print('Accuracy = %.4f' % acc)
return out
if __name__=='__main__':
print('MulticlassLibLinear')
classifier_multiclassliblinear_modular(*parameter_list[0])
#!/usr/bin/env python
from tools.multiclass_shared import prepare_data
[traindat, label_traindat, testdat, label_testdat] = prepare_data()
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]
def classifier_multiclassmachine_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, MulticlassLabels
from modshogun import GaussianKernel
from modshogun import LibSVM, KernelMulticlassMachine, MulticlassOneVsRestStrategy
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=MulticlassLabels(label_train_multiclass)
classifier = LibSVM()
classifier.set_epsilon(epsilon)
#print labels.get_labels()
mc_classifier = KernelMulticlassMachine(MulticlassOneVsRestStrategy(),kernel,classifier,labels)
mc_classifier.train()
kernel.init(feats_train, feats_test)
out = mc_classifier.apply().get_labels()
return out
if __name__=='__main__':
print('MulticlassMachine')
classifier_multiclassmachine_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import *
parameter_list = [[10,3,15,2.1,1,1e-5,1],[20,4,15,2.2,2,1e-5,2]]
def classifier_multiclassocas_modular (num_vec=10,num_class=3,distance=15,width=2.1,C=1,epsilon=1e-5,seed=1):
from modshogun import RealFeatures, MulticlassLabels
from modshogun import Math_init_random
try:
from modshogun import MulticlassOCAS
except ImportError:
print("MulticlassOCAS not available")
return
# reproducible results
random.seed(seed)
Math_init_random(seed)
# generate some training data where each class pair is linearly separable
label_train=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
label_test=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
fm_train=array(random.randn(num_class,num_vec))
fm_test=array(random.randn(num_class,num_vec))
for i in range(len(label_train)):
fm_train[int(label_train[i]),i]+=distance
fm_test[int(label_test[i]),i]+=distance
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
labels=MulticlassLabels(label_train)
classifier = MulticlassOCAS(C,feats_train,labels)
classifier.train()
out = classifier.apply(feats_test).get_labels()
#print label_test
#print out
return out,classifier
if __name__=='__main__':
print('MulticlassOCAS')
classifier_multiclassocas_modular(*parameter_list[0])
#!/usr/bin/env python
from tools.multiclass_shared import prepare_data
[traindat, label_traindat, testdat, label_testdat] = prepare_data(False)
parameter_list = [[traindat,testdat,label_traindat,label_testdat,2.1,1,1e-5],[traindat,testdat,label_traindat,label_testdat,2.2,1,1e-5]]
def classifier_multilabeloutputliblinear_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, MulticlassLabels, MultilabelLabels
from modshogun import MulticlassLibLinear
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
labels=MulticlassLabels(label_train_multiclass)
classifier = MulticlassLibLinear(C,feats_train,labels)
classifier.train()
label_pred = classifier.apply_multilabel_output(feats_test,2)
out = label_pred.get_labels()
#print out
return out
if __name__=='__main__':
print('MultilabelOutputLibLinear')
classifier_multilabeloutputliblinear_modular(*parameter_list[0])
# This example shows usage of the Perceptron algorithm for training a two-class
# linear classifier, i.e. y = sign( <x,w>+b). The Perceptron algorithm works by
# iteratively passing through the training examples and applying the update rule on
# those examples which are misclassified by the current classifier. The Perceptron
# update rule reads
#
# w(t+1) = w(t) + alpha * y_t * x_t
# b(t+1) = b(t) + alpha * y_t
#
# where (x_t,y_t) is the feature vector and label (must be +1/-1) of the misclassified example,
# (w(t),b(t)) are the current parameters of the linear classifier
# (w(t+1),b(t+1)) are the new parameters of the linear classifier
# alpha is the learning rate; in this example alpha=1
#
# The Perceptron algorithm iterates until all training examples are correctly
# classified or the prescribed maximal number of iterations, in this example
# max_iter=1000, is reached.
#!/usr/bin/env python
from numpy import *
parameter_list = [[100, 2, 5,1.,1000,1,1], [100, 2, 5,1.,1000,1,2]]
def classifier_perceptron_modular (n=100, dim=2, distance=5,learn_rate=1.,max_iter=1000,num_threads=1,seed=1):
from modshogun import RealFeatures, BinaryLabels
from modshogun import Perceptron
random.seed(seed)
# produce some (probably) linearly separable training data by hand
# Two Gaussians at a far enough distance
X=array(random.randn(dim,n))+distance
Y=array(random.randn(dim,n))-distance
X_test=array(random.randn(dim,n))+distance
Y_test=array(random.randn(dim,n))-distance
label_train_twoclass=hstack((ones(n), -ones(n)))
#plot(X[0,:], X[1,:], 'x', Y[0,:], Y[1,:], 'o')
fm_train_real=hstack((X,Y))
fm_test_real=hstack((X_test,Y_test))
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
labels=BinaryLabels(label_train_twoclass)
perceptron=Perceptron(feats_train, labels)
perceptron.set_learn_rate(learn_rate)
perceptron.set_max_iter(max_iter)
# only guaranteed to converge for separable data
perceptron.train()
perceptron.set_features(feats_test)
out_labels = perceptron.apply().get_labels()
return perceptron, out_labels
if __name__=='__main__':
print('Perceptron')
classifier_perceptron_modular(*parameter_list[0])
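# Supplementary sketch (plain numpy): the update rule from the comment above,
# written out as a training loop. Names are illustrative.
import numpy as np

def perceptron_sketch(X, y, alpha=1.0, max_iter=1000):
    # X: (dim, n) with one example per column; y: (n,) labels in {-1, +1}
    w, b = np.zeros(X.shape[0]), 0.0
    for _ in range(max_iter):
        mistakes = 0
        for i in range(X.shape[1]):
            if y[i] * (w.dot(X[:, i]) + b) <= 0:  # misclassified
                w += alpha * y[i] * X[:, i]       # w(t+1) = w(t) + alpha * y_t * x_t
                b += alpha * y[i]                 # b(t+1) = b(t) + alpha * y_t
                mistakes += 1
        if mistakes == 0:  # all training examples correctly classified
            break
    return w, b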
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list = [[traindat,testdat,label_traindat,1,5,0.9]]
def classifier_ssk_modular (fm_train_dna=traindat,fm_test_dna=testdat,
label_train_dna=label_traindat,C=1,maxlen=1,decay=1):
from modshogun import StringCharFeatures, BinaryLabels
from modshogun import LibSVM, SubsequenceStringKernel, DNA
from modshogun import ErrorRateMeasure
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
labels=BinaryLabels(label_train_dna)
kernel=SubsequenceStringKernel(feats_train, feats_train, maxlen, decay)
svm=LibSVM(C, kernel, labels)
svm.train()
out=svm.apply(feats_train)
evaluator = ErrorRateMeasure()
trainerr = evaluator.evaluate(out,labels)
# print(trainerr)
kernel.init(feats_train, feats_test)
predicted_labels=svm.apply(feats_test).get_labels()
# print predicted_labels
return predicted_labels
if __name__=='__main__':
print('StringSubsequenceKernel classification DNA')
classifier_ssk_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=1 and the Weighted Degree kernel of the degree 20 and
# a precision parameter epsilon=1e-5. The LINADD trick is used to speed up
# training.
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel and the LINADD trick see
# Sonnenburg, S. and Rätsch, G. and Rieck, K. Large Scale Learning with String
# Kernels. In Bottou, Leon and Chapelle, Olivier and DeCoste, Dennis and Weston,
# Jason, editor, In Large Scale Kernel Machines, pages 73-103, MIT Press,
# Cambridge, MA. 2007.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
train_dna=lm.load_dna('../data/fm_train_dna.dat')
test_dna=lm.load_dna('../data/fm_test_dna.dat')
label=lm.load_labels('../data/label_train_dna.dat')
parameter_list=[[train_dna, test_dna, label, 20, 0.9, 1e-7, 1],
[train_dna, test_dna, label, 20, 2.3, 1e-7, 4]]
def classifier_svmlight_batch_linadd_modular (fm_train_dna, fm_test_dna,
label_train_dna, degree, C, epsilon, num_threads):
from modshogun import StringCharFeatures, BinaryLabels, DNA
from modshogun import WeightedDegreeStringKernel, MSG_DEBUG
try:
from modshogun import SVMLight
except ImportError:
print('No support for SVMLight available.')
return
feats_train=StringCharFeatures(DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels=BinaryLabels(label_train_dna)
svm=SVMLight(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()
kernel.init(feats_train, feats_test)
#print('SVMLight Objective: %f num_sv: %d' % \
#	(svm.get_objective(), svm.get_num_support_vectors()))
svm.set_batch_computation_enabled(False)
svm.set_linadd_enabled(False)
svm.apply().get_labels()
svm.set_batch_computation_enabled(True)
labels = svm.apply().get_labels()
return labels, svm
if __name__=='__main__':
print('SVMlight batch')
classifier_svmlight_batch_linadd_modular(*parameter_list[0])
# This example demonstrates how to train an SVMLight classifier
# using a custom linear term. This is used in the class DASVM that
# pre-computes this linear term using a previously trained SVM.
#
#!/usr/bin/env python
import numpy
traindna=['CGCACGTACGTAGCTCGAT',
'CGACGTAGTCGTAGTCGTA',
'CGACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACCACAGTTATATAGTA',
'CGACGTAGTCGTAGTCGTA',
'CGACGTAGTTTTTTTCGTA',
'CGACGTAGTCGTAGCCCCA',
'CAAAAAAAAAAAAAAAATA',
'CGACGGGGGGGGGGGCGTA']
label_traindna=numpy.array(5*[-1.0] + 5*[1.0])
testdna=['AGCACGTACGTAGCTCGAT',
'AGACGTAGTCGTAGTCGTA',
'CAACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGAACACAGTTATATAGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACGTGGGGTTTTTCGTA',
'CGACGTAGTCCCAGCCCCA',
'CAAAAAAAAAAAACCAATA',
'CGACGGCCGGGGGGGCGTA']
label_test_dna=numpy.array(5*[-1.0] + 5*[1.0])
parameter_list = [[traindna,testdna,label_traindna,3,10,1e-5,1],[traindna,testdna,label_traindna,3,10,1e-5,1]]
def classifier_svmlight_linear_term_modular (fm_train_dna=traindna,fm_test_dna=testdna, \
label_train_dna=label_traindna,degree=3, \
C=10,epsilon=1e-5,num_threads=1):
from modshogun import StringCharFeatures, BinaryLabels, DNA
from modshogun import WeightedDegreeStringKernel
try:
from modshogun import SVMLight
except ImportError:
print("SVMLight is not available")
exit(0)
feats_train=StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels=BinaryLabels(label_train_dna)
svm=SVMLight(C, kernel, labels)
svm.set_qpsize(3)
svm.set_linear_term(-numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double))
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()
kernel.init(feats_train, feats_test)
out = svm.apply().get_labels()
return out,kernel
if __name__=='__main__':
print('SVMLight')
classifier_svmlight_linear_term_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=1.2 and the Weighted Degree kernel of degree 20 and
# the precision parameter epsilon=1e-5.
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel see
# G. Raetsch, S. Sonnenburg, and B. Schoelkopf. RASE: recognition of alternatively
# spliced exons in C. elegans. Bioinformatics, 21:369-377, June 2005.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list = [[traindat,testdat,label_traindat,1.1,1e-5,1],[traindat,testdat,label_traindat,1.2,1e-5,1]]
def classifier_svmlight_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,C=1.2,epsilon=1e-5,num_threads=1):
from modshogun import StringCharFeatures, BinaryLabels, DNA
from modshogun import WeightedDegreeStringKernel
try:
from modshogun import SVMLight
except ImportError:
print('No support for SVMLight available.')
return
feats_train=StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)
degree=20
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels=BinaryLabels(label_train_dna)
svm=SVMLight(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()
kernel.init(feats_train, feats_test)
svm.apply().get_labels()
return kernel
if __name__=='__main__':
print('SVMLight')
classifier_svmlight_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier (SVM) is
# trained on a toy data set and the trained classifier is used to predict labels
# of test examples. As training algorithm the SVMLIN solver is used with the SVM
# regularization parameter C=0.9 and the bias in the classification rule switched
# on and the precision parameter epsilon=1e-5. The example also shows how to
# retrieve the parameters (vector w and bias b) of the trained linear classifier.
#
# For more details on the SVMLIN solver see
# V. Sindhwani, S.S. Keerthi. Newton Methods for Fast Solution of Semi-supervised
# Linear SVMs. Large Scale Kernel Machines MIT Press (Book Chapter), 2007
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,0.9,1e-5,1],[traindat,testdat,label_traindat,0.8,1e-5,1]]
def classifier_svmlin_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=0.9,epsilon=1e-5,num_threads=1):
from modshogun import RealFeatures, SparseRealFeatures, BinaryLabels
from modshogun import SVMLin, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
svm=SVMLin(C, feats_train, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.set_bias_enabled(True)
svm.train()
bias=svm.get_bias()
w=svm.get_w()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('SVMLin')
classifier_svmlin_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the OCAS solver is used with the SVM
# regularization parameter C=0.9 and the bias term in the classification rule
# switched off and the precision parameter epsilon=1e-5 (duality gap).
#
# For more details on the OCAS solver see
# V. Franc, S. Sonnenburg. Optimized Cutting Plane Algorithm for Large-Scale Risk
# Minimization. The Journal of Machine Learning Research, vol. 10,
# pp. 2157--2192. October 2009.
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,0.9,1e-5,1],[traindat,testdat,label_traindat,0.8,1e-5,1]]
def classifier_svmocas_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=0.9,epsilon=1e-5,num_threads=1):
from modshogun import RealFeatures, BinaryLabels
from modshogun import CSVFile
try:
from modshogun import SVMOcas
except ImportError:
print("SVMOcas not available")
return
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
svm=SVMOcas(C, feats_train, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.set_bias_enabled(False)
svm.train()
bias=svm.get_bias()
w=svm.get_w()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('SVMOcas')
classifier_svmocas_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Stochastic Gradient Descent (SGD) solver is
# used with the SVM regularization parameter C=0.9. The number of iterations, i.e.
# passes through all training examples, is set to num_iter=5.
#
# For more details on the SGD solver see
# L. Bottou, O. Bousquet. The tradeoff of large scale learning. In NIPS 20. MIT
# Press. 2008.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,0.9,1,6],[traindat,testdat,label_traindat,0.8,1,5]]
def classifier_svmsgd_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=0.9,num_threads=1,num_iter=5):
from modshogun import RealFeatures, SparseRealFeatures, BinaryLabels
from modshogun import SVMSGD, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
svm=SVMSGD(C, feats_train, labels)
svm.set_epochs(num_iter)
#svm.io.set_loglevel(0)
svm.train()
bias=svm.get_bias()
w=svm.get_w()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('SVMSGD')
classifier_svmsgd_modular(*parameter_list[0])
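# Supplementary sketch (plain numpy): one SGD "epoch" is a pass over the
# training examples, taking a subgradient step on the regularized hinge loss.
# The step-size schedule below is illustrative, not necessarily the one
# SVMSGD uses internally.
import numpy as np

def svm_sgd_sketch(X, y, C=0.9, num_iter=5):
    # X: (dim, n) with one example per column; y: (n,) labels in {-1, +1}
    dim, n = X.shape
    lam = 1.0 / (C * n)  # regularization strength, in 1/C form
    w, b, t = np.zeros(dim), 0.0, 1
    for _ in range(num_iter):      # num_iter passes over the data
        for i in range(n):
            eta = 1.0 / (lam * t)  # simple decaying step size
            t += 1
            margin = y[i] * (w.dot(X[:, i]) + b)
            w *= 1.0 - eta * lam   # shrinkage from the L2 regularizer
            if margin < 1:         # subgradient step on the hinge loss
                w += eta * y[i] * X[:, i]
                b += eta * y[i]
    return w, b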
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,10],[data,20]]
def converter_diffusionmaps_modular (data_fname,t):
try:
from modshogun import RealFeatures, DiffusionMaps, GaussianKernel, CSVFile
features = RealFeatures(CSVFile(data_fname))
converter = DiffusionMaps()
converter.set_target_dim(1)
converter.set_kernel(GaussianKernel(10,10.0))
converter.set_t(t)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('DiffusionMaps')
converter_diffusionmaps_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data]]
def converter_factoranalysis_modular(data_fname):
try:
import numpy
from modshogun import RealFeatures, FactorAnalysis, EuclideanDistance, CSVFile
features = RealFeatures(CSVFile(data_fname))
converter = FactorAnalysis()
converter.set_target_dim(2)
embedding = converter.apply(features)
X = embedding.get_feature_matrix()
covdet = numpy.linalg.det(numpy.dot(X,X.T))
return covdet > 0
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('Factor Analysis')
converter_factoranalysis_modular(*parameter_list[0])
#!/usr/bin/env python
strings=['example document 1','example document 2','example document 3','example document 4']
parameter_list=[[strings]]
def converter_hasheddoc_modular(strings):
from modshogun import SparseRealFeatures, RAWBYTE, StringCharFeatures, Features, HashedDocDotFeatures
from modshogun import NGramTokenizer
from modshogun import HashedDocConverter
from numpy import array
#create string features
f=StringCharFeatures(strings, RAWBYTE)
#set the number of bits of the target dimension
#means a dim of size 2^5=32
num_bits=5
#create the ngram tokenizer of size 8 to parse the strings
tokenizer=NGramTokenizer(8)
#normalize results
normalize=True
#create converter
converter=HashedDocConverter(tokenizer, num_bits, normalize)
converted_feats=converter.apply(f)
#should expect 32
#print('Converted features\' space dimensionality is', converted_feats.get_dim_feature_space())
#print('Self dot product of string 0 with converted feats:', converted_feats.dot(0, converted_feats, 0))
hashed_feats=HashedDocDotFeatures(num_bits, f, tokenizer, normalize)
#print('Hashed features\' space dimensionality is', hashed_feats.get_dim_feature_space())
#print('Self dot product of string 0 with hashed feats:', hashed_feats.dot(0, hashed_feats, 0))
return converted_feats
if __name__=='__main__':
print('HashedDocConverter')
converter_hasheddoc_modular(*parameter_list[0])
# In this example toy data is being preprocessed using the Hessian Locally Linear Embedding algorithm
# as described in
#
# Donoho, D., & Grimes, C. (2003).
# Hessian eigenmaps: new tools for nonlinear dimensionality reduction.
# Proceedings of National Academy of Science (Vol. 100, pp. 5591-5596).
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_hessianlocallylinearembedding_modular (data_fname,k):
try:
from modshogun import RealFeatures, CSVFile
try:
from modshogun import HessianLocallyLinearEmbedding
except ImportError:
print("HessianLocallyLinearEmbedding not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
converter = HessianLocallyLinearEmbedding()
converter.set_target_dim(1)
converter.set_k(k)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('HessianLocallyLinearEmbedding')
converter_hessianlocallylinearembedding_modular(*parameter_list[0])
# In this example toy data is being processed using the Isomap algorithm
# as described in
#
# Silva, V. D., & Tenenbaum, J. B. (2003).
# Global versus local methods in nonlinear dimensionality reduction.
# Advances in Neural Information Processing Systems 15, 15(Figure 2), 721-728. MIT Press.
# Retrieved from http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.3407&rep=rep1&type=pdf
#
# Before applying to the data the landmark approximation is enabled with
# specified number of landmarks. The landmark approximation is described in
#
# Sparse multidimensional scaling using landmark points
# V De Silva, J B Tenenbaum (2004) Technology, p. 1-4
#
# After enabling the landmark approximation, the k parameter -- the number
# of neighbors in the k nearest neighbor graph -- is initialized.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data]]
def converter_isomap_modular (data_fname):
from modshogun import RealFeatures, CSVFile
from modshogun import Isomap
features = RealFeatures(CSVFile(data_fname))
converter = Isomap()
converter.set_k(20)
converter.set_target_dim(1)
converter.apply(features)
return features
if __name__=='__main__':
print('Isomap')
#converter_isomap_modular(*parameter_list[0])
# In this example toy data is being processed using the kernel extension
# of the Locally Linear Embedding (LLE) algorithm as described in
#
# Kayo, O. (2006). Locally linear embedding algorithm. Extensions and applications. October.
# Retrieved from: http://herkules.oulu.fi/isbn9514280415/isbn9514280415.pd
#
# A linear kernel is used as the kernel of the extension.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_kernellocallylinearembedding_modular (data_fname,k):
try:
from modshogun import RealFeatures, LinearKernel, CSVFile
try:
from modshogun import KernelLocallyLinearEmbedding
except ImportError:
print("KernelLocallyLinearEmbedding not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
kernel = LinearKernel()
converter = KernelLocallyLinearEmbedding(kernel)
converter.set_target_dim(1)
converter.set_k(k)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('KernelLocallyLinearEmbedding')
converter_kernellocallylinearembedding_modular(*parameter_list[0])
# In this example toy data is being processed using Laplacian Eigenmaps
# algorithm as described in
#
# Belkin, M., & Niyogi, P. (2002).
# Laplacian Eigenmaps and Spectral Techniques for Embedding and Clustering.
# Science, 14, 585-591. MIT Press.
# Retrieved from http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.19.9400&rep=rep1&type=pdf
#
# The number of neighbors for the kNN graph and the heat-distribution
# coefficient are set before processing the data.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_laplacianeigenmaps_modular (data_fname,k):
try:
from modshogun import RealFeatures, CSVFile
try:
from modshogun import LaplacianEigenmaps
except ImportError:
print("LaplacianEigenmaps not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
converter = LaplacianEigenmaps()
converter.set_target_dim(1)
converter.set_k(k)
converter.set_tau(20.0)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('LaplacianEigenmaps')
converter_laplacianeigenmaps_modular(*parameter_list[0])
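# Supplementary sketch (plain numpy): the role of tau. Edges of the kNN graph
# are weighted with the heat kernel W_ij = exp(-||x_i - x_j||^2 / tau); the
# dense version is shown here, ignoring the kNN sparsification.
import numpy as np

def heat_weights(X, tau=20.0):
    # X: (dim, n) with one example per column
    sq = ((X[:, :, None] - X[:, None, :])**2).sum(0)
    return np.exp(-sq / tau)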
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_linearlocaltangentspacealignment_modular (data_fname,k):
try:
from modshogun import RealFeatures, CSVFile
try:
from modshogun import LinearLocalTangentSpaceAlignment
except ImportError:
print("LinearLocalTangentSpaceAlignment not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
converter = LinearLocalTangentSpaceAlignment()
converter.set_target_dim(1)
converter.set_k(k)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('LinearLocalTangentSpaceAlignment')
converter_linearlocaltangentspacealignment_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_localitypreservingprojections_modular (data_fname,k):
from modshogun import RealFeatures, CSVFile
from modshogun import LocalityPreservingProjections
features = RealFeatures(CSVFile(data_fname))
converter = LocalityPreservingProjections()
converter.set_target_dim(1)
converter.set_k(k)
converter.set_tau(2.0)
converter.apply(features)
return features
if __name__=='__main__':
print('LocalityPreservingProjections')
#converter_localitypreservingprojections_modular(*parameter_list[0])
# In this example toy data is being preprocessed using the Locally Linear Embedding (LLE)
# algorithm as described in
#
# Saul, L. K., Ave, P., Park, F., & Roweis, S. T. (2001).
# An Introduction to Locally Linear Embedding. Available from, 290(5500), 2323-2326.
# Retrieved from: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7319&rep=rep1&type=pdf
#
# The number of neighbors used during the linear reconstruction step of the algorithm is set
# before processing of the data.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_locallylinearembedding_modular (data_fname,k):
try:
from modshogun import RealFeatures, CSVFile
try:
from modshogun import LocallyLinearEmbedding
except ImportError:
print("LocallyLinearEmbedding not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
converter = LocallyLinearEmbedding()
converter.set_target_dim(1)
converter.set_k(k)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('LocallyLinearEmbedding')
converter_locallylinearembedding_modular(*parameter_list[0])
# In this example toy data is being processed using the Local Tangent Space
# Alignment (LTSA) algorithm as described in
#
# Zhang, Z., & Zha, H. (2002). Principal Manifolds
# and Nonlinear Dimension Reduction via Local Tangent Space Alignment.
# Journal of Shanghai University English Edition, 8(4), 406-424. SIAM.
# Retrieved from http://arxiv.org/abs/cs/0212008
#
# Before processing, the number of neighbors used for computing the local
# tangent space is set.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_localtangentspacealignment_modular (data_fname,k):
try:
from modshogun import RealFeatures, CSVFile
try:
from modshogun import LocalTangentSpaceAlignment
except ImportError:
print("LocalTangentSpaceAlignment not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
converter = LocalTangentSpaceAlignment()
converter.set_target_dim(1)
converter.set_k(k)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('LocalTangentSpaceAlignment')
converter_localtangentspacealignment_modular(*parameter_list[0])
# In this example toy data is being processed using the multidimensional
# scaling as described on p.261 (Section 12.1) of
#
# Borg, I., & Groenen, P. J. F. (2005).
# Modern multidimensional scaling: Theory and applications. Springer.
#
# Before processing, the landmark approximation is disabled.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data]]
def converter_multidimensionalscaling_modular (data_fname):
try:
import numpy
from modshogun import RealFeatures, MultidimensionalScaling, EuclideanDistance, CSVFile
features = RealFeatures(CSVFile(data_fname))
distance_before = EuclideanDistance()
distance_before.init(features,features)
converter = MultidimensionalScaling()
converter.set_target_dim(2)
converter.set_landmark(False)
embedding = converter.apply(features)
distance_after = EuclideanDistance()
distance_after.init(embedding,embedding)
distance_matrix_after = distance_after.get_distance_matrix()
distance_matrix_before = distance_before.get_distance_matrix()
return numpy.linalg.norm(distance_matrix_after-distance_matrix_before)/numpy.linalg.norm(distance_matrix_before) < 1e-6
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('MultidimensionalScaling')
converter_multidimensionalscaling_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data, 20]]
def converter_stochasticproximityembedding_modular (data_fname, k):
try:
from modshogun import RealFeatures,StochasticProximityEmbedding, SPE_GLOBAL, SPE_LOCAL, CSVFile
features = RealFeatures(CSVFile(data_fname))
converter = StochasticProximityEmbedding()
converter.set_target_dim(1)
converter.set_nupdates(40)
# Embed with local strategy
converter.set_k(k)
converter.set_strategy(SPE_LOCAL)
converter.embed(features)
# Embed with global strategy
converter.set_strategy(SPE_GLOBAL)
converter.embed(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('StochasticProximityEmbedding')
converter_stochasticproximityembedding_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data]]
def converter_tdistributedstochasticneighborembedding_modular(data_fname, seed=1):
try:
from modshogun import RealFeatures, TDistributedStochasticNeighborEmbedding
from modshogun import Math_init_random, CSVFile
# reproducible results
Math_init_random(seed)
features = RealFeatures(CSVFile(data_fname))
converter = TDistributedStochasticNeighborEmbedding()
converter.set_target_dim(2)
embedding = converter.apply(features)
return embedding
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('TDistributedStochasticNeighborEmbedding')
converter_tdistributedstochasticneighborembedding_modular(*parameter_list[0])
# The pattern applied below, loading input data from a file and feeding it
# to a distance object, is the basic workflow for using any of the distance
# functions provided by shogun in your own applications.
#
# First, determine the type of your data, since that determines which
# distance functions you can use.
#
# This example loads two matrices of real values from different files and
# wraps them in 'RealFeatures'. Each column of a matrix corresponds to one
# data point.
#
# The distance object is first initialized with the training data on both
# sides, so 'get_distance_matrix' computes the pairwise distance matrix
# within that set.
#
# A subsequent call to 'init' binds the training and test sets, after which
# 'get_distance_matrix' computes the pairwise distance matrix between the
# two sets. Note that the previously computed distance matrix can no longer
# be retrieved after re-initialization.
#
# For more details see doc/classshogun_1_1CBrayCurtisDistance.html.
#
# Using the Bray-Curtis distance is, of course, not limited to this showcase
# example.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_braycurtis_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, BrayCurtisDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=BrayCurtisDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('BrayCurtisDistance')
distance_braycurtis_modular(*parameter_list[0])
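# Supplementary sketch (plain numpy): the Bray-Curtis dissimilarity between
# two vectors is sum(|x_i - y_i|) / sum(|x_i + y_i|), computed here pairwise
# to mirror the distance-matrix usage above.
import numpy as np

def braycurtis(x, y):
    denom = np.abs(x + y).sum()
    return np.abs(x - y).sum() / denom if denom != 0 else 0.0

def pairwise_matrix(A, B, dist=braycurtis):
    # one data point per column, as in RealFeatures above
    return np.array([[dist(a, b) for b in B.T] for a in A.T])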
# The pattern applied below, loading input data from a file and feeding it
# to a distance object, is the basic workflow for using any of the distance
# functions provided by shogun in your own applications.
#
# First, determine the type of your data, since that determines which
# distance functions you can use.
#
# This example loads two matrices of real values from different files and
# wraps them in 'RealFeatures'. Each column of a matrix corresponds to one
# data point.
#
# The distance object is first initialized with the training data on both
# sides, so 'get_distance_matrix' computes the pairwise distance
# (dissimilarity ratio) matrix within that set.
#
# A subsequent call to 'init' binds the training and test sets, after which
# 'get_distance_matrix' computes the pairwise distance matrix between the
# two sets. Note that the previously computed distance matrix can no longer
# be retrieved after re-initialization.
#
# For more details see doc/classshogun_1_1CCanberraMetric.html.
#
# Using the Canberra distance is, of course, not limited to this showcase
# example.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_canberra_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, CanberraMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=CanberraMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('CanberraMetric')
distance_canberra_modular(*parameter_list[0])
# This example shows how to compute the Canberra Word Distance.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindna,testdna,3,0,False],[traindna,testdna,3,0,False]]
def distance_canberraword_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False):
from modshogun import StringCharFeatures, StringWordFeatures, DNA
from modshogun import SortWordString
from modshogun import CanberraWordDistance
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
distance=CanberraWordDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('CanberraWordDistance')
distance_canberraword_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (maximum of absolute feature dimension differences) matrix is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (maximum of absolute feature dimension differences) matrix between these
# two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CChebyshewMetric.html.
#
# Obviously, using the Chebyshev distance (spelled 'Chebyshew' in the shogun
# class name) is not limited to this showcase example.
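#
# For reference, the Chebyshev distance between two vectors x and y is
# d(x,y) = max_i |x_i - y_i|.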
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_chebyshew_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, ChebyshewMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=ChebyshewMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('ChebyshewMetric')
distance_chebyshew_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CChiSquareDistance.html.
#
# Obviously, using the ChiSquare distance is not limited to this showcase
# example.
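#
# For reference, the chi-square distance between two vectors x and y is
# commonly defined as d(x,y) = sum_i (x_i - y_i)^2 / (x_i + y_i).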
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_chisquare_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, ChiSquareDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=ChiSquareDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('ChiSquareDistance')
distance_chisquare_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CCosineDistance.html.
#
# Obviously, using the Cosine distance is not limited to this showcase
# example.
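#
# For reference, the cosine distance between two vectors x and y is
# d(x,y) = 1 - <x,y> / (||x|| * ||y||).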
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_cosine_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, CosineDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=CosineDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('CosineDistance')
distance_cosine_modular(*parameter_list[0])
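# This example implements a custom Euclidean distance in Python by subclassing
# DirectorDistance (requires shogun built with swig directors) and compares it
# against the built-in EuclideanDistance on random data.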
#!/usr/bin/env python
import numpy
from modshogun import RealFeatures, MSG_DEBUG
numpy.random.seed(17)
traindat = numpy.random.random_sample((10,10))
testdat = numpy.random.random_sample((10,10))
parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.4]]
def distance_director_euclidean_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.2):
try:
from modshogun import DirectorDistance
except ImportError:
print("recompile shogun with --enable-swig-directors")
return
class DirectorEuclideanDistance(DirectorDistance):
def __init__(self):
DirectorDistance.__init__(self, True)
def distance_function(self, idx_a, idx_b):
seq1 = self.get_lhs().get_feature_vector(idx_a)
seq2 = self.get_rhs().get_feature_vector(idx_b)
return numpy.linalg.norm(seq1-seq2)
from modshogun import EuclideanDistance
from modshogun import Time
feats_train=RealFeatures(fm_train_real)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_train.parallel.set_num_threads(1)
feats_test=RealFeatures(fm_test_real)
distance=EuclideanDistance()
distance.init(feats_train, feats_test)
ddistance=DirectorEuclideanDistance()
ddistance.init(feats_train, feats_test)
#print "dm_train"
t=Time()
dm_train=distance.get_distance_matrix()
#t1=t.cur_time_diff(True)
#print "ddm_train"
t=Time()
ddm_train=ddistance.get_distance_matrix()
#t2=t.cur_time_diff(True)
#print "dm_train", dm_train
#print "ddm_train", ddm_train
return dm_train, ddm_train
if __name__=='__main__':
print('DirectorEuclideanDistance')
distance_director_euclidean_modular(*parameter_list[0])
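# This example computes the standard Euclidean distance,
# d(x,y) = sqrt(sum_i (x_i - y_i)^2), for real-valued features loaded from a file.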
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,traindat],[traindat,testdat]]
def distance_euclidean_modular(train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('EuclideanDistance')
distance_euclidean_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a
# pairwise distance (shortest path on a sphere) matrix is computed
# by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (shortest path on a sphere) matrix between these two data sets is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CGeodesicMetric.html.
#
# Obviously, using the Geodesic distance is not limited to this showcase
# example.
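#
# For reference, the geodesic distance here is, to our understanding, the angle
# d(x,y) = arccos(<x,y> / (||x|| * ||y||)) between the two vectors.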
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_geodesic_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, GeodesicMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=GeodesicMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('GeodesicMetric')
distance_geodesic_modular(*parameter_list[0])
# This example shows how to compute the Hamming Word Distance for string features.
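# The Hamming distance between two word vectors counts the number of entries
# in which they differ (with use_sign, only presence/absence of each word is
# compared).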
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
testdat = lm.load_labels('../data/fm_test_real.dat')
parameter_list = [[traindna,testdna,testdat,4,0,False,False],
[traindna,testdna,testdat,3,0,False,False]]
def distance_hammingword_modular (fm_train_dna=traindna,fm_test_dna=testdna,
fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False):
from modshogun import StringCharFeatures, StringWordFeatures, DNA
from modshogun import SortWordString
from modshogun import HammingWordDistance
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
distance=HammingWordDistance(feats_train, feats_train, use_sign)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('HammingWordDistance')
distance_hammingword_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (divergence measure based on the Kullback-Leibler divergence) matrix
# is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (divergence measure based on the Kullback-Leibler divergence) matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CJensenMetric.html.
#
# Obviously, using the Jensen-Shannon distance/divergence is not limited to
# this showcase example.
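#
# For reference, the divergence used here is, roughly, the Jensen difference
# d(x,y) = sum_i [ x_i*log(2*x_i/(x_i+y_i)) + y_i*log(2*y_i/(x_i+y_i)) ].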
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_jensen_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, JensenMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=JensenMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('JensenMetric')
distance_jensen_modular(*parameter_list[0])
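# This example shows how to compute the Mahalanobis distance,
# d(x,y) = sqrt((x-y)' * C^-1 * (x-y)), where C is the covariance matrix
# estimated from the training features.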
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat, testdat]]
def distance_mahalanobis_modular (train_fname = traindat, test_fname = testdat):
from modshogun import RealFeatures, CSVFile
from modshogun import MahalanobisDistance
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance = MahalanobisDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('MahalanobisDistance')
distance_mahalanobis_modular(*parameter_list[0])
# This example shows how to compute the Manhattan Distance.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_manhatten_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, ManhattanMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=ManhattanMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('ManhattanMetric')
distance_manhatten_modular(*parameter_list[0])
# This example shows how to compute the Manhattan Distance for string features.
#!/usr/bin/env python
traindna = '../data/fm_train_dna.dat'
testdna = '../data/fm_test_dna.dat'
parameter_list = [[traindna,testdna,3,0,False],[traindna,testdna,4,0,False]]
def distance_manhattenword_modular (train_fname=traindna,test_fname=testdna,order=3,gap=0,reverse=False):
from modshogun import StringCharFeatures, StringWordFeatures, DNA
from modshogun import SortWordString, ManhattanWordDistance, CSVFile
charfeat=StringCharFeatures(CSVFile(train_fname), DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(CSVFile(test_fname), DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
distance=ManhattanWordDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return dm_train,dm_test
if __name__=='__main__':
print('ManhattanWordDistance')
distance_manhattenword_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) and norm 'k' controls the processing of the given data points,
# where a pairwise distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CMinkowskiMetric.html.
#
# Obviously, using the Minkowski metric is not limited to this showcase
# example.
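#
# For reference, the Minkowski distance of order k between two vectors x and y
# is d(x,y) = (sum_i |x_i - y_i|^k)^(1/k); k=2 recovers the Euclidean distance.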
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat,3],[traindat,testdat,4]]
def distance_minkowski_modular (train_fname=traindat,test_fname=testdat,k=3):
from modshogun import RealFeatures, MinkowskiMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=MinkowskiMetric(feats_train, feats_train, k)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('MinkowskiMetric')
distance_minkowski_modular(*parameter_list[0])
# In this example a squared Euclidean distance is computed for toy data.
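# Calling set_disable_sqrt(True) on EuclideanDistance yields the squared
# Euclidean distance d(x,y) = sum_i (x_i - y_i)^2.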
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_normsquared_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
distance.set_disable_sqrt(True)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('EuclideanDistance - NormSquared')
distance_normsquared_modular(*parameter_list[0])
# In this example a sparse Euclidean distance is computed for sparse toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_sparseeuclidean_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, SparseRealFeatures, SparseEuclideanDistance, CSVFile
realfeat=RealFeatures(CSVFile(train_fname))
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(CSVFile(test_fname))
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
distance=SparseEuclideanDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('SparseEuclideanDistance')
distance_sparseeuclidean_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (extended Jaccard coefficient) matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (extended Jaccard coefficient) matrix between these two data sets is computed
# by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CTanimotoDistance.html.
#
# Obviously, using the Tanimoto distance/coefficient is not limited to
# this showcase example.
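#
# For reference, the Tanimoto (extended Jaccard) distance between two vectors
# x and y is commonly defined as
# d(x,y) = 1 - <x,y> / (||x||^2 + ||y||^2 - <x,y>).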
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_tanimoto_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, TanimotoDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=TanimotoDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('TanimotoDistance')
distance_tanimoto_modular(*parameter_list[0])
# In this example the Histogram distribution object computes a histogram over
# all 16-bit unsigned integers (words) in the features.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
parameter_list = [[traindna,3,0,False],[traindna,4,0,False]]
def distribution_histogram_modular (fm_dna=traindna,order=3,gap=0,reverse=False):
from modshogun import StringWordFeatures, StringCharFeatures, DNA
from modshogun import Histogram
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_dna)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
histo=Histogram(feats)
histo.train()
histo.get_histogram()
num_examples=feats.get_num_vectors()
num_param=histo.get_num_model_parameters()
#for i in xrange(num_examples):
# for j in xrange(num_param):
# histo.get_log_derivative(j, i)
out_likelihood = histo.get_log_likelihood()
out_sample = histo.get_log_likelihood_sample()
return histo,out_sample,out_likelihood
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
print('Histogram')
distribution_histogram_modular(*parameter_list[0])
# In this example a hidden Markov model with 3 states and 6 transitions is
# trained on a string data set. After calling the constructor of the HMM class,
# specifying the number of states and transitions, the model is trained. Via the
# Baum-Welch algorithm the optimal transition and emission probabilities are
# estimated. The best path, i.e. the path with the highest probability given the
# model, can then be calculated using get_best_path_state.
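# (Baum-Welch is an expectation-maximization procedure for HMM parameter
# estimation; the most probable state sequence itself is obtained with the
# Viterbi algorithm.)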
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data=lm.load_cubes('../data/fm_train_cube.dat')
parameter_list=[[data, 1, 64, 1e-5, 2, 0, False, 5], [data, 3, 6, 1e-1, 1, 0, False, 2]]
def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
from modshogun import StringWordFeatures, StringCharFeatures, CUBE
from modshogun import HMM, BW_NORMAL
charfeat=StringCharFeatures(CUBE)
charfeat.set_features(fm_cube)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
hmm=HMM(feats, N, M, pseudo)
hmm.train()
hmm.baum_welch_viterbi_train(BW_NORMAL)
num_examples=feats.get_num_vectors()
num_param=hmm.get_num_model_parameters()
for i in range(num_examples):
for j in range(num_param):
hmm.get_log_derivative(j, i)
best_path=0
best_path_state=0
for i in range(num_examples):
best_path+=hmm.best_path(i)
for j in range(N):
best_path_state+=hmm.get_best_path_state(i, j)
lik_example = hmm.get_log_likelihood()
lik_sample = hmm.get_log_likelihood_sample()
return lik_example, lik_sample, hmm
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
print('HMM')
distribution_hmm_modular(*parameter_list[0])
# Trains an inhomogeneous Markov chain of order 3 on a DNA string data set. Due
# to the structure of the Markov chain it is very similar to an HMM with just
# one chain of connected hidden states, which is why we term it a linear HMM.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
parameter_list = [[traindna,3,0,False],[traindna,4,0,False]]
def distribution_linearhmm_modular (fm_dna=traindna,order=3,gap=0,reverse=False):
from modshogun import StringWordFeatures, StringCharFeatures, DNA
from modshogun import LinearHMM
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_dna)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
hmm=LinearHMM(feats)
hmm.train()
hmm.get_transition_probs()
num_examples=feats.get_num_vectors()
num_param=hmm.get_num_model_parameters()
for i in range(num_examples):
for j in range(num_param):
hmm.get_log_derivative(j, i)
out_likelihood = hmm.get_log_likelihood()
out_sample = hmm.get_log_likelihood_sample()
return hmm,out_likelihood ,out_sample
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
print('LinearHMM')
distribution_linearhmm_modular(*parameter_list[0])
# In this example the usage of the Positional PWM distribution is shown.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
parameter_list = [[traindna,3],[traindna,4]]
def distribution_ppwm_modular (fm_dna=traindna, order=3):
from modshogun import StringByteFeatures, StringCharFeatures, DNA
from modshogun import PositionalPWM
from numpy import array,e,log,exp
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_dna)
feats=StringByteFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, 0, False)
L=20
k=3
sigma = 1
mu = 4
ppwm=PositionalPWM()
ppwm.set_sigma(sigma)
ppwm.set_mean(mu)
# note: this first PWM is unused and immediately overwritten below; it is kept
# commented out for reference
#pwm=array([[0.0, 0.5, 0.1, 1.0],
#           [0.0, 0.5, 0.5, 0.0],
#           [1.0, 0.0, 0.4, 0.0],
#           [0.0, 0.0, 0.0, 0.0]])
pwm=array([[0.01,0.09,0.1],[0.09,0.01,0.1],[0.85,0.4,0.1],[0.05,0.5,0.7]])
ppwm.set_pwm(log(pwm))
#print(ppwm.get_pwm())
ppwm.compute_w(L)
w=ppwm.get_w()
#print(w)
#from pylab import *
#figure(1)
#pcolor(exp(w))
#pcolor(w)
#colorbar()
#figure(2)
ppwm.compute_scoring(1)
u=ppwm.get_scoring(0)
#pcolor(exp(u))
#show()
#ppwm=PositionalPWM(feats)
#ppwm.train()
#out_likelihood = histo.get_log_likelihood()
#out_sample = histo.get_log_likelihood_sample()
return w,u
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
print('PositionalPWM')
distribution_ppwm_modular(*parameter_list[0])
# Example of how to evaluate clustering performance (given ground truth)
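# Since cluster indices are arbitrary, ClusteringAccuracy first finds the best
# one-to-one mapping between predicted clusters and ground-truth classes
# (via best_map) before computing the accuracy.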
#!/usr/bin/env python
def get_dataset():
from os.path import exists
filename = "../../../data/uci/optdigits/optdigits.tes"
if exists(filename):
return open(filename)
else:
# print("Retrieving data...")
try:
from urllib2 import urlopen
except ImportError:
from urllib.request import urlopen
return urlopen("http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes")
def prepare_data():
from numpy import loadtxt
stream = get_dataset()
# print("Loading data...")
data = loadtxt(stream, delimiter=',')
fea = data[:, :-1]
gnd = data[:, -1]
return (fea.T, gnd)
(fea, gnd_raw) = prepare_data()
parameter_list = [[fea, gnd_raw, 10]]
def run_clustering(data, k):
from modshogun import KMeans
from modshogun import EuclideanDistance
from modshogun import RealFeatures
fea = RealFeatures(data)
distance = EuclideanDistance(fea, fea)
kmeans=KMeans(k, distance)
# print("Running clustering...")
kmeans.train()
return kmeans.get_cluster_centers()
def assign_labels(data, centroids, ncenters):
from modshogun import EuclideanDistance
from modshogun import RealFeatures, MulticlassLabels
from modshogun import KNN
from numpy import arange
labels = MulticlassLabels(arange(0.,ncenters))
fea = RealFeatures(data)
fea_centroids = RealFeatures(centroids)
distance = EuclideanDistance(fea_centroids, fea_centroids)
knn = KNN(1, distance, labels)
knn.train()
return knn.apply(fea)
def evaluation_clustering (features=fea, ground_truth=gnd_raw, ncenters=10):
from modshogun import ClusteringAccuracy, ClusteringMutualInformation
from modshogun import MulticlassLabels
from modshogun import Math
# reproducible results
Math.init_random(1)
centroids = run_clustering(features, ncenters)
gnd_hat = assign_labels(features, centroids, ncenters)
gnd = MulticlassLabels(ground_truth)
AccuracyEval = ClusteringAccuracy()
AccuracyEval.best_map(gnd_hat, gnd)
accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
#print(('Clustering accuracy = %.4f' % accuracy))
MIEval = ClusteringMutualInformation()
mutual_info = MIEval.evaluate(gnd_hat, gnd)
#print(('Clustering mutual information = %.4f' % mutual_info))
# TODO mutual information does not work with serialization
#return gnd, gnd_hat, accuracy, MIEval, mutual_info
return gnd, gnd_hat, accuracy
if __name__ == '__main__':
print('Evaluation Clustering')
evaluation_clustering(*parameter_list[0])
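# Example of evaluating clustering on streamed Gaussian blob data, where an
# approximate ground truth is known by construction.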
#!/usr/bin/env python
parameter_list = [[1000,2,8],[1000,4,8]]
from numpy import *
#from pylab import *
def run_clustering(data, k):
from modshogun import KMeans
from modshogun import Math_init_random
from modshogun import EuclideanDistance
from modshogun import RealFeatures
fea = RealFeatures(data)
distance = EuclideanDistance(fea, fea)
kmeans=KMeans(k, distance)
#print("Running clustering...")
kmeans.train()
return kmeans.get_cluster_centers()
def assign_labels(data, centroids, ncenters):
from modshogun import EuclideanDistance
from modshogun import RealFeatures, MulticlassLabels
from modshogun import KNN
from numpy import arange
labels = MulticlassLabels(arange(0.,ncenters))
fea = RealFeatures(data)
fea_centroids = RealFeatures(centroids)
distance = EuclideanDistance(fea_centroids, fea_centroids)
knn = KNN(1, distance, labels)
knn.train()
return knn.apply(fea)
def evaluation_clustering_simple (n_data=100, sqrt_num_blobs=4, distance=5):
from modshogun import ClusteringAccuracy, ClusteringMutualInformation
from modshogun import MulticlassLabels, GaussianBlobsDataGenerator
from modshogun import Math
# reproducible results
Math.init_random(1)
# produce some Gaussian blobs to cluster
ncenters=sqrt_num_blobs**2
stretch=1
angle=1
gen=GaussianBlobsDataGenerator(sqrt_num_blobs, distance, stretch, angle)
features=gen.get_streamed_features(n_data)
X=features.get_feature_matrix()
# compute approximate "ground truth" labels by taking the closest blob mean
coords=array(range(0,sqrt_num_blobs*distance,distance))
idx_0=[abs(coords -x).argmin() for x in X[0]]
idx_1=[abs(coords -x).argmin() for x in X[1]]
ground_truth=array([idx_0[i]*sqrt_num_blobs + idx_1[i] for i in range(n_data)], dtype="float64")
#for label in unique(ground_truth):
# indices=ground_truth==label
# plot(X[0][indices], X[1][indices], 'o')
#show()
centroids = run_clustering(features, ncenters)
gnd_hat = assign_labels(features, centroids, ncenters)
gnd = MulticlassLabels(ground_truth)
AccuracyEval = ClusteringAccuracy()
AccuracyEval.best_map(gnd_hat, gnd)
accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
# in this case we know that the clustering has to be very good
#print(('Clustering accuracy = %.4f' % accuracy))
assert(accuracy>0.8)
MIEval = ClusteringMutualInformation()
mutual_info = MIEval.evaluate(gnd_hat, gnd)
#print(('Clustering mutual information = %.4f' % mutual_info))
return gnd, accuracy, mutual_info
if __name__ == '__main__':
print('Evaluation Clustering')
evaluation_clustering_simple(*parameter_list[0])
# In this example various measures (accuracy, error rate, ...) are computed
# for a pair of ground-truth toy labels and random predictions.
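# All of these measures derive from the 2x2 contingency table of true/false
# positives/negatives, e.g. accuracy = (TP+TN)/N, recall = TP/(TP+FN) and
# precision = TP/(TP+FP).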
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))
parameter_list = [[ground_truth,predicted]]
def evaluation_contingencytableevaluation_modular (ground_truth, predicted):
from modshogun import BinaryLabels
from modshogun import ContingencyTableEvaluation
from modshogun import AccuracyMeasure,ErrorRateMeasure,BALMeasure
from modshogun import WRACCMeasure,F1Measure,CrossCorrelationMeasure
from modshogun import RecallMeasure,PrecisionMeasure,SpecificityMeasure
ground_truth_labels = BinaryLabels(ground_truth)
predicted_labels = BinaryLabels(predicted)
base_evaluator = ContingencyTableEvaluation()
base_evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = AccuracyMeasure()
accuracy = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = ErrorRateMeasure()
errorrate = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = BALMeasure()
bal = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = WRACCMeasure()
wracc = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = F1Measure()
f1 = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = CrossCorrelationMeasure()
crosscorrelation = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = RecallMeasure()
recall = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = PrecisionMeasure()
precision = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = SpecificityMeasure()
specificity = evaluator.evaluate(predicted_labels,ground_truth_labels)
return accuracy, errorrate, bal, wracc, f1, crosscorrelation, recall, precision, specificity
if __name__=='__main__':
print('EvaluationContingencyTableEvaluation')
evaluation_contingencytableevaluation_modular(*parameter_list[0])
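# This example shows how to cross-validate a LibLinear binary classifier using
# stratified splitting and accuracy as the evaluation criterion, repeating the
# procedure 10 times.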
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
from numpy.random import randn
from numpy import *
# generate some overlapping training vectors
num_vectors=100
vec_distance=1
traindat=concatenate((randn(2,num_vectors)-vec_distance,
randn(2,num_vectors)+vec_distance), axis=1)
label_traindat=concatenate((-ones(num_vectors), ones(num_vectors)));
parameter_list = [[traindat,label_traindat]]
def evaluation_cross_validation_classification (traindat=traindat, label_traindat=label_traindat):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import ContingencyTableEvaluation, ACCURACY
from modshogun import StratifiedCrossValidationSplitting
from modshogun import BinaryLabels
from modshogun import RealFeatures
from modshogun import LibLinear, L2R_L2LOSS_SVC
# training data
features=RealFeatures(traindat)
labels=BinaryLabels(label_traindat)
# classifier
classifier=LibLinear(L2R_L2LOSS_SVC)
# splitting strategy for 5-fold cross-validation (for classification it is
# better to use "StratifiedCrossValidationSplitting", but the standard
# "CrossValidationSplitting" is also available)
splitting_strategy=StratifiedCrossValidationSplitting(labels, 5)
# evaluation method
evaluation_criterium=ContingencyTableEvaluation(ACCURACY)
# cross-validation instance
cross_validation=CrossValidation(classifier, features, labels,
splitting_strategy, evaluation_criterium)
cross_validation.set_autolock(False)
# (optional) repeat x-val 10 times
cross_validation.set_num_runs(10)
# perform cross-validation and print results
result=cross_validation.evaluate()
#print("mean:", result.mean)
if __name__=='__main__':
print('Evaluation CrossValidationClassification')
evaluation_cross_validation_classification(*parameter_list[0])
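# This example shows how to record the MKL kernel weights of every
# cross-validation fold via CrossValidationMKLStorage.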
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
from numpy.random import randn
from numpy import *
# generate some overlapping training vectors
num_vectors=5
vec_distance=1
traindat=concatenate((randn(2,num_vectors)-vec_distance,
randn(2,num_vectors)+vec_distance), axis=1)
label_traindat=concatenate((-ones(num_vectors), ones(num_vectors)));
parameter_list = [[traindat,label_traindat]]
def evaluation_cross_validation_mkl_weight_storage(traindat=traindat, label_traindat=label_traindat):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import CrossValidationPrintOutput
from modshogun import CrossValidationMKLStorage
from modshogun import ContingencyTableEvaluation, ACCURACY
from modshogun import StratifiedCrossValidationSplitting
from modshogun import BinaryLabels
from modshogun import RealFeatures, CombinedFeatures
from modshogun import GaussianKernel, CombinedKernel
from modshogun import LibSVM, MKLClassification
# training data, combined features all on same data
features=RealFeatures(traindat)
comb_features=CombinedFeatures()
comb_features.append_feature_obj(features)
comb_features.append_feature_obj(features)
comb_features.append_feature_obj(features)
labels=BinaryLabels(label_traindat)
# kernel, different Gaussians combined
kernel=CombinedKernel()
kernel.append_kernel(GaussianKernel(10, 0.1))
kernel.append_kernel(GaussianKernel(10, 1))
kernel.append_kernel(GaussianKernel(10, 2))
# create MKL using LibSVM (due to a memory bug, interleaved optimization is
# not possible)
svm=MKLClassification(LibSVM())
svm.set_interleaved_optimization_enabled(False)
svm.set_kernel(kernel)
# splitting strategy for 5-fold cross-validation (for classification it is
# better to use "StratifiedCrossValidationSplitting", but the standard
# "CrossValidationSplitting" is also available)
splitting_strategy=StratifiedCrossValidationSplitting(labels, 5)
# evaluation method
evaluation_criterium=ContingencyTableEvaluation(ACCURACY)
# cross-validation instance
cross_validation=CrossValidation(svm, comb_features, labels,
splitting_strategy, evaluation_criterium)
cross_validation.set_autolock(False)
# append cross-validation output classes
#cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
mkl_storage=CrossValidationMKLStorage()
cross_validation.add_cross_validation_output(mkl_storage)
cross_validation.set_num_runs(3)
# perform cross-validation
result=cross_validation.evaluate()
# print mkl weights
weights=mkl_storage.get_mkl_weights()
#print "mkl weights during cross--validation"
#print weights
if __name__=='__main__':
print('Evaluation CrossValidationClassification')
evaluation_cross_validation_mkl_weight_storage(*parameter_list[0])
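# This example shows how to store per-fold multiclass ROC curves and binary
# evaluation results (here F1) via CrossValidationMulticlassStorage.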
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
from numpy.random import randn, seed
from numpy import *
# generate some overlapping training vectors
seed(1)
num_vectors=7
vec_distance=1
traindat=concatenate((randn(2,num_vectors)-vec_distance,
randn(2,num_vectors)+vec_distance), axis=1)
label_traindat=concatenate((zeros(num_vectors), ones(num_vectors)));
parameter_list = [[traindat,label_traindat]]
def evaluation_cross_validation_multiclass_storage (traindat=traindat, label_traindat=label_traindat):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import CrossValidationPrintOutput
from modshogun import CrossValidationMKLStorage, CrossValidationMulticlassStorage
from modshogun import MulticlassAccuracy, F1Measure
from modshogun import StratifiedCrossValidationSplitting
from modshogun import MulticlassLabels
from modshogun import RealFeatures, CombinedFeatures
from modshogun import GaussianKernel, CombinedKernel
from modshogun import MKLMulticlass
from modshogun import Statistics, MSG_DEBUG, Math
Math.init_random(1)
# training data, combined features all on same data
features=RealFeatures(traindat)
comb_features=CombinedFeatures()
comb_features.append_feature_obj(features)
comb_features.append_feature_obj(features)
comb_features.append_feature_obj(features)
labels=MulticlassLabels(label_traindat)
# kernel, different Gaussians combined
kernel=CombinedKernel()
kernel.append_kernel(GaussianKernel(10, 0.1))
kernel.append_kernel(GaussianKernel(10, 1))
kernel.append_kernel(GaussianKernel(10, 2))
# create multiclass MKL with the combined kernel
svm=MKLMulticlass(1.0,kernel,labels)
svm.set_kernel(kernel)
# splitting strategy for cross-validation (for classification it is
# better to use "StratifiedCrossValidationSplitting", but the standard
# "CrossValidationSplitting" is also available)
splitting_strategy=StratifiedCrossValidationSplitting(labels, 3)
# evaluation method
evaluation_criterium=MulticlassAccuracy()
# cross-validation instance
cross_validation=CrossValidation(svm, comb_features, labels,
splitting_strategy, evaluation_criterium)
cross_validation.set_autolock(False)
# append cross-validation output classes
#cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
#mkl_storage=CrossValidationMKLStorage()
#cross_validation.add_cross_validation_output(mkl_storage)
multiclass_storage=CrossValidationMulticlassStorage()
multiclass_storage.append_binary_evaluation(F1Measure())
cross_validation.add_cross_validation_output(multiclass_storage)
cross_validation.set_num_runs(3)
# perform cross-validation
result=cross_validation.evaluate()
roc_0_0_0 = multiclass_storage.get_fold_ROC(0,0,0)
#print roc_0_0_0
auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0,0,0,0)
#print auc_0_0_0
return roc_0_0_0, auc_0_0_0
if __name__=='__main__':
print('Evaluation CrossValidationMulticlassStorage')
evaluation_cross_validation_multiclass_storage(*parameter_list[0])
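# This example shows how to cross-validate kernel ridge regression using the
# mean squared error as the evaluation criterion.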
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
traindat = '../data/fm_train_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,label_traindat,0.8,1e-6],[traindat,label_traindat,0.9,1e-7]]
def evaluation_cross_validation_regression (train_fname=traindat,label_fname=label_traindat,width=0.8,tau=1e-6):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import MeanSquaredError, CrossValidationSplitting
from modshogun import RegressionLabels, RealFeatures
from modshogun import GaussianKernel, KernelRidgeRegression, CSVFile
# training data
features=RealFeatures(CSVFile(train_fname))
labels=RegressionLabels(CSVFile(label_fname))
# kernel and predictor
kernel=GaussianKernel()
predictor=KernelRidgeRegression(tau, kernel, labels)
# splitting strategy for 5-fold cross-validation (for classification it is
# better to use "StratifiedCrossValidationSplitting", but here the standard
# splitting is used)
splitting_strategy=CrossValidationSplitting(labels, 5)
# evaluation method
evaluation_criterium=MeanSquaredError()
# cross-validation instance
cross_validation=CrossValidation(predictor, features, labels,
splitting_strategy, evaluation_criterium)
# (optional) repeat x-val 10 times
cross_validation.set_num_runs(10)
# (optional) tell the machine to precompute the kernel matrix
# (speeds things up, but may not work in all cases)
predictor.data_lock(labels, features)
# perform cross-validation and print results
result=cross_validation.evaluate()
#print("mean:", result.mean)
if __name__=='__main__':
print('Evaluation CrossValidationClassification')
evaluation_cross_validation_regression(*parameter_list[0])
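# This example implements a custom binary evaluation measure in Python by
# subclassing DirectorContingencyTableEvaluation (requires shogun built with
# swig directors); the custom score combines WRACC and BAL.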
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))
parameter_list = [[ground_truth,predicted]]
def evaluation_director_contingencytableevaluation_modular (ground_truth, predicted):
try:
from modshogun import DirectorContingencyTableEvaluation, ED_MAXIMIZE
except ImportError:
print("recompile shogun with --enable-swig-directors")
return
class SimpleWeightedBinaryEvaluator(DirectorContingencyTableEvaluation):
def __init__(self):
DirectorContingencyTableEvaluation.__init__(self)
def get_custom_direction(self):
return ED_MAXIMIZE
def get_custom_score(self):
return self.get_WRACC()+self.get_BAL()
from modshogun import BinaryLabels
evaluator = SimpleWeightedBinaryEvaluator()
r = evaluator.evaluate(BinaryLabels(ground_truth), BinaryLabels(predicted))
r2 = evaluator.get_custom_score()
print(r,r2)
return r,r2
if __name__=='__main__':
print('EvaluationDirectorContingencyTableEvaluation')
evaluation_director_contingencytableevaluation_modular(*parameter_list[0])
# In this example the mean squared error (MSE) is computed
# for a pair of random vectors of length N.
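# For reference, MSE = (1/N) * sum_i (predicted_i - ground_truth_i)^2.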
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
N = 100
random.seed(17)
ground_truth = random.randn(N)
predicted = random.randn(N)
parameter_list = [[ground_truth,predicted]]
def evaluation_meansquarederror_modular (ground_truth, predicted):
from modshogun import RegressionLabels
from modshogun import MeanSquaredError
ground_truth_labels = RegressionLabels(ground_truth)
predicted_labels = RegressionLabels(predicted)
evaluator = MeanSquaredError()
mse = evaluator.evaluate(predicted_labels,ground_truth_labels)
return mse
if __name__=='__main__':
print('MeanSquaredError')
evaluation_meansquarederror_modular(*parameter_list[0])
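# In this example the mean squared log error (MSLE) is computed for a pair of
# non-negative random vectors of length N (the absolute values below ensure
# the logarithms involved are well defined).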
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
N = 100
random.seed(17)
ground_truth = abs(random.randn(N))
predicted = abs(random.randn(N))
parameter_list = [[ground_truth,predicted]]
def evaluation_meansquaredlogerror_modular (ground_truth, predicted):
from modshogun import RegressionLabels
from modshogun import MeanSquaredLogError
ground_truth_labels = RegressionLabels(ground_truth)
predicted_labels = RegressionLabels(predicted)
evaluator = MeanSquaredLogError()
mse = evaluator.evaluate(predicted_labels,ground_truth_labels)
return mse
if __name__=='__main__':
print('EvaluationMeanSquaredLogError')
evaluation_meansquaredlogerror_modular(*parameter_list[0])
# In this example the multiclass accuracy is computed for toy data labels and
# the same labels multiplied by two.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
random.seed(17)
ground_truth = lm.load_labels('../data/label_train_multiclass.dat')
predicted = lm.load_labels('../data/label_train_multiclass.dat') * 2
parameter_list = [[ground_truth,predicted]]
def evaluation_multiclassaccuracy_modular (ground_truth, predicted):
from modshogun import MulticlassLabels
from modshogun import MulticlassAccuracy
ground_truth_labels = MulticlassLabels(ground_truth)
predicted_labels = MulticlassLabels(predicted)
evaluator = MulticlassAccuracy()
accuracy = evaluator.evaluate(predicted_labels,ground_truth_labels)
return accuracy
if __name__=='__main__':
print('MulticlassAccuracy')
evaluation_multiclassaccuracy_modular(*parameter_list[0])
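# This example trains a multiclass LibLinear machine and evaluates it with
# MulticlassOVREvaluation, which averages a binary measure (here ROC and
# accuracy) over all one-vs-rest subproblems.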
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
parameter_list = [[traindat, label_traindat]]
def evaluation_multiclassovrevaluation_modular(train_fname=traindat, label_fname=label_traindat):
from modshogun import MulticlassOVREvaluation,ROCEvaluation
from modshogun import MulticlassLibLinear,RealFeatures,ContingencyTableEvaluation,ACCURACY
from modshogun import MulticlassLabels, Math, CSVFile
Math.init_random(1)
ground_truth_labels = MulticlassLabels(CSVFile(label_fname))
svm = MulticlassLibLinear(1.0,RealFeatures(CSVFile(train_fname)),ground_truth_labels)
svm.parallel.set_num_threads(1)
svm.train()
predicted_labels = svm.apply()
binary_evaluator = ROCEvaluation()
evaluator = MulticlassOVREvaluation(binary_evaluator)
mean_roc = evaluator.evaluate(predicted_labels,ground_truth_labels)
#print mean_roc
binary_evaluator = ContingencyTableEvaluation(ACCURACY)
evaluator = MulticlassOVREvaluation(binary_evaluator)
mean_accuracy = evaluator.evaluate(predicted_labels,ground_truth_labels)
#print mean_accuracy
return mean_roc, mean_accuracy, predicted_labels, svm
if __name__=='__main__':
print('MulticlassOVREvaluation')
evaluation_multiclassovrevaluation_modular(*parameter_list[0])
# In this example the PRC (precision-recall curve) is computed
# for a pair of ground-truth toy labels and random predictions.
# The PRC curve (as a matrix) and the auPRC (area under the PRC) are returned.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))
parameter_list = [[ground_truth,predicted]]
def evaluation_prcevaluation_modular (ground_truth, predicted):
from modshogun import BinaryLabels
from modshogun import PRCEvaluation
ground_truth_labels = BinaryLabels(ground_truth)
predicted_labels = BinaryLabels(predicted)
evaluator = PRCEvaluation()
evaluator.evaluate(predicted_labels,ground_truth_labels)
return evaluator.get_PRC(), evaluator.get_auPRC()
if __name__=='__main__':
print('PRCEvaluation')
evaluation_prcevaluation_modular(*parameter_list[0])
# In this example the ROC (receiver operating characteristic) curve is computed
# for a pair of ground-truth toy labels and random predictions.
# The ROC curve (as a matrix) and the auROC (area under the ROC) are returned.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))
parameter_list = [[ground_truth,predicted]]
def evaluation_rocevaluation_modular (ground_truth, predicted):
from modshogun import BinaryLabels
from modshogun import ROCEvaluation
ground_truth_labels = BinaryLabels(ground_truth)
predicted_labels = BinaryLabels(predicted)
evaluator = ROCEvaluation()
evaluator.evaluate(predicted_labels,ground_truth_labels)
return evaluator.get_ROC(), evaluator.get_auROC()
if __name__=='__main__':
print('ROCEvaluation')
evaluation_rocevaluation_modular(*parameter_list[0])
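# This example relates ROC thresholds to the empirical true/false positive
# rates: for a given index on the ROC curve, applying the corresponding
# threshold to the outputs reproduces tp[index] and fp[index].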
#!/usr/bin/env python
parameter_list = [[1000]]
def evaluation_thresholds_modular (index):
from modshogun import BinaryLabels, ROCEvaluation
import numpy
numpy.random.seed(17)
output=numpy.arange(-1,1,0.001)
output=(0.3*output+0.7*(numpy.random.rand(len(output))-0.5))
label=[-1.0]*(len(output)//2)
label.extend([1.0]*(len(output)//2))
label=numpy.array(label)
pred=BinaryLabels(output)
truth=BinaryLabels(label)
evaluator=ROCEvaluation()
evaluator.evaluate(pred, truth)
[fp,tp]=evaluator.get_ROC()
thresh=evaluator.get_thresholds()
b=thresh[index]
#print("tpr", numpy.mean(output[label>0]>b), tp[index])
#print("fpr", numpy.mean(output[label<0]>b), fp[index])
return tp[index],fp[index],numpy.mean(output[label>0]>b),numpy.mean(output[label<0]>b)
if __name__=='__main__':
print('Evaluation with Thresholds')
evaluation_thresholds_modular(*parameter_list[0])
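# This example demonstrates BinnedDotFeatures, which represents each input
# dimension by indicators over the supplied bins; set_fill and set_norm_one
# toggle the filled and length-normalized variants, respectively.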
#!/usr/bin/env python
import numpy
matrix=numpy.array([[-1.0,0,1],[2,3,4],[5,6,7]])
bins=numpy.array([[0.0, 0.0, 0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0,3.0],[4.0,4.0,4.0]])
parameter_list = [(matrix,bins)]
def features_binned_dot_modular (matrix, bins):
from modshogun import RealFeatures, BinnedDotFeatures
rf=RealFeatures(matrix)
#print(rf.get_feature_matrix())
bf=BinnedDotFeatures(rf, bins)
filled=bf.get_computed_dot_feature_matrix()
bf.set_fill(False)
unfilled=bf.get_computed_dot_feature_matrix()
bf.set_norm_one(True)
unfilled_normed=bf.get_computed_dot_feature_matrix()
bf.set_fill(True)
filled_normed=bf.get_computed_dot_feature_matrix()
return bf,filled,unfilled,unfilled_normed,filled_normed
if __name__=='__main__':
print('BinnedDotFeatures')
features_binned_dot_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
# create dense matrix A
A=numpy.array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=numpy.uint8)
parameter_list=[[A]]
def features_dense_byte_modular (A):
from modshogun import ByteFeatures
# create dense features a
# ... of type Byte
a=ByteFeatures(A)
# print some statistics about a
#print(a.get_num_vectors())
#print(a.get_num_features())
# get first feature vector and set it
#print(a.get_feature_vector(0))
a.set_feature_vector(numpy.array([1,4,0,0,0,9], dtype=numpy.uint8), 0)
# get matrix
a_out = a.get_feature_matrix()
#print(type(a_out), a_out.dtype)
#print(a_out )
assert(numpy.all(a_out==A))
return a_out,a
if __name__=='__main__':
print('ByteFeatures')
features_dense_byte_modular(*parameter_list[0])
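# This example loads dense real-valued features from a CSV file, using a
# space character as the column delimiter.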
#!/usr/bin/env python
parameter_list=[[]]
def features_dense_io_modular():
from modshogun import RealFeatures, CSVFile
feats=RealFeatures()
f=CSVFile("../data/fm_train_real.dat","r")
f.set_delimiter(" ")
feats.load(f)
return feats
if __name__=='__main__':
print('Dense Real Features IO')
features_dense_io_modular(*parameter_list[0])
#!/usr/bin/env python
from modshogun import LongIntFeatures
from numpy import array, int64, all
# create dense matrix A
matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
parameter_list = [[matrix]]
# ... of type LongInt
def features_dense_longint_modular (A=matrix):
a=LongIntFeatures(A)
# get first feature vector and set it
a.set_feature_vector(array([1,4,0,0,0,9], dtype=int64), 0)
# get matrix
a_out = a.get_feature_matrix()
assert(all(a_out==A))
return a_out
if __name__=='__main__':
print('dense_longint')
features_dense_longint_modular(*parameter_list[0])
#!/usr/bin/env python
from modshogun import RealFeatures, LongIntFeatures, ByteFeatures
from numpy import array, float64, int64, uint8, all
# create dense matrices A,B,C
matrixA=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
matrixB=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
matrixC=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8)
# ... of type Real, LongInt and Byte
parameter_list = [[matrixA,matrixB,matrixC]]
def features_dense_modular (A=matrixA,B=matrixB,C=matrixC):
a=RealFeatures(A)
b=LongIntFeatures(B)
c=ByteFeatures(C)
# or 16bit wide ...
#feat1 = f.ShortFeatures(N.zeros((10,5),N.short))
#feat2 = f.WordFeatures(N.zeros((10,5),N.uint16))
# print some statistics about a
# get first feature vector and set it
a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)
# get matrices
a_out = a.get_feature_matrix()
b_out = b.get_feature_matrix()
c_out = c.get_feature_matrix()
assert(all(a_out==A))
assert(all(b_out==B))
assert(all(c_out==C))
return a_out,b_out,c_out,a,b,c
if __name__=='__main__':
print('dense')
features_dense_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
from modshogun import RealFeatures
from modshogun import LongIntFeatures
from numpy import array, float64, int64
# create dense matrix
data=[[1,2,3],[4,5,6],[7,8,9],[-1,-2,-3]]
parameter_list = [[data]]
def features_dense_protocols_modular (in_data=data):
m_real=array(in_data, dtype=float64, order='F')
f_real=RealFeatures(m_real)
#print m_real
#print f_real
#print f_real[-1]
#print f_real[1, 2]
#print f_real[-1:3]
#print f_real[2, 0:2]
#print f_real[0:3, 1]
#print f_real[0:3, 1:2]
#print f_real[:,1]
#print f_real[1,:]
#print m_real[-2]
f_real[-1]=m_real[-2]
#print f_real[-1]
#print m_real[0, 1]
f_real[1,2]=m_real[0,1]
#print f_real[1, 2]
#print m_real[0:2]
f_real[1:3]=m_real[0:2]
#print f_real[1:3]
#print m_real[0, 0:2]
f_real[2, 0:2]=m_real[0,0:2]
#print f_real[2, 0:2]
#print m_real[0:3, 2]
f_real[0:3,1]=m_real[0:3, 2]
#print f_real[0:3, 1]
#print m_real[0:3, 0:1]
f_real[0:3,1:2]=m_real[0:3,0:1]
#print f_real[0:3, 1:2]
f_real[:,0]=0
#print f_real.get_feature_matrix()
if tuple(map(int, numpy.__version__.split('.')[:2])) >= (1, 5):  # avoid lexicographic version comparison
f_real+=m_real
f_real*=m_real
f_real-=m_real
else:
print("numpy version >= 1.5 is needed")
return None
f_real+=f_real
f_real*=f_real
f_real-=f_real
#print f_real
#print f_real.get_feature_matrix()
try:
mem_real=memoryview(f_real)
except NameError:
print("Python2.7 and later is needed for memoryview class")
return
ret_real=array(f_real)
#print ret_real
return f_real[:,0]
if __name__=='__main__':
print('dense_protocols')
features_dense_protocols_modular(*parameter_list[0])
#!/usr/bin/env python
from modshogun import RealFeatures
from numpy import array, float64, all
# create dense matrix
matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
parameter_list = [[matrix]]
# ... of type Real
def features_dense_real_modular (A=matrix):
# ... of type Real
a=RealFeatures(A)
# print some statistics about a
#print(a.get_num_vectors())
#print(a.get_num_features())
# get first feature vector and set it
#print(a.get_feature_vector(0))
a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)
# get matrix
a_out = a.get_feature_matrix()
assert(all(a_out==A))
return a_out
if __name__=='__main__':
print('dense_real')
features_dense_real_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
from modshogun import RealFeatures
from numpy import array, float64, int64
# create dense matrix
data=[[1,2,3],[4,5,6],[7,8,9],[-1,-2,-3]]
parameter_list = [[data]]
def features_dense_zero_copy_modular (in_data=data):
feats = None
if tuple(map(int, numpy.__version__.split('.')[:2])) >= (1, 5):  # avoid lexicographic version comparison
feats=numpy.array(in_data, dtype=float64, order='F')
a=RealFeatures()
a.frombuffer(feats, False)
b=numpy.array(a, copy=False)
c=numpy.array(a, copy=True)
d=RealFeatures()
d.frombuffer(a, False)
e=RealFeatures()
e.frombuffer(a, True)
a[:,0]=0
#print a[0:4]
#print b[0:4]
#print c[0:4]
#print d[0:4]
#print e[0:4]
else:
print("numpy version >= 1.5 is needed")
return feats
if __name__=='__main__':
print('dense_zero_copy')
features_dense_zero_copy_modular(*parameter_list[0])
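# The zero-copy behaviour above mirrors plain numpy view semantics. The
# numpy-only sketch below shows the same effect: copy=False shares memory
# with the source array, while copy=True creates independent storage.
import numpy

m = numpy.ones((3, 4), dtype=numpy.float64)
view = numpy.array(m, copy=False)   # shares memory with m
dup = numpy.array(m, copy=True)     # independent storage
m[:, 0] = 0
assert view[0, 0] == 0.0            # the view reflects the change
assert dup[0, 0] == 1.0             # the copy does not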
#!/usr/bin/env python
import numpy
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,0.9,1e-3],[traindat,testdat,label_traindat,0.8,1e-2]]
def features_director_dot_modular (fm_train_real, fm_test_real,
label_train_twoclass, C, epsilon):
try:
from modshogun import DirectorDotFeatures
from modshogun import RealVector
except ImportError:
print("recompile shogun with --enable-swig-directors")
return
class NumpyFeatures(DirectorDotFeatures):
# variables
data=numpy.empty((1,1))
# constructor
def __init__(self, d):
DirectorDotFeatures.__init__(self)
self.data = d
# overloaded methods
def add_to_dense_sgvec(self, alpha, vec_idx1, vec2, abs):
if abs:
vec2+=alpha*numpy.abs(self.data[:,vec_idx1])
else:
vec2+=alpha*self.data[:,vec_idx1]
def dot(self, vec_idx1, df, vec_idx2):
return numpy.dot(self.data[:,vec_idx1], df.get_computed_dot_feature_vector(vec_idx2))
def dense_dot_sgvec(self, vec_idx1, vec2):
return numpy.dot(self.data[:,vec_idx1], vec2[0:vec2.vlen])
def get_num_vectors(self):
return self.data.shape[1]
def get_dim_feature_space(self):
return self.data.shape[0]
# operators
# def __add__(self, other):
# return NumpyFeatures(self.data+other.data)
# def __sub__(self, other):
# return NumpyFeatures(self.data-other.data)
# def __iadd__(self, other):
# return NumpyFeatures(self.data+other.data)
# def __isub__(self, other):
# return NumpyFeatures(self.data-other.data)
#from modshogun import RealFeatures, SparseRealFeatures, BinaryLabels
#from modshogun import LibLinear, L2R_L2LOSS_SVC_DUAL
#from modshogun import Math_init_random
#Math_init_random(17)
#feats_train=RealFeatures(fm_train_real)
#feats_test=RealFeatures(fm_test_real)
#labels=BinaryLabels(label_train_twoclass)
#dfeats_train=NumpyFeatures(fm_train_real)
#dfeats_test=NumpyFeatures(fm_test_real)
#dlabels=BinaryLabels(label_train_twoclass)
#print feats_train.get_computed_dot_feature_matrix()
#print dfeats_train.get_computed_dot_feature_matrix()
#svm=LibLinear(C, feats_train, labels)
#svm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)
#svm.set_epsilon(epsilon)
#svm.set_bias_enabled(True)
#svm.train()
#svm.set_features(feats_test)
#svm.apply().get_labels()
#predictions = svm.apply()
#dfeats_train.__disown__()
#dfeats_train.parallel.set_num_threads(1)
#dsvm=LibLinear(C, dfeats_train, dlabels)
#dsvm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)
#dsvm.set_epsilon(epsilon)
#dsvm.set_bias_enabled(True)
#dsvm.train()
#dfeats_test.__disown__()
#dfeats_test.parallel.set_num_threads(1)
#dsvm.set_features(dfeats_test)
#dsvm.apply().get_labels()
#dpredictions = dsvm.apply()
#return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('DirectorLinear')
features_director_dot_modular(*parameter_list[0])
#!/usr/bin/env python
strings=['hey','guys','i','am','a','string']
parameter_list=[[strings]]
def features_hasheddocdot_modular(strings):
from modshogun import StringCharFeatures, RAWBYTE
from modshogun import HashedDocDotFeatures
from modshogun import NGramTokenizer
from numpy import array
#create string features
f=StringCharFeatures(strings, RAWBYTE)
#set the number of bits of the target dimension
#means a dim of size 2^5=32
num_bits=5
#create the ngram tokenizer of size 8 to parse the strings
tokenizer=NGramTokenizer(8)
#normalize results
normalize=True
#create HashedDocDot features
hddf=HashedDocDotFeatures(num_bits, f, tokenizer, normalize)
#should expect 32
#print('Feature space dimensionality is', hddf.get_dim_feature_space())
#print('Self dot product of string 0', hddf.dot(0, hddf, 0))
return hddf
if __name__=='__main__':
print('HashedDocDotFeatures')
features_hasheddocdot_modular(*parameter_list[0])
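# The hashing trick behind HashedDocDotFeatures can be sketched in plain
# Python: every n-gram is hashed into one of 2^num_bits buckets, and the
# bucket counts form the implicit feature vector. This uses Python's built-in
# hash only to show the idea; it is not Shogun's internal hash function.
import numpy

def hashed_ngram_counts(s, n=8, num_bits=5):
    dim = 2 ** num_bits                       # target dimensionality (32 here)
    vec = numpy.zeros(dim)
    for i in range(max(len(s) - n + 1, 1)):
        vec[hash(s[i:i + n]) % dim] += 1.0    # hash the n-gram into a bucket
    return vec

# dot product of two documents in the hashed feature space:
# numpy.dot(hashed_ngram_counts('hey'), hashed_ngram_counts('guys'))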
# This example shows how to read and write plain ASCII files, binary files and
# HDF5 datasets.
#
# For ASCII files it shows how to obtain shogun's RealFeatures
# (a simple feature matrix of doubles with 1 column == 1 example, nr_columns ==
# number of examples) and also sparse features in SVM light format.
#
# Binary files use a custom native format, and datasets can be read from and
# written to HDF5 files under an arbitrary group/path.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data=lm.load_numbers('../data/fm_train_real.dat')
label=lm.load_numbers('../data/label_train_twoclass.dat')
parameter_list=[[data,label]]
def features_io_modular (fm_train_real, label_train_twoclass):
import numpy
from modshogun import SparseRealFeatures, RealFeatures, MulticlassLabels
from modshogun import GaussianKernel
from modshogun import LibSVMFile, CSVFile, BinaryFile, HDF5File
feats=SparseRealFeatures(fm_train_real)
feats2=SparseRealFeatures()
f=BinaryFile("tmp/fm_train_sparsereal.bin","w")
feats.save(f)
f=LibSVMFile("tmp/fm_train_sparsereal.ascii","w")
feats.save(f)
f=BinaryFile("tmp/fm_train_sparsereal.bin")
feats2.load(f)
f=LibSVMFile("tmp/fm_train_sparsereal.ascii")
feats2.load(f)
feats=RealFeatures(fm_train_real)
feats2=RealFeatures()
f=BinaryFile("tmp/fm_train_real.bin","w")
feats.save(f)
f=HDF5File("tmp/fm_train_real.h5","w", "/data/doubles")
feats.save(f)
f=CSVFile("tmp/fm_train_real.ascii","w")
feats.save(f)
f=BinaryFile("tmp/fm_train_real.bin")
feats2.load(f)
#print("diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))
f=CSVFile("tmp/fm_train_real.ascii")
feats2.load(f)
#print("diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))
lab=MulticlassLabels(numpy.array([0.0,1.0,2.0,3.0]))
lab2=MulticlassLabels()
f=CSVFile("tmp/label_train_twoclass.ascii","w")
lab.save(f)
f=BinaryFile("tmp/label_train_twoclass.bin","w")
lab.save(f)
f=HDF5File("tmp/label_train_real.h5","w", "/data/labels")
lab.save(f)
f=CSVFile("tmp/label_train_twoclass.ascii")
lab2.load(f)
f=BinaryFile("tmp/label_train_twoclass.bin")
lab2.load(f)
f=HDF5File("tmp/fm_train_real.h5","r", "/data/doubles")
feats2.load(f)
#print(feats2.get_feature_matrix())
f=HDF5File("tmp/label_train_real.h5","r", "/data/labels")
lab2.load(f)
#print(lab2.get_labels())
#clean up
import os
for f in ['tmp/fm_train_sparsereal.bin','tmp/fm_train_sparsereal.ascii',
'tmp/fm_train_real.bin','tmp/fm_train_real.h5','tmp/fm_train_real.ascii',
'tmp/label_train_real.h5', 'tmp/label_train_twoclass.ascii','tmp/label_train_twoclass.bin']:
os.unlink(f)
return feats, feats2, lab, lab2
if __name__=='__main__':
print('Features IO')
features_io_modular(*parameter_list[0])
# This example demonstrates how to read and write data in the SVMLight format
# with Shogun.
#
#!/usr/bin/env python
parameter_list=[['../data/train_sparsereal.light']]
def features_read_svmlight_format_modular (fname):
import os
from modshogun import SparseRealFeatures
from modshogun import LibSVMFile
f=SparseRealFeatures()
lab=f.load_with_labels(LibSVMFile(fname))
f.save_with_labels(LibSVMFile('tmp/testwrite.light', 'w'), lab)
os.unlink('tmp/testwrite.light')
if __name__=='__main__':
print('Reading SVMLIGHT format')
features_read_svmlight_format_modular(*parameter_list[0])
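# For reference, the SVMLight text format stores one example per line as
# '<label> <index>:<value> ...' with 1-based, increasing feature indices.
# A minimal sketch writing such a file by hand (file name is hypothetical):
def write_svmlight(fname, labels, sparse_rows):
    # sparse_rows: one list of (index, value) pairs per example
    with open(fname, 'w') as fh:
        for lab, row in zip(labels, sparse_rows):
            entries = ' '.join('%d:%g' % (i, v) for i, v in row)
            fh.write('%g %s\n' % (lab, entries))

# write_svmlight('tmp/tiny.light', [1, -1], [[(1, 0.5), (3, 2.0)], [(2, 1.5)]])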
# Creates features similar to the feature space of the SNP kernel. Useful when
# working with linear methods.
#!/usr/bin/env python
parameter_list=[['../data/snps.dat']]
def features_snp_modular (fname):
from modshogun import StringByteFeatures, SNPFeatures, SNP
sf=StringByteFeatures(SNP)
sf.load_ascii_file(fname, False, SNP, SNP)
#print(sf.get_features())
snps=SNPFeatures(sf)
#print(snps.get_feature_matrix())
#print(snps.get_minor_base_string())
#print(snps.get_major_base_string())
if __name__=='__main__':
print('SNP Features')
features_snp_modular(*parameter_list[0])
# This example demonstrates how to encode sparse (most entries zero),
# real-valued features in shogun using SparseRealFeatures.
#!/usr/bin/env python
import numpy
# create dense matrix A
A=numpy.array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=numpy.float64)
parameter_list=[[A]]
def features_sparse_modular (A):
from scipy.sparse import csc_matrix
from modshogun import SparseRealFeatures
from numpy import array, float64, all
# sparse representation X of dense matrix A
# note, will work with types other than float64 too,
# but requires recent scipy.sparse
X=csc_matrix(A)
#print(A)
# create sparse shogun features from dense matrix A
a=SparseRealFeatures(A)
a_out=a.get_full_feature_matrix()
#print(a_out)
assert(all(a_out==A))
#print(a_out)
# create sparse shogun features from sparse matrix X
a.set_sparse_feature_matrix(X)
a_out=a.get_full_feature_matrix()
#print(a_out)
assert(all(a_out==A))
# create sparse shogun features from sparse matrix X
a=SparseRealFeatures(X)
a_out=a.get_full_feature_matrix()
#print(a_out)
assert(all(a_out==A))
# obtain (data,indices,indptr) csc arrays of sparse shogun features
z=csc_matrix(a.get_sparse_feature_matrix())
z_out=z.todense()
#print(z_out)
assert(all(z_out==A))
if __name__=='__main__':
print('Sparse Features')
features_sparse_modular(*parameter_list[0])
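# The (data, indices, indptr) triplet of the CSC layout can be inspected
# directly in scipy: data holds the non-zero values column by column, indices
# the corresponding row numbers, and indptr[j]:indptr[j+1] delimits column j.
from scipy.sparse import csc_matrix
import numpy

B = numpy.array([[1, 0, 3], [0, 0, 0], [2, 0, 4]], dtype=numpy.float64)
X = csc_matrix(B)
# column 0 holds the values 1 and 2 in rows 0 and 2
assert list(X.data[X.indptr[0]:X.indptr[1]]) == [1.0, 2.0]
assert list(X.indices[X.indptr[0]:X.indptr[1]]) == [0, 2]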
# This example demonstrates how to use compressed strings with shogun.
# We currently support reading and writing compressed files using
# LZO, GZIP, BZIP2 and LZMA. Furthermore, we demonstrate how to decompress
# streams on-the-fly in order to work with data sets that would otherwise be
# too large to fit into memory.
#
#!/usr/bin/env python
parameter_list = [['features_string_char_compressed_modular.py']]
def features_string_char_compressed_modular (fname):
from modshogun import StringCharFeatures, StringFileCharFeatures, RAWBYTE
from modshogun import UNCOMPRESSED,SNAPPY,LZO,GZIP,BZIP2,LZMA, MSG_DEBUG
from modshogun import DecompressCharString
f=StringFileCharFeatures(fname, RAWBYTE)
#print("original strings", f.get_features())
#uncompressed
f.save_compressed("tmp/foo_uncompressed.str", UNCOMPRESSED, 1)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_uncompressed.str", True)
#print("uncompressed strings", f2.get_features())
#print
# load compressed data and uncompress on load
#snappy - not stable yet?!
#f.save_compressed("tmp/foo_snappy.str", SNAPPY, 9)
#f2=StringCharFeatures(RAWBYTE);
#f2.load_compressed("tmp/foo_snappy.str", True)
#print("snappy strings", f2.get_features())
#print
#lzo
f.save_compressed("tmp/foo_lzo.str", LZO, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_lzo.str", True)
#print("lzo strings", f2.get_features())
#print
##gzip
f.save_compressed("tmp/foo_gzip.str", GZIP, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_gzip.str", True)
#print("gzip strings", f2.get_features())
#print
#bzip2
f.save_compressed("tmp/foo_bzip2.str", BZIP2, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_bzip2.str", True)
#print("bzip2 strings", f2.get_features())
#print
#lzma
f.save_compressed("tmp/foo_lzma.str", LZMA, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_lzma.str", True)
#print("lzma strings", f2.get_features())
#print
# load compressed data and uncompress via preprocessor
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_lzo.str", False)
f2.add_preprocessor(DecompressCharString(LZO))
f2.apply_preprocessor()
#print("lzo strings", f2.get_features())
#print
# load compressed data and uncompress on-the-fly via preprocessor
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_lzo.str", False)
#f2.io.set_loglevel(MSG_DEBUG)
f2.add_preprocessor(DecompressCharString(LZO))
f2.enable_on_the_fly_preprocessing()
#print("lzo strings", f2.get_features())
#print
#clean up
import os
for f in ['tmp/foo_uncompressed.str', 'tmp/foo_snappy.str', 'tmp/foo_lzo.str',
'tmp/foo_gzip.str', 'tmp/foo_bzip2.str', 'tmp/foo_lzma.str']:
if os.path.exists(f):
os.unlink(f)
##########################################################################################
# some perfectly compressible stuff follows
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
if __name__=='__main__':
print('Compressing StringCharFileFeatures')
features_string_char_compressed_modular(*parameter_list[0])
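# The compress/decompress round-trip itself can be reproduced with Python's
# standard gzip module, independent of Shogun (the temp file name below is
# just for illustration):
import gzip, os

payload = b'ACGT' * 1000                     # highly compressible content
with gzip.open('tmp/roundtrip_demo.gz', 'wb') as fh:
    fh.write(payload)
with gzip.open('tmp/roundtrip_demo.gz', 'rb') as fh:
    restored = fh.read()
assert restored == payload
os.unlink('tmp/roundtrip_demo.gz')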
# This example demonstrates how to encode raw byte strings (an alphabet of 256 symbols) in shogun.
#!/usr/bin/env python
strings=['hey','guys','i','am','a','string']
parameter_list=[[strings]]
def features_string_char_modular (strings):
from modshogun import StringCharFeatures, RAWBYTE
from numpy import array
#create string features
f=StringCharFeatures(strings, RAWBYTE)
#and output several stats
#print("max string length", f.get_max_vector_length())
#print("number of strings", f.get_num_vectors())
#print("length of first string", f.get_vector_length(0))
#print("string[5]", ''.join(f.get_feature_vector(5)))
#print("strings", f.get_features())
#replace string 0
f.set_feature_vector(array(['t','e','s','t']), 0)
#print("strings", f.get_features())
return f.get_features(), f
if __name__=='__main__':
print('StringCharFeatures')
features_string_char_modular(*parameter_list[0])
# This example demonstrates how to load ASCII features from a file into shogun.
#!/usr/bin/env python
parameter_list = [['features_string_file_char_modular.py']]
def features_string_file_char_modular (fname):
from modshogun import StringFileCharFeatures, RAWBYTE
f = StringFileCharFeatures(fname, RAWBYTE)
#print("strings", f.get_features())
return f
if __name__=='__main__':
print('StringFileCharFeatures')
features_string_file_char_modular(*parameter_list[0])
# This example demonstrates how to load string features from files.
# We cover two cases: First, we show how to obtain StringCharFeatures
# from a directory of text files (particularly useful in computational biology)
# and second, we demonstrate how to load StringCharFeatures from one (multi-line) file.
#
#!/usr/bin/env python
parameter_list=[[".", "features_string_char_modular.py"]]
def features_string_file_modular (directory, fname):
from modshogun import StringCharFeatures, RAWBYTE
from modshogun import CSVFile
# load features from directory
f=StringCharFeatures(RAWBYTE)
f.load_from_directory(directory)
#and output several stats
#print("max string length", f.get_max_vector_length())
#print("number of strings", f.get_num_vectors())
#print("length of first string", f.get_vector_length(0))
#print("str[0,0:3]", f.get_feature(0,0), f.get_feature(0,1), f.get_feature(0,2))
#print("len(str[0])", f.get_vector_length(0))
#print("str[0]", f.get_feature_vector(0))
#or load features from file (one string per line)
fil=CSVFile(fname)
f.load(fil)
#print(f.get_features())
#or load fasta file
#f.load_fasta('fasta.fa')
#print(f.get_features())
return f.get_features(), f
if __name__=='__main__':
print('StringFileFeatures')
features_string_file_modular(*parameter_list[0])
# This creates a HashedWDFeatures object, i.e. an approximation to the Weighted
# Degree kernel feature space via hashes. These features can be particularly fast
# in linear SVM solvers.
#!/usr/bin/env python
from modshogun import LongIntFeatures
from numpy import array, int64, all
# create dense matrix A
matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
parameter_list = [[matrix,3,1,2],[matrix,3,1,2]]
# ... of type LongInt
def features_string_hashed_wd_modular (A=matrix,order=3,start_order=1,hash_bits=2):
a=LongIntFeatures(A)
from numpy import array, uint8
from modshogun import HashedWDFeatures, StringByteFeatures, RAWDNA
from modshogun import MSG_DEBUG
x=[array([0,1,2,3,0,1,2,3,3,2,2,1,1],dtype=uint8)]
from_order=order
f=StringByteFeatures(RAWDNA)
#f.io.set_loglevel(MSG_DEBUG)
f.set_features(x)
y=HashedWDFeatures(f,start_order,order,from_order,hash_bits)
fm=y.get_computed_dot_feature_matrix()
return fm
if __name__=='__main__':
print('string_hashed_wd')
features_string_hashed_wd_modular(*parameter_list[0])
# In this example, we demonstrate how to obtain string features
# by using a sliding window in a memory-efficient way. Instead of copying
# the string for each position of the sliding window, we only store a reference
# with respect to the complete string. This is particularly useful when working
# with genomic data, where storing all explicitly copied strings in memory
# quickly becomes infeasible. In addition to a sliding window (of a particular
# length) over all positions, we also support defining a custom position
# list.
#!/usr/bin/env python
# create string features with a single string
s=10*'A' + 10*'C' + 10*'G' + 10*'T'
parameter_list=[[s]]
def features_string_sliding_window_modular (strings):
from modshogun import StringCharFeatures, DNA
from modshogun import DynamicIntArray
f=StringCharFeatures([strings], DNA)
# slide a window of length 5 over features
# (memory efficient, does not copy strings)
f.obtain_by_sliding_window(5,1)
#print(f.get_num_vectors())
#print(f.get_vector_length(0))
#print(f.get_vector_length(1))
#print(f.get_features())
# slide a window of length 4 over features
# (memory efficient, does not copy strings)
f.obtain_by_sliding_window(4,1)
#print(f.get_num_vectors())
#print(f.get_vector_length(0))
#print(f.get_vector_length(1))
#print(f.get_features())
# extract string-windows at positions 0,6,16,25 of window size 4
# (memory efficient, does not copy strings)
f.set_features([s])
positions=DynamicIntArray()
positions.append_element(0)
positions.append_element(6)
positions.append_element(16)
positions.append_element(25)
f.obtain_by_position_list(4,positions)
#print(f.get_features())
# now extract windows of size 8 from the same position list
f.obtain_by_position_list(8,positions)
#print(f.get_features())
return f
if __name__=='__main__':
print('Sliding Window')
features_string_sliding_window_modular(*parameter_list[0])
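# The memory saving can be mimicked in plain Python by storing (start, length)
# references into the one master string instead of substring copies; a window
# is only materialised on access. A minimal sketch:
def window_refs(s, win_len, step=1):
    # references into s -- no substring copies are made here
    return [(start, win_len) for start in range(0, len(s) - win_len + 1, step)]

master = 10*'A' + 10*'C' + 10*'G' + 10*'T'
refs = window_refs(master, 5)
start, length = refs[12]
assert master[start:start + length] == 'CCCCC'   # materialised on demand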
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C},
# using 1 char = 1 byte per symbol would be wasteful, as each
# of the 4 symbols can be encoded using only 2 bits.
# Here, this is done in chunks of 64 bits (ulong).
#!/usr/bin/env python
parameter_list = [[0,2,0,False],[0,3,0,False]]
def features_string_ulong_modular (start=0,order=2,gap=0,rev=False):
from modshogun import StringCharFeatures, StringUlongFeatures, RAWBYTE
from numpy import array, uint64
#create string features
cf=StringCharFeatures(['hey','guys','string'], RAWBYTE)
uf=StringUlongFeatures(RAWBYTE)
uf.obtain_from_char(cf, start,order,gap,rev)
#replace string 0
uf.set_feature_vector(array([1,2,3,4,5], dtype=uint64), 0)
return uf.get_features(),uf.get_feature_vector(2), uf.get_num_vectors()
if __name__=='__main__':
print('simple_longint')
features_string_ulong_modular(*parameter_list[0])
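# The 2-bit packing described above is easy to sketch in plain Python: map
# A,C,G,T to the codes 0..3 and shift them into one unsigned integer, so up
# to 32 DNA symbols fit into a single 64-bit word. This only illustrates the
# packing idea, not Shogun's exact bit layout.
CODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def pack_dna(s):
    assert len(s) <= 32          # two bits per symbol, 64 bits per word
    word = 0
    for c in s:
        word = (word << 2) | CODE[c]
    return word

assert pack_dna('ACGT') == 0b00011011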
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C},
# using 1 char = 1 byte per symbol would be wasteful, as each
# of the 4 symbols can be encoded using only 2 bits.
# Here, this is done in chunks of 16 bits (word).
#!/usr/bin/env python
strings=['hey','guys','string']
parameter_list=[[strings,0,2,0,False]]
def features_string_word_modular (strings, start, order, gap, rev):
from modshogun import StringCharFeatures, StringWordFeatures, RAWBYTE
from numpy import array, uint16
#create string features
cf=StringCharFeatures(strings, RAWBYTE)
wf=StringWordFeatures(RAWBYTE)
wf.obtain_from_char(cf, start, order, gap, rev)
#and output several stats
#print("max string length", wf.get_max_vector_length())
#print("number of strings", wf.get_num_vectors())
#print("length of first string", wf.get_vector_length(0))
#print("string[2]", wf.get_feature_vector(2))
#print("strings", wf.get_features())
#replace string 0
wf.set_feature_vector(array([1,2,3,4,5], dtype=uint16), 0)
#print("strings", wf.get_features())
return wf.get_features(), wf
if __name__=='__main__':
print('StringWordFeatures')
features_string_word_modular(*parameter_list[0])
# In this example the ANOVA kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat,2,10], [traindat,testdat,5,10]]
def kernel_anova_modular (train_fname=traindat,test_fname=testdat,cardinality=2, size_cache=10):
from modshogun import ANOVAKernel,RealFeatures,CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=ANOVAKernel(feats_train, feats_train, cardinality, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train, km_test, kernel
if __name__=='__main__':
print('ANOVA')
kernel_anova_modular(*parameter_list[0])
# This example demonstrates the use of the AUC Kernel, which
# can be used to maximize AUC instead of margin in SVMs.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,label_traindat,1.7], [traindat,label_traindat,1.6]]
def kernel_auc_modular (train_fname=traindat,label_fname=label_traindat,width=1.7):
from modshogun import GaussianKernel, AUCKernel, RealFeatures
from modshogun import BinaryLabels, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
subkernel=GaussianKernel(feats_train, feats_train, width)
kernel=AUCKernel(0, subkernel)
kernel.setup_auc_maximization(BinaryLabels(CSVFile(label_fname)))
km_train=kernel.get_kernel_matrix()
return kernel
if __name__=='__main__':
print('AUC')
kernel_auc_modular(*parameter_list[0])
# In this example the Cauchy kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 10.0]]
def kernel_cauchy_modular (train_fname=traindat,test_fname=testdat, sigma=1.0):
from modshogun import RealFeatures, CauchyKernel, CSVFile, EuclideanDistance
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=CauchyKernel(feats_train, feats_train, sigma, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Cauchy')
kernel_cauchy_modular(*parameter_list[0])
# This is an example for the initialization of the chi2-kernel on real data, where
# each column of the matrices corresponds to one training/test example.
#!/usr/bin/env python
traindat = '../data/fm_train_hist.dat'
testdat = '../data/fm_test_hist.dat'
parameter_list = [[traindat,testdat,1.4,10], [traindat,testdat,1.5,10]]
def kernel_chi2_modular (train_fname=traindat,test_fname=testdat,width=1.4, size_cache=10):
from modshogun import RealFeatures, Chi2Kernel, CSVFile, NormOne
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Chi2')
kernel_chi2_modular(*parameter_list[0])
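# A single entry of the chi2 kernel can be cross-checked against the common
# definition k(x,y) = exp(-1/width * sum_i (x_i-y_i)^2 / (x_i+y_i)) -- a
# numpy sketch, assuming this definition and strictly positive histogram bins:
import numpy

def chi2_entry(x, y, width):
    d = numpy.sum((x - y) ** 2 / (x + y))
    return numpy.exp(-d / width)

# chi2_entry(numpy.array([0.3, 0.7]), numpy.array([0.5, 0.5]), 1.4)
# corresponds to one entry of the kernel matrices above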
# In this example the circular kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_circular_modular(train_fname=traindat,test_fname=testdat, sigma=1.0):
from modshogun import RealFeatures, CircularKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=CircularKernel(feats_train, feats_train, sigma, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Circular')
kernel_circular_modular(*parameter_list[0])
# In this example a combined kernel, composed of a custom kernel and a polynomial kernel, is computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list= [[traindat,testdat,label_traindat],[traindat,testdat,label_traindat]]
def kernel_combined_custom_poly_modular (train_fname = traindat,test_fname = testdat,train_label_fname=label_traindat):
from modshogun import CombinedFeatures, RealFeatures, BinaryLabels
from modshogun import CombinedKernel, PolyKernel, CustomKernel
from modshogun import LibSVM, CSVFile
kernel = CombinedKernel()
feats_train = CombinedFeatures()
tfeats = RealFeatures(CSVFile(train_fname))
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, tfeats)
K = tkernel.get_kernel_matrix()
kernel.append_kernel(CustomKernel(K))
subkfeats_train = RealFeatures(CSVFile(train_fname))
feats_train.append_feature_obj(subkfeats_train)
subkernel = PolyKernel(10,2)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
labels = BinaryLabels(CSVFile(train_label_fname))
svm = LibSVM(1.0, kernel, labels)
svm.train()
kernel = CombinedKernel()
feats_pred = CombinedFeatures()
pfeats = RealFeatures(CSVFile(test_fname))
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, pfeats)
K = tkernel.get_kernel_matrix()
kernel.append_kernel(CustomKernel(K))
subkfeats_test = RealFeatures(CSVFile(test_fname))
feats_pred.append_feature_obj(subkfeats_test)
subkernel = PolyKernel(10, 2)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_pred)
svm.set_kernel(kernel)
svm.apply()
km_train=kernel.get_kernel_matrix()
return km_train,kernel
if __name__=='__main__':
kernel_combined_custom_poly_modular(*parameter_list[0])
# This is an example for the initialization of a combined kernel, which is a weighted sum of,
# in this case, three kernels on real-valued data. The sub-kernel weights are all set to 1.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
traindat = double(lm.load_numbers('../data/fm_train_real.dat'))
testdat = double(lm.load_numbers('../data/fm_test_real.dat'))
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,traindna,testdna],[traindat,testdat,traindna,testdna]]
def kernel_combined_modular (fm_train_real=traindat,fm_test_real=testdat,fm_train_dna=traindna,fm_test_dna=testdna ):
from modshogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
from modshogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA
kernel=CombinedKernel()
feats_train=CombinedFeatures()
feats_test=CombinedFeatures()
subkfeats_train=RealFeatures(fm_train_real)
subkfeats_test=RealFeatures(fm_test_real)
subkernel=GaussianKernel(10, 1.1)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
degree=3
subkernel=FixedDegreeStringKernel(10, degree)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
subkernel=LocalAlignmentStringKernel(10)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Combined')
kernel_combined_modular(*parameter_list[0])
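# Conceptually, a combined kernel is just the (weighted) sum of its sub-kernel
# matrices; with all weights at 1, summing the individual matrices reproduces
# the combined result. A numpy sketch of the principle:
import numpy

def combine(kernel_matrices, weights=None):
    if weights is None:
        weights = numpy.ones(len(kernel_matrices))
    return sum(w * km for w, km in zip(weights, kernel_matrices))

# combine([km_gaussian, km_fixed_degree, km_local_alignment]) corresponds to
# the km_train returned above when all sub-kernel weights are 1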
# This is an example for the initialization of the CommUlongString-kernel. This kernel
# sums over k-mer matches (k='order'). For efficient computation a preprocessor is used
# that extracts and sorts all k-mers. If 'use_sign' is set, each k-mer is counted
# only once.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat =lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3,0,False ],[traindat,testdat,4,0,False]]
def kernel_comm_ulong_string_modular (fm_train_dna=traindat,fm_test_dna=testdat, order=3, gap=0, reverse = False):
from modshogun import CommUlongStringKernel
from modshogun import StringUlongFeatures, StringCharFeatures, DNA
from modshogun import SortUlongString
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringUlongFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortUlongString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringUlongFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
use_sign=False
kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('CommUlongString')
kernel_comm_ulong_string_modular(*parameter_list[0])
# This is an example for the initialization of the CommWordString-kernel (aka
# Spectrum or n-gram kernel; its name is derived from the unix command comm). This kernel
# sums over k-mer matches (k='order'). For efficient computation a preprocessor is used
# that extracts and sorts all k-mers. If 'use_sign' is set, each k-mer is counted
# only once.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,4,0,False, False],[traindat,testdat,4,0,False,False]]
def kernel_comm_word_string_modular (fm_train_dna=traindat, fm_test_dna=testdat, order=3, gap=0, reverse = False, use_sign = False):
from modshogun import CommWordStringKernel
from modshogun import StringWordFeatures, StringCharFeatures, DNA
from modshogun import SortWordString
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('CommWordString')
kernel_comm_word_string_modular(*parameter_list[0])
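# The spectrum kernel value for one pair of strings can be sketched directly:
# count all k-mers in each string and take the dot product of the two count
# vectors (or of their signs when use_sign is enabled). Plain Python:
from collections import Counter

def kmer_counts(s, k):
    return Counter(s[i:i + k] for i in range(len(s) - k + 1))

def spectrum_kernel(s1, s2, k, use_sign=False):
    c1, c2 = kmer_counts(s1, k), kmer_counts(s2, k)
    if use_sign:
        return float(sum(1 for m in c1 if m in c2))
    return float(sum(c1[m] * c2[m] for m in c1))

# spectrum_kernel('ACGTACGT', 'ACGTTT', 3) counts shared 3-mer occurrences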
# The constant kernel gives a trivial kernel matrix with all entries set to the same value
# defined by the argument 'c'.
#
#!/usr/bin/env python
parameter_list =[[23],[24]]
def kernel_const_modular (c=23):
from modshogun import DummyFeatures
from modshogun import ConstKernel
feats_train=DummyFeatures(10)
feats_test=DummyFeatures(17)
kernel=ConstKernel(feats_train, feats_train, c)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Const')
kernel_const_modular(*parameter_list[0])
# A user-defined custom kernel is assigned in this example. The kernel matrix may be given
# as the lower triangle only (set_triangle_kernel_matrix_from_triangle), as
# a full matrix (set_full_kernel_matrix_from_full), or as a full matrix which is then internally stored as a
# triangle (set_triangle_kernel_matrix_from_full). In addition, row and column subsets of the
# kernel matrix are selected via IndexFeatures.
#
#!/usr/bin/env python
from numpy.random import seed
seed(42)
parameter_list=[[7],[8]]
def kernel_custom_modular (dim=7):
from numpy.random import rand, seed
from numpy import array, float32, int32
from modshogun import RealFeatures
from modshogun import CustomKernel
from modshogun import IndexFeatures
seed(17)
data=rand(dim, dim)
feats=RealFeatures(data)
symdata=data+data.T
lowertriangle=array([symdata[(x,y)] for x in range(symdata.shape[1])
for y in range(symdata.shape[0]) if y<=x])
kernel=CustomKernel()
# once with float64's
kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
km_triangletriangle=kernel.get_kernel_matrix()
kernel.set_triangle_kernel_matrix_from_full(symdata)
km_fulltriangle=kernel.get_kernel_matrix()
kernel.set_full_kernel_matrix_from_full(symdata)
km_fullfull=kernel.get_kernel_matrix()
# get subset of kernel
row_idx=array(range(3),dtype=int32)
col_idx=array(range(2),dtype=int32)
row_idx_feat=IndexFeatures(row_idx)
col_idx_feat=IndexFeatures(col_idx)
kernel.init(row_idx_feat, col_idx_feat)
km_sub_kernel=kernel.get_kernel_matrix()
# print('Subkernel(3x2):\n%s'%km_sub_kernel)
kernel.remove_all_col_subsets()
kernel.remove_all_row_subsets()
# now once with float32's
data=array(data,dtype=float32)
kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
km_triangletriangle=kernel.get_kernel_matrix()
kernel.set_triangle_kernel_matrix_from_full(symdata)
km_fulltriangle=kernel.get_kernel_matrix()
kernel.set_full_kernel_matrix_from_full(symdata)
km_fullfull=kernel.get_kernel_matrix()
return km_fullfull,kernel,km_sub_kernel
if __name__=='__main__':
print('Custom')
kernel_custom_modular(*parameter_list[0])
# This is an example for the initialization of the diag-kernel.
# The diag kernel produces a kernel matrix in which all entries off
# the main diagonal are zero and the diagonal entries are set to 'diag'.
#!/usr/bin/env python
parameter_list =[[23],[24]]
def kernel_diag_modular (diag=23):
from modshogun import DummyFeatures
from modshogun import DiagKernel
feats_train=DummyFeatures(10)
feats_test=DummyFeatures(17)
kernel=DiagKernel(feats_train, feats_train, diag)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Diag')
kernel_diag_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
from modshogun import RealFeatures, MSG_DEBUG
traindat = numpy.random.random_sample((10,10))
testdat = numpy.random.random_sample((10,10))
parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.4]]
def kernel_director_linear_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.2):
try:
from modshogun import DirectorKernel
except ImportError:
print("recompile shogun with --enable-swig-directors")
return
class DirectorLinearKernel(DirectorKernel):
def __init__(self):
DirectorKernel.__init__(self, True)
def kernel_function(self, idx_a, idx_b):
seq1 = self.get_lhs().get_feature_vector(idx_a)
seq2 = self.get_rhs().get_feature_vector(idx_b)
return numpy.dot(seq1, seq2)
from modshogun import LinearKernel, AvgDiagKernelNormalizer
from modshogun import Time
feats_train=RealFeatures(fm_train_real)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_train.parallel.set_num_threads(1)
feats_test=RealFeatures(fm_test_real)
kernel=LinearKernel()
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
dkernel=DirectorLinearKernel()
dkernel.set_normalizer(AvgDiagKernelNormalizer(scale))
dkernel.init(feats_train, feats_train)
#print "km_train"
t=Time()
km_train=kernel.get_kernel_matrix()
#t1=t.cur_time_diff(True)
#print "dkm_train"
t=Time()
dkm_train=dkernel.get_kernel_matrix()
#t2=t.cur_time_diff(True)
#print "km_train", km_train
#print "dkm_train", dkm_train
return km_train, dkm_train
if __name__=='__main__':
print('DirectorLinear')
kernel_director_linear_modular(*parameter_list[0])
# With the distance kernel one can use any of the following distance metrics:
# BrayCurtisDistance()
# CanberraMetric()
# CanberraWordDistance()
# ChebyshewMetric()
# ChiSquareDistance()
# CosineDistance()
# Distance()
# EuclidianDistance()
# GeodesicMetric()
# HammingWordDistance()
# JensenMetric()
# ManhattanMetric()
# ManhattanWordDistance()
# MinkowskiMetric()
# RealDistance()
# SimpleDistance()
# SparseDistance()
# SparseEuclidianDistance()
# StringDistance()
# TanimotoDistance()
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat,1.7],[traindat,testdat,1.8]]
def kernel_distance_modular (train_fname=traindat,test_fname=testdat,width=1.7):
from modshogun import RealFeatures, DistanceKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance()
kernel=DistanceKernel(feats_train, feats_train, width, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Distance')
kernel_distance_modular(*parameter_list[0])
# In this example the distant segments kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,5,5],[traindat,testdat,6,6]]
def kernel_distantsegments_modular (fm_train_dna=traindat,fm_test_dna=testdat,delta=5, theta=5):
from modshogun import StringCharFeatures, DNA
from modshogun import DistantSegmentsKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=DistantSegmentsKernel(feats_train, feats_train, 10, delta, theta)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train, km_test, kernel
if __name__=='__main__':
print('DistantSegments')
kernel_distantsegments_modular(*parameter_list[0])
# In this example the exponential kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_exponential_modular (train_fname=traindat,test_fname=testdat, tau_coef=1.0):
from modshogun import RealFeatures, ExponentialKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance = EuclideanDistance(feats_train, feats_train)
kernel=ExponentialKernel(feats_train, feats_train, tau_coef, distance, 10)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Exponential')
kernel_exponential_modular(*parameter_list[0])
# The class FKFeatures implements Fisher kernel features obtained from
# two Hidden Markov models.
#
# It was used in
#
# K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new
# discriminative kernel from probabilistic models. Neural Computation,
# 14:2397-2414, 2002.
#
# which also has the details.
#
# Note that FK-features are computed on the fly, so to be effective feature
# caching should be enabled.
#
# It inherits its functionality from CSimpleFeatures, which should be
# consulted for further reference.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list = [[traindat,testdat,label_traindat,1,4,1e-1,1,0,False,[1,False,True]],[traindat,testdat,label_traindat,3,4,1e-1,1,0,False,[1,False,True]]]
fm_hmm_pos=[ traindat[i] for i in where([label_traindat==1])[1] ]
fm_hmm_neg=[ traindat[i] for i in where([label_traindat==-1])[1] ]
def kernel_fisher_modular (fm_train_dna=traindat, fm_test_dna=testdat,
label_train_dna=label_traindat,
N=1,M=4,pseudo=1e-1,order=1,gap=0,reverse=False,
kargs=[1,False,True]):
from modshogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
from modshogun import PolyKernel
from modshogun import HMM, BW_NORMAL#, MSG_DEBUG
# train HMM for positive class
charfeat=StringCharFeatures(fm_hmm_pos, DNA)
#charfeat.io.set_loglevel(MSG_DEBUG)
hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
pos=HMM(hmm_pos_train, N, M, pseudo)
pos.baum_welch_viterbi_train(BW_NORMAL)
# train HMM for negative class
charfeat=StringCharFeatures(fm_hmm_neg, DNA)
hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
neg=HMM(hmm_neg_train, N, M, pseudo)
neg.baum_welch_viterbi_train(BW_NORMAL)
# Kernel training data
charfeat=StringCharFeatures(fm_train_dna, DNA)
wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
# Kernel testing data
charfeat=StringCharFeatures(fm_test_dna, DNA)
wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
# get kernel on training data
pos.set_observations(wordfeats_train)
neg.set_observations(wordfeats_train)
feats_train=FKFeatures(10, pos, neg)
feats_train.set_opt_a(-1) #estimate prior
kernel=PolyKernel(feats_train, feats_train, *kargs)
km_train=kernel.get_kernel_matrix()
# get kernel on testing data
pos_clone=HMM(pos)
neg_clone=HMM(neg)
pos_clone.set_observations(wordfeats_test)
neg_clone.set_observations(wordfeats_test)
feats_test=FKFeatures(10, pos_clone, neg_clone)
feats_test.set_a(feats_train.get_a()) #use prior from training data
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print("Fisher Kernel")
kernel_fisher_modular(*parameter_list[0])
# The FixedDegree String kernel takes as input two strings of the same size and counts the number of matches of length d.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat, testdat,3],[traindat,testdat,4]]
def kernel_fixed_degree_string_modular (fm_train_dna=traindat, fm_test_dna=testdat,degree=3):
from modshogun import StringCharFeatures, DNA
from modshogun import FixedDegreeStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=FixedDegreeStringKernel(feats_train, feats_train, degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('FixedDegreeString')
kernel_fixed_degree_string_modular(*parameter_list[0])
# The well-known Gaussian kernel (the swiss army knife for SVMs) on dense real-valued features.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.3],[traindat,testdat, 1.4]]
def kernel_gaussian_modular (train_fname=traindat,test_fname=testdat, width=1.3):
from modshogun import RealFeatures, GaussianKernel, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Gaussian')
kernel_gaussian_modular(*parameter_list[0])
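# The result can be cross-checked with a direct numpy computation. Note that
# Shogun's GaussianKernel computes k(x,y) = exp(-||x-y||^2 / width), i.e. the
# 'width' parameter is the full denominator (not 2*sigma^2) -- worth keeping
# in mind when comparing against other libraries.
import numpy

def gaussian_km(X, Y, width):
    # X, Y: dim x n matrices with one example per column
    sq = (numpy.sum(X**2, 0)[:, None] + numpy.sum(Y**2, 0)[None, :]
          - 2.0 * numpy.dot(X.T, Y))
    return numpy.exp(-sq / width)

# gaussian_km(train, train, 1.3) should match the km_train computed above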
# An experimental kernel inspired by the WeightedDegreePositionStringKernel and the Gaussian kernel.
# The idea is to shift the dimensions of the input vectors against each other. 'shift_step' is the step
# size of the shifts and 'max_shift' is the maximal shift.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat,1.8,2,1],[traindat,testdat,1.9,2,1]]
def kernel_gaussian_shift_modular (train_fname=traindat,test_fname=testdat,width=1.8,max_shift=2,shift_step=1):
from modshogun import RealFeatures, GaussianShiftKernel, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=GaussianShiftKernel(feats_train, feats_train, width, max_shift, shift_step)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('GaussianShift')
kernel_gaussian_shift_modular(*parameter_list[0])
# The HistogramWordString kernel computes the TOP kernel on inhomogeneous Markov chains.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list=[[traindat,testdat,label_traindat,1,1e1, 1e0],[traindat,testdat,label_traindat,1,1e4,1e4]]
def kernel_histogram_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,order=3,ppseudo_count=1,npseudo_count=1):
from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
from modshogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
from modshogun import PluginEstimate#, MSG_DEBUG
charfeat=StringCharFeatures(DNA)
#charfeat.io.set_loglevel(MSG_DEBUG)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, 0, False)
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, 0, False)
pie=PluginEstimate(ppseudo_count,npseudo_count)
labels=BinaryLabels(label_train_dna)
pie.set_labels(labels)
pie.set_features(feats_train)
pie.train()
kernel=HistogramWordStringKernel(feats_train, feats_train, pie)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
pie.set_features(feats_test)
pie.apply().get_labels()
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('PluginEstimate w/ HistogramWord')
kernel_histogram_word_string_modular(*parameter_list[0])
# In this example the inverse multiquadic kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_inversemultiquadric_modular (train_fname=traindat,test_fname=testdat, shift_coef=1.0):
from modshogun import RealFeatures, InverseMultiQuadricKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=InverseMultiQuadricKernel(feats_train, feats_train, shift_coef, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('InverseMultiquadric')
kernel_inversemultiquadric_modular(*parameter_list[0])
# This example shows how to save a computed kernel matrix to a file.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat,1.9],[traindat,testdat,1.7]]
def kernel_io_modular (train_fname=traindat,test_fname=testdat,width=1.9):
from modshogun import RealFeatures, GaussianKernel, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
f=CSVFile("tmp/gaussian_train.csv","w")
kernel.save(f)
del f
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
f=CSVFile("tmp/gaussian_test.csv","w")
kernel.save(f)
del f
#clean up
import os
os.unlink("tmp/gaussian_test.csv")
os.unlink("tmp/gaussian_train.csv")
return km_train, km_test, kernel
if __name__=='__main__':
print('Gaussian')
kernel_io_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on raw byte
# data.
#!/usr/bin/env python
traindat = '../data/fm_train_byte.dat'
testdat = '../data/fm_test_byte.dat'
parameter_list=[[traindat,testdat],[traindat,testdat]]
def kernel_linear_byte_modular (train_fname=traindat,test_fname=testdat):
from modshogun import LinearKernel, ByteFeatures, CSVFile
feats_train=ByteFeatures(CSVFile(train_fname))
feats_test=ByteFeatures(CSVFile(test_fname))
kernel=LinearKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return kernel
if __name__=='__main__':
print('LinearByte')
kernel_linear_byte_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on real valued
# data using scaling factor 1.2.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.4]]
def kernel_linear_modular (train_fname=traindat,test_fname=testdat,scale=1.2):
from modshogun import RealFeatures, LinearKernel, AvgDiagKernelNormalizer, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=LinearKernel()
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Linear')
kernel_linear_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on string data. The
# strings are all of the same length and consist of the characters 'ACGT' corresponding
# to the DNA-alphabet. Each column of the matrices of type char corresponds to
# one training/test example.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]
def kernel_linear_string_modular (fm_train_dna=traindat,fm_test_dna=testdat):
from modshogun import StringCharFeatures, DNA
from modshogun import LinearStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=LinearStringKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('LinearString')
kernel_linear_string_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on word (2byte)
# data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import ushort
lm=LoadMatrix()
traindat = ushort(lm.load_numbers('../data/fm_train_word.dat'))
testdat = ushort(lm.load_numbers('../data/fm_test_word.dat'))
parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.2]]
def kernel_linear_word_modular (fm_train_word=traindat,fm_test_word=testdat,scale=1.2):
from modshogun import LinearKernel, AvgDiagKernelNormalizer
from modshogun import WordFeatures
feats_train=WordFeatures(fm_train_word)
feats_test=WordFeatures(fm_test_word)
kernel=LinearKernel(feats_train, feats_train)
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return kernel
if __name__=='__main__':
print('LinearWord')
kernel_linear_word_modular(*parameter_list[0])
# This is an example for the initialization of the local alignment kernel on
# DNA sequences, where each column of the matrices of type char corresponds to
# one training/test example.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]
def kernel_local_alignment_string_modular (fm_train_dna=traindat,fm_test_dna=testdat):
from modshogun import StringCharFeatures, DNA
from modshogun import LocalAlignmentStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=LocalAlignmentStringKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('LocalAlignmentString')
kernel_local_alignment_string_modular(*parameter_list[0])
# The LocalityImprovedString kernel is inspired by the polynomial kernel.
# By comparing neighboring characters it puts emphasis on local features.
#
# It can be defined as
# K({\bf x},{\bf x'})=\left(\sum_{i=0}^{T-1}\left(\sum_{j=-l}^{+l}w_jI_{i+j}({\bf x},{\bf x'})\right)^{d_1}\right)^{d_2},
# where
# I_i({\bf x},{\bf x'})=1 if x_i=x'_i and 0 otherwise.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat,testdat,5,5,7],[traindat,testdat,5,5,7]]
def kernel_locality_improved_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,length=5,inner_degree=5,outer_degree=7):
from modshogun import StringCharFeatures, DNA
from modshogun import LocalityImprovedStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=LocalityImprovedStringKernel(
feats_train, feats_train, length, inner_degree, outer_degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('LocalityImprovedString')
kernel_locality_improved_string_modular(*parameter_list[0])
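# A minimal pure-Python sketch of the formula above for two equal-length
# strings, assuming uniform window weights w_j=1 (the Shogun implementation
# uses its own weighting, so values will differ; this only illustrates the
# structure of the double sum):
def toy_locality_improved(x, xp, l=1, d1=2, d2=1):
    I=[1.0 if a==b else 0.0 for a, b in zip(x, xp)]  # match indicators I_i
    T=len(I)
    total=0.0
    for i in range(T):
        window=sum(I[max(0, i-l):min(T, i+l+1)])  # inner sum over j=-l..+l
        total+=window**d1
    return total**d2
print(toy_locality_improved('ACGTACGT', 'ACGTTGCA'))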
# In this example the log kernel (logarithm of the distance powered by degree plus one) is being computed for toy data.
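# Concretely, the kernel entry computed here should be
# k({\bf x},{\bf x'}) = -\log(\Vert{\bf x}-{\bf x'}\Vert^{degree}+1);
# treat the leading minus sign as an assumption about the implementation.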
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 2.0],[traindat,testdat, 3.0]]
def kernel_log_modular (train_fname=traindat,test_fname=testdat, degree=2.0):
from modshogun import RealFeatures, LogKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=LogKernel(feats_train, feats_train, degree, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Log')
kernel_log_modular(*parameter_list[0])
# In this example the match word string kernel is being computed for toy data
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat, 3,1.4,10,3,0,False],[
traindat,testdat, 3,1.4,10,3,0,False]]
def kernel_match_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,
degree=3,scale=1.4,size_cache=10,order=3,gap=0,reverse=False):
from modshogun import MatchWordStringKernel, AvgDiagKernelNormalizer
from modshogun import StringWordFeatures, StringCharFeatures, DNA
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(DNA)
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(DNA)
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
kernel=MatchWordStringKernel(size_cache, degree)
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('MatchWordString')
kernel_match_word_string_modular(*parameter_list[0])
# In this example the multiquadric kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_multiquadric_modular (train_fname=traindat,test_fname=testdat, shift_coef=1.0):
from modshogun import RealFeatures, MultiquadricKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=MultiquadricKernel(feats_train, feats_train, shift_coef, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Multiquadric')
kernel_multiquadric_modular(*parameter_list[0])
# This is an example initializing the oligo string kernel, which takes distances
# between matching oligos (k-mers) into account via a Gaussian. Variable 'k' defines the length
# of the oligo and variable 'w' the width of the Gaussian. The oligo string kernel is
# implemented for the DNA-alphabet 'ACGT'.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3,1.2,10],[traindat,testdat,4,1.3,10]]
def kernel_oligo_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,k=3,width=1.2,size_cache=10):
from modshogun import StringCharFeatures, DNA
from modshogun import OligoStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=OligoStringKernel(size_cache, k, width)
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('OligoString')
kernel_oligo_string_modular(*parameter_list[0])
# In this example the poly match string kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3,False],[traindat,testdat,4,False]]
def kernel_poly_match_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=3,inhomogene=False):
from modshogun import PolyMatchStringKernel
from modshogun import StringCharFeatures, DNA
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=PolyMatchStringKernel(feats_train, feats_train, degree, inhomogene)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('PolyMatchString')
kernel_poly_match_string_modular(*parameter_list[0])
# This is an example for the initialization of the PolyMatchString kernel on string data.
# The PolyMatchString kernel sums over the matches of two strings of the same length and
# takes the sum to the power of 'degree'. The strings consist of the characters 'ACGT' corresponding
# to the DNA-alphabet. Each column of the matrices of type char corresponds to
# one training/test example.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,2,True,3,0,False],[traindat,testdat,2,True,3,0,False]]
def kernel_poly_match_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,
degree=2,inhomogene=True,order=3,gap=0,reverse=False):
from modshogun import PolyMatchWordStringKernel
from modshogun import StringWordFeatures, StringCharFeatures, DNA
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(DNA)
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(DNA)
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
kernel=PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('PolyMatchWordString')
kernel_poly_match_word_string_modular(*parameter_list[0])
# This example initializes the polynomial kernel with real data.
# If variable 'inhomogene' is 'True', +1 is added to the scalar product
# before taking it to the power of 'degree'. If 'use_normalization' is
# set to 'True', the kernel matrix will be normalized by the square roots
# of the diagonal entries.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat,4,False,True],[traindat,testdat,5,False,True]]
def kernel_poly_modular (train_fname=traindat,test_fname=testdat,degree=4,inhomogene=False,
use_normalization=True):
from modshogun import RealFeatures, PolyKernel, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=PolyKernel(
feats_train, feats_train, degree, inhomogene, use_normalization)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Poly')
kernel_poly_modular (*parameter_list[0])
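# A quick property check (sketch, assumes numpy): with use_normalization=True
# every training example satisfies K(x,x)/sqrt(K(x,x)*K(x,x))=1, so the
# diagonal of the train kernel matrix should be all ones.
import numpy
km_n,_,_=kernel_poly_modular(*parameter_list[0])
print(numpy.allclose(numpy.diag(km_n), 1.0))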
# In this example the power kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 2.0],[traindat,testdat, 3.0]]
def kernel_power_modular (train_fname=traindat,test_fname=testdat, degree=2.0):
from modshogun import RealFeatures, PowerKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=PowerKernel(feats_train, feats_train, degree, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Power')
kernel_power_modular(*parameter_list[0])
# In this example the rational quadratic kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_rationalquadratic_modular (train_fname=traindat,test_fname=testdat, shift_coef=1.0):
from modshogun import RealFeatures, RationalQuadraticKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=RationalQuadraticKernel(feats_train, feats_train, shift_coef, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('RationalQuadratic')
kernel_rationalquadratic_modular(*parameter_list[0])
# The SalzbergWordString kernel implements the Salzberg kernel.
#
# It is described in
#
# Engineering Support Vector Machine Kernels That Recognize Translation Initiation Sites
# A. Zien, G. Raetsch, S. Mika, B. Schoelkopf, T. Lengauer, K.-R. Mueller
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list = [[traindat,testdat,label_traindat,3,0,False],[traindat,testdat,label_traindat,3,0,False]]
def kernel_salzberg_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,
order=3,gap=0,reverse=False):
from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
from modshogun import SalzbergWordStringKernel
from modshogun import PluginEstimate
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
pie=PluginEstimate()
labels=BinaryLabels(label_train_dna)
pie.set_labels(labels)
pie.set_features(feats_train)
pie.train()
kernel=SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
pie.set_features(feats_test)
pie.apply().get_labels()
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('PluginEstimate w/ SalzbergWord')
kernel_salzberg_word_string_modular(*parameter_list[0])
# The standard Sigmoid kernel computed on dense real valued features.
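# It computes the standard form k({\bf x},{\bf x'}) = \tanh(\gamma\,{\bf x}\cdot{\bf x'} + coef0).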
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat,10,1.2,1.3],[traindat,testdat,10,1.2,1.3]]
def kernel_sigmoid_modular (train_fname=traindat,test_fname=testdat,size_cache=10,gamma=1.2,coef0=1.3):
from modshogun import RealFeatures, SigmoidKernel, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=SigmoidKernel(feats_train, feats_train, size_cache, gamma, coef0)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Sigmoid')
kernel_sigmoid_modular(*parameter_list[0])
# The SimpleLocalityImprovedString kernel is a 'simplified' and better performing version of the LocalityImprovedString kernel.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,5,5,1],[traindat,testdat,5,3,2]]
def kernel_simple_locality_improved_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,
length=5,inner_degree=5,outer_degree=1 ):
from modshogun import StringCharFeatures, DNA
from modshogun import SimpleLocalityImprovedStringKernel, MSG_DEBUG
feats_train=StringCharFeatures(fm_train_dna, DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=SimpleLocalityImprovedStringKernel(
feats_train, feats_train, length, inner_degree, outer_degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('SimpleLocalityImprovedString')
kernel_simple_locality_improved_string_modular(*parameter_list[0])
# This example demonstrates how to use the Gaussian Kernel with sparse features.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.1],[traindat,testdat,1.2]]
def kernel_sparse_gaussian_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.1 ):
from modshogun import SparseRealFeatures
from modshogun import GaussianKernel
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('SparseGaussian')
kernel_sparse_gaussian_modular (*parameter_list[0])
# This example demonstrates how to use the Linear Kernel with sparse features.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.1],[traindat,testdat,1.2]]
def kernel_sparse_linear_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.1):
from modshogun import SparseRealFeatures
from modshogun import LinearKernel, AvgDiagKernelNormalizer
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
kernel=LinearKernel()
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('SparseLinear')
kernel_sparse_linear_modular(*parameter_list[0])
# This example shows how to use the polynomial kernel with sparse features.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,10,3,True],[traindat,testdat,10,4,True]]
def kernel_sparse_poly_modular (fm_train_real=traindat,fm_test_real=testdat,
size_cache=10,degree=3,inhomogene=True ):
from modshogun import SparseRealFeatures
from modshogun import PolyKernel
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
kernel=PolyKernel(feats_train, feats_train, size_cache,
inhomogene, degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('SparsePoly')
kernel_sparse_poly_modular(*parameter_list[0])
# In this example the spherical kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_spherical_modular (fm_train_real=traindat,fm_test_real=testdat, sigma=1.0):
from modshogun import RealFeatures
from modshogun import SphericalKernel
from modshogun import EuclideanDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclideanDistance(feats_train, feats_train)
kernel=SphericalKernel(feats_train, feats_train, sigma, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Spherical')
kernel_spherical_modular(*parameter_list[0])
# In this example the spline kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]
def kernel_spline_modular (fm_train_real=traindat,fm_test_real=testdat):
from modshogun import RealFeatures
from modshogun import SplineKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=SplineKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Spline')
kernel_spline_modular(*parameter_list[0])
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2014 Soumyajit De
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,2,0.75],[traindat,testdat,3,0.75]]
def kernel_ssk_string_modular (fm_train_dna=traindat, fm_test_dna=testdat, maxlen=1, decay=1):
from modshogun import SubsequenceStringKernel
from modshogun import StringCharFeatures, DNA
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=SubsequenceStringKernel(feats_train, feats_train, maxlen, decay)
km_train=kernel.get_kernel_matrix()
# print(km_train)
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
# print(km_test)
return km_train,km_test,kernel
if __name__=='__main__':
print('SubsequenceStringKernel DNA')
kernel_ssk_string_modular(*parameter_list[0])
kernel_ssk_string_modular(*parameter_list[1])
# The class TOPFeatures implements TOP kernel features obtained from
# two Hidden Markov models.
#
# It was used in
#
# K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new
# discriminative kernel from probabilistic models. Neural Computation,
# 14:2397-2414, 2002.
#
# which also has the details.
#
# Note that TOP-features are computed on the fly, so feature caching
# should be enabled for them to be used efficiently.
#
# It inherits its functionality from CSimpleFeatures, which should be
# consulted for further reference.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
fm_hmm_pos=[traindat[i] for i in where([label_traindat==1])[1] ]
fm_hmm_neg=[traindat[i] for i in where([label_traindat==-1])[1] ]
parameter_list = [[traindat,testdat,label_traindat,1e-1,1,0,False,[1, False, True]], \
[traindat,testdat,label_traindat,1e-1,1,0,False,[1, False, True] ]]
def kernel_top_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,pseudo=1e-1,
order=1,gap=0,reverse=False,kargs=[1, False, True]):
from modshogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
from modshogun import PolyKernel
from modshogun import HMM, BW_NORMAL
N=1 # toy HMM with 1 state
M=4 # 4 observations -> DNA
# train HMM for positive class
charfeat=StringCharFeatures(fm_hmm_pos, DNA)
hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
pos=HMM(hmm_pos_train, N, M, pseudo)
pos.baum_welch_viterbi_train(BW_NORMAL)
# train HMM for negative class
charfeat=StringCharFeatures(fm_hmm_neg, DNA)
hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
neg=HMM(hmm_neg_train, N, M, pseudo)
neg.baum_welch_viterbi_train(BW_NORMAL)
# Kernel training data
charfeat=StringCharFeatures(fm_train_dna, DNA)
wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
# Kernel testing data
charfeat=StringCharFeatures(fm_test_dna, DNA)
wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
# get kernel on training data
pos.set_observations(wordfeats_train)
neg.set_observations(wordfeats_train)
feats_train=TOPFeatures(10, pos, neg, False, False)
kernel=PolyKernel(feats_train, feats_train, *kargs)
km_train=kernel.get_kernel_matrix()
# get kernel on testing data
pos_clone=HMM(pos)
neg_clone=HMM(neg)
pos_clone.set_observations(wordfeats_test)
neg_clone.set_observations(wordfeats_test)
feats_test=TOPFeatures(10, pos_clone, neg_clone, False, False)
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print("TOP Kernel")
kernel_top_modular(*parameter_list[0])
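# As a sketch of the construction from the paper above: for a sequence x and the
# two trained HMMs 'pos' and 'neg', the TOP feature vector is assumed to consist
# of the posterior log-odds log p(x|pos) - log p(x|neg) followed by the partial
# derivatives of that log-odds w.r.t. all parameters of both models; the kernel
# is then simply the chosen kernel (here PolyKernel) between these vectors.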
# In this example the t-Student's kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat, 2.0],[traindat,testdat, 3.0]]
def kernel_tstudent_modular (fm_train_real=traindat,fm_test_real=testdat, degree=2.0):
from modshogun import RealFeatures
from modshogun import TStudentKernel
from modshogun import EuclideanDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclideanDistance(feats_train, feats_train)
kernel=TStudentKernel(feats_train, feats_train, degree, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('TStudent')
kernel_tstudent_modular(*parameter_list[0])
# In this example the wave kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 10.0]]
def kernel_wave_modular (fm_train_real=traindat,fm_test_real=testdat, theta=1.0):
from modshogun import RealFeatures
from modshogun import WaveKernel
from modshogun import EuclideanDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclideanDistance(feats_train, feats_train)
kernel=WaveKernel(feats_train, feats_train, theta, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Wave')
kernel_wave_modular(*parameter_list[0])
# In this example the wavelet kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat, 1.5, 1.0],[traindat,testdat, 1.0, 1.5]]
def kernel_wavelet_modular (fm_train_real=traindat,fm_test_real=testdat, dilation=1.5, translation=1.0):
from modshogun import RealFeatures
from modshogun import WaveletKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=WaveletKernel(feats_train, feats_train, 10, dilation, translation)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Wavelet')
kernel_wavelet_modular(*parameter_list[0])
# The WeightedCommWordString kernel may be used to compute the weighted
# spectrum kernel (i.e. a spectrum kernel for 1 to K-mers, where each k-mer
# length is weighted by some coefficient \f$\beta_k\f$) from strings that have
# been mapped into unsigned 16bit integers.
#
# These 16bit integers correspond to k-mers. To be usable with this kernel they
# need to be sorted (e.g. via the SortWordString pre-processor).
#
# It basically uses the algorithm in the unix "comm" command (hence the name)
# to compute:
#
# k({\bf x},{\bf x'})= \sum_{k=1}^K\beta_k\Phi_k({\bf x})\cdot \Phi_k({\bf x'})
#
# where \f$\Phi_k\f$ maps a sequence \f${\bf x}\f$ that consists of letters in
# \f$\Sigma\f$ to a feature vector of size \f$|\Sigma|^k\f$. In this feature
# vector each entry denotes how often the k-mer appears in that \f${\bf x}\f$.
#
# Note that this representation is especially tuned to small alphabets
# (like the 2-bit alphabet DNA), for which it enables spectrum kernels
# of order 8.
#
# For this kernel the linadd speedups are quite efficiently implemented using
# direct maps.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def kernel_weighted_comm_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,order=3,gap=0,reverse=True ):
from modshogun import WeightedCommWordStringKernel
from modshogun import StringWordFeatures, StringCharFeatures, DNA
from modshogun import SortWordString
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
use_sign=False
kernel=WeightedCommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('WeightedCommWordString')
kernel_weighted_comm_word_string_modular(*parameter_list[0])
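# A tiny pure-Python sketch of a single-k spectrum entry; the weighted spectrum
# kernel above then corresponds to sum_k beta_k * spectrum_entry(x, x', k).
# The uniform beta_k used below is a hypothetical choice for illustration only.
from collections import Counter
def spectrum_entry(x, y, k):
    cx=Counter(x[i:i+k] for i in range(len(x)-k+1))
    cy=Counter(y[i:i+k] for i in range(len(y)-k+1))
    return sum(cx[m]*cy[m] for m in cx)  # dot product of k-mer count vectors
K=3
print(sum((1.0/K)*spectrum_entry('ACGTACGT', 'ACGTTGCA', k) for k in range(1, K+1)))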
# The Weighted Degree Position String kernel (Weighted Degree kernel with shifts).
#
# The WD-shift kernel of order d compares two sequences X and
# Y of length L by summing all contributions of k-mer matches of
# lengths k in 1...d, weighted by coefficients beta_k, and
# allowing for a positional tolerance of up to shift s.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,20],[traindat,testdat,22]]
def kernel_weighted_degree_position_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=20):
from modshogun import StringCharFeatures, DNA
from modshogun import WeightedDegreePositionStringKernel, MSG_DEBUG
feats_train=StringCharFeatures(fm_train_dna, DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=WeightedDegreePositionStringKernel(feats_train, feats_train, degree)
from numpy import zeros,ones,float64,int32
kernel.set_shifts(10*ones(len(fm_train_dna[0]), dtype=int32))
kernel.set_position_weights(ones(len(fm_train_dna[0]), dtype=float64))
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('WeightedDegreePositionString')
kernel_weighted_degree_position_string_modular(*parameter_list[0])
# This example shows how to create a Weighted Degree String Kernel from data
# and how to compute the kernel matrix from the resulting object.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3],[traindat,testdat,20]]
def kernel_weighted_degree_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=20):
from modshogun import StringCharFeatures, DNA
from modshogun import WeightedDegreeStringKernel, MSG_DEBUG
feats_train=StringCharFeatures(fm_train_dna, DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
from numpy import arange,double
weights=arange(1,degree+1,dtype=double)[::-1]/ \
sum(arange(1,degree+1,dtype=double))
kernel.set_wd_weights(weights)
#from numpy import ones,float64,int32
#kernel.set_position_weights(ones(len(fm_train_dna[0]), dtype=float64))
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
#this is how to serialize the kernel
#import pickle
#pickle.dump(kernel, file('tmp/kernel_obj.dump','w'), protocol=2)
#k=pickle.load(file('tmp/kernel_obj.dump','r'))
return km_train, km_test, kernel
if __name__=='__main__':
print('WeightedDegreeString')
kernel_weighted_degree_string_modular(*parameter_list[0])
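# The weights above form a decreasing, normalized sequence: for degree=3 they
# are [3,2,1]/6 = [0.5, 0.3333..., 0.1666...], so shorter k-mer matches receive
# the largest weight. A quick check:
from numpy import arange, double
d=3
print(arange(1, d+1, dtype=double)[::-1]/sum(arange(1, d+1, dtype=double)))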
#!/usr/bin/env python
parameter_list=[[]]
def labels_io_modular():
from modshogun import RegressionLabels, CSVFile
lab=RegressionLabels()
f=CSVFile("../data/label_train_regression.dat","r")
f.set_delimiter(" ")
lab.load(f)
#print lab.get_labels()
return lab
if __name__=='__main__':
print('Labels IO')
labels_io_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import *
from modshogun import *
x=array([[20.0,15,15],[10,20,20]])
y=array([[21.0,21,18],[19,19,22]])
z=array([[15.0,27,18],[32,5,23]])
parameter_list = [[x,concatenate((x,y,z),1)]]
def library_fisher2x3_modular (table, tables):
pval=Statistics_fishers_exact_test_for_2x3_table(table)
pvals=Statistics_fishers_exact_test_for_multiple_2x3_tables(tables)
return (pval,pvals)
if __name__=='__main__':
print('Fisher 2x3')
library_fisher2x3_modular(*parameter_list[0])
#!/usr/bin/env python
import time
from modshogun import Time
parameter_list = [[5],[1.0]]
def library_time (sleep_secs):
# measure wall clock time difference
t=Time()
time.sleep(sleep_secs)
diff=t.cur_time_diff()
# measure CPU time required
cpu_diff=t.cur_runtime_diff_sec()
# wall clock time should be above sleep_secs
# but cpu time should be tiny
#print diff, cpu_diff
return diff>sleep_secs, cpu_diff<0.5
if __name__=='__main__':
print('Time')
library_time(*parameter_list[0])
#!/usr/bin/env python
import numpy
from scipy.io import mmread
# Loading an example sparse matrix of dimension 479x479, real, unsymmetric
mtx=mmread('../../../data/logdet/west0479.mtx')
parameter_list=[[mtx,6000,10]]
def mathematics_linsolver_cg (matrix=mtx,max_iter=1000,seed=10):
# Create a Hermitian sparse matrix
from scipy.sparse import eye
rows=matrix.shape[0]
cols=matrix.shape[1]
A=matrix.transpose()*matrix+eye(rows, cols)
# Create a random vector (b) of the system Ax=b
numpy.random.seed(seed)
b=numpy.array(numpy.random.randn(rows))
# create linear system with linear operator and vector
from scipy.sparse import csc_matrix
try:
from shogun.Mathematics import RealSparseMatrixOperator
from shogun.Mathematics import ConjugateGradientSolver
op=RealSparseMatrixOperator(A.tocsc())
solver=ConjugateGradientSolver()
# set the iteration limit higher for poorly conditioned matrices
solver.set_iteration_limit(max_iter)
x=solver.solve(op, b)
# verifying the solution via direct solving
from scipy.sparse.linalg import spsolve
y=spsolve(A,b)
print(numpy.linalg.norm(x-y))
return x
except ImportError:
print('Shogun not installed with Eigen3!')
if __name__=='__main__':
print('CG')
mathematics_linsolver_cg (*parameter_list[0])
#!/usr/bin/env python
from numpy import *
from scipy.io import mmread
# Loading an example sparse matrix of dimension 479x479, real, unsymmetric
mtx=mmread('../../../data/logdet/west0479.mtx')
parameter_list=[[mtx,100,60,1]]
def mathematics_logdet (matrix=mtx,max_iter_eig=1000,max_iter_lin=1000,num_samples=1):
from scipy.sparse import eye
# Create a Hermitian sparse matrix
rows=matrix.shape[0]
cols=matrix.shape[1]
A=matrix.transpose()*matrix+eye(rows, cols)
from scipy.sparse import csc_matrix
try:
from shogun.Mathematics import RealSparseMatrixOperator
from shogun.Mathematics import LanczosEigenSolver
from shogun.Mathematics import CGMShiftedFamilySolver
from shogun.Mathematics import LogRationalApproximationCGM
from shogun.Mathematics import ProbingSampler
from shogun.Mathematics import LogDetEstimator
from shogun.Mathematics import Statistics
from shogun.Library import SerialComputationEngine
# creating the linear operator, eigen-solver
op=RealSparseMatrixOperator(A.tocsc())
eig_solver=LanczosEigenSolver(op)
# we can set the iteration limit high for poorly conditioned matrices
eig_solver.set_max_iteration_limit(max_iter_eig)
# alternatively, if the matrix is small, we can compute eigenvalues externally
# and set min/max eigenvalues into the eigensolver
# from scipy.sparse.linalg import eigsh
# eigenvalues=eigsh(A, rows-1)
# eig_solver.set_min_eigenvalue(eigenvalues[0][0])
# eig_solver.set_max_eigenvalue(eigenvalues[0][-1])
# create the shifted-family linear solver which solves for all the shifts
# using as many matrix-vector products as one shift in CG iterations
lin_solver=CGMShiftedFamilySolver()
lin_solver.set_iteration_limit(max_iter_lin)
# computation engine
engine=SerialComputationEngine()
# set the desired accuracy tighter to obtain better results
# this determines the number of contour points in the conformal mapping of
# the rational approximation of Cauchy's integral of f(A)*s, f=log
desired_accuracy=1E-5
# creating the log-linear-operator function
op_func=LogRationalApproximationCGM(op, engine, eig_solver, lin_solver,\
desired_accuracy)
# set the trace sampler to be probing sampler, in which samples are obtained
# by greedy graph coloring of the power of sparse matrix (default is power=1,
# 2-distance coloring)
trace_sampler=ProbingSampler(op)
# estimating log-det
log_det_estimator=LogDetEstimator(trace_sampler, op_func, engine)
# set the number of samples as required
estimates=log_det_estimator.sample(num_samples)
estimated_logdet=sum(estimates)/len(estimates)
actual_logdet=Statistics.log_det(A)
print(actual_logdet, estimated_logdet)
return estimates
except ImportError:
print('One or more of the dependencies (Eigen3/LAPACK/ColPack) not found!')
if __name__=='__main__':
print('LogDetEstimator')
mathematics_logdet (*parameter_list[0])
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')
parameter_list = [[data,0.0],[data,1.0]]
def mathematics_sparseinversecovariance_modular (data,lc):
try:
from modshogun import SparseInverseCovariance
except ImportError:
print("SparseInverseCovariance not available")
exit(0)
from numpy import dot
sic = SparseInverseCovariance()
S = dot(data,data.T)
Si = sic.estimate(S,lc)
return Si
if __name__=='__main__':
print('SparseInverseCovariance')
mathematics_sparseinversecovariance_modular(*parameter_list[0])
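# Note (an assumption for orientation, not stated in the example): S=dot(data,data.T)
# plays the role of an unnormalized empirical covariance matrix and 'lc' acts as
# the sparsity regularizer; lc=0.0 yields an essentially dense estimate, while
# larger values drive off-diagonal entries of the estimated inverse covariance
# towards zero.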
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
parameter_list = [[traindat,testdat,label_traindat,3]]
def metric_lmnn_modular(train_fname=traindat,test_fname=testdat,label_train_fname=label_traindat,k=3):
try:
from modshogun import RealFeatures,MulticlassLabels,LMNN,KNN,CSVFile
except ImportError:
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=MulticlassLabels(CSVFile(label_train_fname))
# LMNN
lmnn=LMNN(feats_train,labels,k)
lmnn.train()
lmnn_distance=lmnn.get_distance()
# perform classification with KNN
knn=KNN(k,lmnn_distance,labels)
knn.train()
output=knn.apply(feats_test).get_labels()
return lmnn,output
if __name__=='__main__':
print('LMNN')
metric_lmnn_modular(*parameter_list[0])
# In this example we show how to perform Multiple Kernel Learning (MKL)
# with the modular interface. First, we create a number of base kernels.
# These kernels can capture different views of the same features, or actually
# consider entirely different features associated with the same example
# (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample).
# The base kernels are then subsequently added to a CombinedKernel, which
# contains a weight for each kernel and encapsulates the base kernels
# from the training procedure. When the CombinedKernel between two examples is
# evaluated it computes the corresponding linear combination of kernels according to their weights.
# We then show how to create an MKLClassifier that trains an SVM and learns the optimal
# weighting of kernels (w.r.t. a given norm q) at the same time.
# Finally, the example shows how to classify with a trained MKLClassifier.
#
#!/usr/bin/env python
from modshogun import CombinedFeatures, RealFeatures, BinaryLabels
from modshogun import CombinedKernel, PolyKernel, CustomKernel
from modshogun import MKLClassification
from tools.load import LoadMatrix
lm=LoadMatrix()
# only run the example if SVMLight is included, as the LibSVM solver crashes in MKLClassification
try:
from modshogun import SVMLight
except ImportError:
print("SVMLight not available")
exit(0)
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat],[traindat,testdat,label_traindat]]
# fm_train_real.shape
# fm_test_real.shape
# combined_custom()
def mkl_binclass_modular (fm_train_real=traindat,fm_test_real=testdat,fm_label_twoclass = label_traindat):
##################################
# set up and train
# create some poly train/test matrix
tfeats = RealFeatures(fm_train_real)
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, tfeats)
K_train = tkernel.get_kernel_matrix()
pfeats = RealFeatures(fm_test_real)
tkernel.init(tfeats, pfeats)
K_test = tkernel.get_kernel_matrix()
# create combined train features
feats_train = CombinedFeatures()
feats_train.append_feature_obj(RealFeatures(fm_train_real))
# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(CustomKernel(K_train))
kernel.append_kernel(PolyKernel(10,2))
kernel.init(feats_train, feats_train)
# train mkl
labels = BinaryLabels(fm_label_twoclass)
mkl = MKLClassification()
# which norm to use for MKL
mkl.set_mkl_norm(1) #2,3
# set cost (neg, pos)
mkl.set_C(1, 1)
# set kernel and labels
mkl.set_kernel(kernel)
mkl.set_labels(labels)
# train
mkl.train()
#w=kernel.get_subkernel_weights()
#kernel.set_subkernel_weights(w)
##################################
# test
# create combined test features
feats_pred = CombinedFeatures()
feats_pred.append_feature_obj(RealFeatures(fm_test_real))
# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(CustomKernel(K_test))
kernel.append_kernel(PolyKernel(10, 2))
kernel.init(feats_train, feats_pred)
# and classify
mkl.set_kernel(kernel)
mkl.apply()
return mkl.apply(),kernel
if __name__=='__main__':
mkl_binclass_modular (*parameter_list[0])
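# A short usage sketch: the predictions and the learned subkernel weights can be
# read off the returned objects (assumes the data files above are present):
out,used_kernel=mkl_binclass_modular(*parameter_list[0])
print(used_kernel.get_subkernel_weights())
print(out.get_labels())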
# In this example we show how to perform Multiple Kernel Learning (MKL)
# with the modular interface for multi-class classification.
# First, we create a number of base kernels and features.
# These kernels can capture different views of the same features, or actually
# consider entirely different features associated with the same example
# (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample).
# The base kernels are then subsequently added to a CombinedKernel, which
# contains a weight for each kernel and encapsulates the base kernels
# from the training procedure. When the CombinedKernel between two examples is
# evaluated it computes the corresponding linear combination of kernels according to their weights.
# We then show how to create an MKLMultiClass classifier that trains an SVM and learns the optimal
# weighting of kernels (w.r.t. a given norm q) at the same time. The main difference to the binary
# classification version of MKL is that we can use more than two values as labels when training
# the classifier.
# Finally, the example shows how to classify with a trained MKLMultiClass classifier.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm = LoadMatrix()
fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')
parameter_list=[
[ fm_train_real, fm_test_real, label_train_multiclass, 1.2, 1.2, 1e-5, 1, 0.001, 1.5],
[ fm_train_real, fm_test_real, label_train_multiclass, 5, 1.2, 1e-2, 1, 0.001, 2]]
def mkl_multiclass_modular (fm_train_real, fm_test_real, label_train_multiclass,
width, C, epsilon, num_threads, mkl_epsilon, mkl_norm):
from modshogun import CombinedFeatures, RealFeatures, MulticlassLabels
from modshogun import CombinedKernel, GaussianKernel, LinearKernel,PolyKernel
from modshogun import MKLMulticlass
kernel = CombinedKernel()
feats_train = CombinedFeatures()
feats_test = CombinedFeatures()
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = GaussianKernel(10, width)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = LinearKernel()
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = PolyKernel(10,2)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
labels = MulticlassLabels(label_train_multiclass)
mkl = MKLMulticlass(C, kernel, labels)
mkl.set_epsilon(epsilon)
mkl.parallel.set_num_threads(num_threads)
mkl.set_mkl_epsilon(mkl_epsilon)
mkl.set_mkl_norm(mkl_norm)
mkl.train()
kernel.init(feats_train, feats_test)
out = mkl.apply().get_labels()
return out
if __name__ == '__main__':
print('mkl_multiclass')
mkl_multiclass_modular(*parameter_list[0])
#!/usr/bin/env python
#
# This program is free software you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
from numpy import array
from numpy import random
import math
from modshogun import CrossValidation, CrossValidationResult
from modshogun import ContingencyTableEvaluation, ACCURACY
from modshogun import StratifiedCrossValidationSplitting
from modshogun import BinaryLabels
from modshogun import RealFeatures
from modshogun import GaussianKernel, PowerKernel
from modshogun import LibSVM
from modshogun import MinkowskiMetric
from modshogun import GridSearchModelSelection
from modshogun import ModelSelectionParameters, R_EXP, R_LINEAR
from modshogun import ParameterCombination
from modshogun import Math
def create_param_tree():
root=ModelSelectionParameters()
c1=ModelSelectionParameters("C1")
root.append_child(c1)
c1.build_values(-1.0, 1.0, R_EXP)
c2=ModelSelectionParameters("C2")
root.append_child(c2)
c2.build_values(-1.0, 1.0, R_EXP)
gaussian_kernel=GaussianKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#gaussian_kernel.print_modsel_params()
param_gaussian_kernel=ModelSelectionParameters("kernel", gaussian_kernel)
gaussian_kernel_width=ModelSelectionParameters("log_width")
gaussian_kernel_width.build_values(-math.log(2.0), 0.0, R_EXP, 1.0, 2.0)
param_gaussian_kernel.append_child(gaussian_kernel_width)
root.append_child(param_gaussian_kernel)
power_kernel=PowerKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#power_kernel.print_modsel_params()
param_power_kernel=ModelSelectionParameters("kernel", power_kernel)
root.append_child(param_power_kernel)
param_power_kernel_degree=ModelSelectionParameters("degree")
param_power_kernel_degree.build_values(1.0, 2.0, R_LINEAR)
param_power_kernel.append_child(param_power_kernel_degree)
metric=MinkowskiMetric(10)
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#metric.print_modsel_params()
param_power_kernel_metric1=ModelSelectionParameters("distance", metric)
param_power_kernel.append_child(param_power_kernel_metric1)
param_power_kernel_metric1_k=ModelSelectionParameters("k")
param_power_kernel_metric1_k.build_values(1.0, 2.0, R_LINEAR)
param_power_kernel_metric1.append_child(param_power_kernel_metric1_k)
return root
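# As an illustration (assuming R_EXP's defaults of step 1.0 and base 2.0):
# build_values(-1.0, 1.0, R_EXP) enumerates 2^-1, 2^0, 2^1 = 0.5, 1.0, 2.0,
# while build_values(1.0, 2.0, R_LINEAR) enumerates 1.0 and 2.0.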
parameter_list = [[3,20,3]]
def modelselection_grid_search_kernel (num_subsets, num_vectors, dim_vectors):
# init seed for reproducibility
Math.init_random(1)
random.seed(1)
# create some (non-sense) data
matrix=random.rand(dim_vectors, num_vectors)
# create num_vectors random vectors of dimension dim_vectors
features=RealFeatures()
features.set_feature_matrix(matrix)
# create labels, two classes
labels=BinaryLabels(num_vectors)
for i in range(num_vectors):
labels.set_label(i, 1 if i%2==0 else -1)
# create svm
classifier=LibSVM()
# splitting strategy
splitting_strategy=StratifiedCrossValidationSplitting(labels, num_subsets)
# accuracy evaluation
evaluation_criterion=ContingencyTableEvaluation(ACCURACY)
# cross validation class for evaluation in model selection
cross=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterion)
cross.set_num_runs(1)
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#classifier.print_modsel_params()
# model parameter selection
param_tree=create_param_tree()
#param_tree.print_tree()
grid_search=GridSearchModelSelection(cross, param_tree)
print_state=False
best_combination=grid_search.select_model(print_state)
#print("best parameter(s):")
#best_combination.print_tree()
best_combination.apply_to_machine(classifier)
# larger number of runs to have less variance
cross.set_num_runs(10)
result=cross.evaluate()
casted=CrossValidationResult.obtain_from_generic(result)
#print "result mean:", casted.mean
return classifier,result,casted.mean
if __name__=='__main__':
print('ModelselectionGridSearchKernel')
modelselection_grid_search_kernel(*parameter_list[0])
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2], \
[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2]]
def modelselection_grid_search_krr_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat,\
width=2.1,C=1,epsilon=1e-5,tube_epsilon=1e-2):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import MeanSquaredError
from modshogun import CrossValidationSplitting
from modshogun import RegressionLabels
from modshogun import RealFeatures
from modshogun import KernelRidgeRegression
from modshogun import GridSearchModelSelection
from modshogun import ModelSelectionParameters
# training data
features_train=RealFeatures(fm_train)
features_test=RealFeatures(fm_test)
# labels
labels=RegressionLabels(label_train)
# predictor (the initial tau doesn't matter; it is chosen by model selection below)
predictor=KernelRidgeRegression()
# splitting strategy for 5-fold cross-validation (for classification it is
# better to use StratifiedCrossValidationSplitting; for regression the
# standard CrossValidationSplitting used here is appropriate)
splitting_strategy=CrossValidationSplitting(labels, 5)
# evaluation method
evaluation_criterium=MeanSquaredError()
# cross-validation instance
cross_validation=CrossValidation(predictor, features_train, labels,
splitting_strategy, evaluation_criterium)
# (optional) repeat x-val (set larger to get better estimates)
cross_validation.set_num_runs(2)
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#predictor.print_modsel_params()
# build parameter tree to select regularization parameter
param_tree_root=create_param_tree()
# model selection instance
model_selection=GridSearchModelSelection(cross_validation, param_tree_root)
# perform model selection with selected methods
#print "performing model selection of"
#print "parameter tree:"
#param_tree_root.print_tree()
#print "starting model selection"
# print the current parameter combination (nothing is printed if there are no parameters)
print_state=False
best_parameters=model_selection.select_model(print_state)
# print best parameters
#print "best parameters:"
#best_parameters.print_tree()
# apply them and print result
best_parameters.apply_to_machine(predictor)
result=cross_validation.evaluate()
#print "mean:", result.mean
# creates all the parameters to optimize
def create_param_tree():
from modshogun import ModelSelectionParameters, R_EXP, R_LINEAR
from modshogun import ParameterCombination
from modshogun import GaussianKernel, PolyKernel
import math
root=ModelSelectionParameters()
tau=ModelSelectionParameters("tau")
root.append_child(tau)
# also R_LINEAR/R_LOG is available as type
min=-1
max=1
type=R_EXP
step=1.5
base=2
tau.build_values(min, max, type, step, base)
# gaussian kernel with width
gaussian_kernel=GaussianKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#gaussian_kernel.print_modsel_params()
param_gaussian_kernel=ModelSelectionParameters("kernel", gaussian_kernel)
gaussian_kernel_width=ModelSelectionParameters("log_width")
gaussian_kernel_width.build_values(2.0*math.log(2.0), 2.5*math.log(2.0), R_LINEAR, 1.0)
param_gaussian_kernel.append_child(gaussian_kernel_width)
root.append_child(param_gaussian_kernel)
# polynomial kernel with degree
poly_kernel=PolyKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#poly_kernel.print_modsel_params()
param_poly_kernel=ModelSelectionParameters("kernel", poly_kernel)
root.append_child(param_poly_kernel)
# note that integers are used here
param_poly_kernel_degree=ModelSelectionParameters("degree")
param_poly_kernel_degree.build_values(1, 2, R_LINEAR)
param_poly_kernel.append_child(param_poly_kernel_degree)
return root
if __name__=='__main__':
print('ModelselectionGridSearchKRR')
modelselection_grid_search_krr_modular(*parameter_list[0])
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2011 Heiko Strathmann
# Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
#
from numpy.random import randn
from numpy import *
# generate some overlapping training vectors
num_vectors=100
vec_distance=1
traindat=concatenate((randn(2,num_vectors)-vec_distance,
randn(2,num_vectors)+vec_distance), axis=1)
label_traindat=concatenate((-ones(num_vectors), ones(num_vectors)))
parameter_list = [[traindat,label_traindat]]
def modelselection_grid_search_liblinear_modular (traindat=traindat, label_traindat=label_traindat):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import ContingencyTableEvaluation, ACCURACY
from modshogun import StratifiedCrossValidationSplitting
from modshogun import GridSearchModelSelection
from modshogun import ModelSelectionParameters, R_EXP
from modshogun import ParameterCombination
from modshogun import BinaryLabels
from modshogun import RealFeatures
from modshogun import LibLinear, L2R_L2LOSS_SVC
# build parameter tree to select C1 and C2
param_tree_root=ModelSelectionParameters()
c1=ModelSelectionParameters("C1");
param_tree_root.append_child(c1)
c1.build_values(-1.0, 0.0, R_EXP);
c2=ModelSelectionParameters("C2");
param_tree_root.append_child(c2);
c2.build_values(-1.0, 0.0, R_EXP);
# training data
features=RealFeatures(traindat)
labels=BinaryLabels(label_traindat)
# classifier
classifier=LibLinear(L2R_L2LOSS_SVC)
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#classifier.print_modsel_params()
# splitting strategy for cross-validation
splitting_strategy=StratifiedCrossValidationSplitting(labels, 10)
# evaluation method
evaluation_criterium=ContingencyTableEvaluation(ACCURACY)
# cross-validation instance
cross_validation=CrossValidation(classifier, features, labels,
splitting_strategy, evaluation_criterium)
cross_validation.set_autolock(False)
# model selection instance
model_selection=GridSearchModelSelection(cross_validation, param_tree_root)
# perform model selection with selected methods
#print "performing model selection of"
#param_tree_root.print_tree()
best_parameters=model_selection.select_model()
# print best parameters
#print "best parameters:"
#best_parameters.print_tree()
# apply them and print result
best_parameters.apply_to_machine(classifier)
result=cross_validation.evaluate()
#result.print_result()
if __name__=='__main__':
print('ModelSelectionGridSearchLibLinear')
modelselection_grid_search_liblinear_modular(*parameter_list[0])
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2], \
[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2]]
def modelselection_grid_search_libsvr_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat,\
width=2.1,C=1,epsilon=1e-5,tube_epsilon=1e-2):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import MeanSquaredError
from modshogun import CrossValidationSplitting
from modshogun import RegressionLabels
from modshogun import RealFeatures
from modshogun import GaussianKernel
from modshogun import LibSVR
from modshogun import GridSearchModelSelection
from modshogun import ModelSelectionParameters, R_EXP
from modshogun import ParameterCombination
# training data
features_train=RealFeatures(fm_train)
labels=RegressionLabels(label_train)
# kernel
kernel=GaussianKernel(features_train, features_train, width)
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#kernel.print_modsel_params()
# predictor
predictor=LibSVR(C, tube_epsilon, kernel, labels)
predictor.set_epsilon(epsilon)
# splitting strategy for 5-fold cross-validation (for classification it is
# better to use "StratifiedCrossValidationSplitting", but the standard
# "CrossValidationSplitting" is used here since this is a regression task)
splitting_strategy=CrossValidationSplitting(labels, 5)
# evaluation method
evaluation_criterium=MeanSquaredError()
# cross-validation instance
cross_validation=CrossValidation(predictor, features_train, labels,
splitting_strategy, evaluation_criterium)
# (optional) repeat x-val (set larger to get better estimates)
cross_validation.set_num_runs(2)
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#predictor.print_modsel_params()
# build parameter tree to select C1 and C2
param_tree_root=ModelSelectionParameters()
c1=ModelSelectionParameters("C1");
param_tree_root.append_child(c1)
c1.build_values(-1.0, 0.0, R_EXP);
c2=ModelSelectionParameters("C2");
param_tree_root.append_child(c2);
c2.build_values(-1.0, 0.0, R_EXP);
# model selection instance
model_selection=GridSearchModelSelection(cross_validation, param_tree_root)
# perform model selection with selected methods
#print "performing model selection of"
#print "parameter tree"
#param_tree_root.print_tree()
#print "starting model selection"
# whether to print each parameter combination as it is evaluated; if False nothing is printed
print_state=False
# lock the data beforehand, since model selection will not change the kernel
# matrix (use with care); this avoids recomputing the kernel matrix in every
# iteration of the model search
predictor.data_lock(labels, features_train)
best_parameters=model_selection.select_model(print_state)
# print best parameters
#print "best parameters:"
#best_parameters.print_tree()
# apply them and print result
best_parameters.apply_to_machine(predictor)
result=cross_validation.evaluate()
#print "mean:", result.mean
if __name__=='__main__':
print('ModelselectionGridSearchLibSVR')
modelselection_grid_search_libsvr_modular(*parameter_list[0])
# In this example a complex model parameter selection tree
# is constructed
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2011-2012 Heiko Strathmann
# Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
#
parameter_list=[[None]]
def modelselection_parameter_tree_modular (dummy):
from modshogun import ParameterCombination
from modshogun import ModelSelectionParameters, R_EXP, R_LINEAR
from modshogun import PowerKernel
from modshogun import GaussianKernel
from modshogun import DistantSegmentsKernel
from modshogun import MinkowskiMetric
import math
root=ModelSelectionParameters()
combinations=root.get_combinations()
combinations.get_num_elements()
c=ModelSelectionParameters('C');
root.append_child(c)
c.build_values(1, 11, R_EXP)
power_kernel=PowerKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#power_kernel.print_modsel_params()
param_power_kernel=ModelSelectionParameters('kernel', power_kernel)
root.append_child(param_power_kernel)
param_power_kernel_degree=ModelSelectionParameters('degree')
param_power_kernel_degree.build_values(1, 1, R_EXP)
param_power_kernel.append_child(param_power_kernel_degree)
metric1=MinkowskiMetric(10)
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#metric1.print_modsel_params()
param_power_kernel_metric1=ModelSelectionParameters('distance', metric1)
param_power_kernel.append_child(param_power_kernel_metric1)
param_power_kernel_metric1_k=ModelSelectionParameters('k')
param_power_kernel_metric1_k.build_values(1, 12, R_LINEAR)
param_power_kernel_metric1.append_child(param_power_kernel_metric1_k)
gaussian_kernel=GaussianKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#gaussian_kernel.print_modsel_params()
param_gaussian_kernel=ModelSelectionParameters('kernel', gaussian_kernel)
root.append_child(param_gaussian_kernel)
param_gaussian_kernel_width=ModelSelectionParameters('log_width')
param_gaussian_kernel_width.build_values(0.0, 0.5*math.log(2.0), R_LINEAR)
param_gaussian_kernel.append_child(param_gaussian_kernel_width)
ds_kernel=DistantSegmentsKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#ds_kernel.print_modsel_params()
param_ds_kernel=ModelSelectionParameters('kernel', ds_kernel)
root.append_child(param_ds_kernel)
param_ds_kernel_delta=ModelSelectionParameters('delta')
param_ds_kernel_delta.build_values(1, 2, R_EXP)
param_ds_kernel.append_child(param_ds_kernel_delta)
param_ds_kernel_theta=ModelSelectionParameters('theta')
param_ds_kernel_theta.build_values(1, 2, R_EXP)
param_ds_kernel.append_child(param_ds_kernel_theta)
# root.print_tree()
combinations=root.get_combinations()
# for i in range(combinations.get_num_elements()):
# params = ParameterCombination.obtain_from_generic(combinations.get_element(i))
# params.print_tree()
return
if __name__=='__main__':
print('ModelSelection ParameterTree')
modelselection_parameter_tree_modular(*parameter_list[0])
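# A hedged back-of-the-envelope count of the combinations such a tree encodes,
# assuming R_EXP/R_LINEAR ranges enumerate inclusive unit steps (base 2 for
# R_EXP) and that every combination pairs one C value with exactly one kernel
# branch. The authoritative number is what
# root.get_combinations().get_num_elements() returns in the example above.
n_C     = 11        # C: exponents 1..11
n_power = 1 * 12    # power kernel: one degree value times k in 1..12
n_gauss = 1         # gaussian kernel: one log_width value in [0, 0.5*log(2)]
n_ds    = 2 * 2     # distant segments kernel: delta times theta, 2 values each
#print('combinations:', n_C * (n_power + n_gauss + n_ds))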
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Copyright (C) 2012 Sergey Lisitsyn
from numpy import *
from numpy.random import randn
# generate some overlapping training vectors
num_vectors=100
vec_distance=1
traindat=concatenate((randn(2,num_vectors)-vec_distance,
randn(2,num_vectors)+vec_distance), axis=1)
label_traindat=concatenate((-ones(num_vectors), ones(num_vectors)));
parameter_list = [[traindat,label_traindat]]
def modelselection_random_search_liblinear_modular (traindat=traindat, label_traindat=label_traindat):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import ContingencyTableEvaluation, ACCURACY
from modshogun import StratifiedCrossValidationSplitting
from modshogun import RandomSearchModelSelection
from modshogun import ModelSelectionParameters, R_EXP
from modshogun import ParameterCombination
from modshogun import BinaryLabels
from modshogun import RealFeatures
from modshogun import LibLinear, L2R_L2LOSS_SVC
# build parameter tree to select C1 and C2
param_tree_root=ModelSelectionParameters()
c1=ModelSelectionParameters("C1");
param_tree_root.append_child(c1)
c1.build_values(-2.0, 2.0, R_EXP);
c2=ModelSelectionParameters("C2");
param_tree_root.append_child(c2);
c2.build_values(-2.0, 2.0, R_EXP);
# training data
features=RealFeatures(traindat)
labels=BinaryLabels(label_traindat)
# classifier
classifier=LibLinear(L2R_L2LOSS_SVC)
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#classifier.print_modsel_params()
# splitting strategy for cross-validation
splitting_strategy=StratifiedCrossValidationSplitting(labels, 10)
# evaluation method
evaluation_criterium=ContingencyTableEvaluation(ACCURACY)
# cross-validation instance
cross_validation=CrossValidation(classifier, features, labels,
splitting_strategy, evaluation_criterium)
cross_validation.set_autolock(False)
# model selection instance
model_selection=RandomSearchModelSelection(cross_validation, param_tree_root, 0.5)
# perform model selection with selected methods
#print "performing model selection of"
#param_tree_root.print_tree()
best_parameters=model_selection.select_model()
# print best parameters
#print "best parameters:"
#best_parameters.print_tree()
# apply them and print result
best_parameters.apply_to_machine(classifier)
result=cross_validation.evaluate()
#result.print_result()
if __name__=='__main__':
print('ModelSelectionRandomSearchLibLinear')
modelselection_random_search_liblinear_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
# set both input attributes as not nominal (i.e. continuous)
feattypes = array([False, False])
parameter_list = [[traindat,testdat,label_traindat,feattypes]]
def multiclass_c45classifiertree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
try:
from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
from numpy import random, int32
except ImportError:
print("Could not import Shogun and/or numpy modules")
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(CSVFile(train))
feats_test=RealFeatures(CSVFile(test))
train_labels=MulticlassLabels(CSVFile(labels))
# divide train dataset into training and validation subsets in the ratio 2/3 to 1/3
subset=int32(random.permutation(feats_train.get_num_vectors()))
vsubset=subset[0:subset.size//3]
trsubset=subset[subset.size//3:subset.size]
# C4.5 Tree formation using training subset
train_labels.add_subset(trsubset)
feats_train.add_subset(trsubset)
c=C45ClassifierTree()
c.set_labels(train_labels)
c.set_feature_types(ft)
c.train(feats_train)
train_labels.remove_subset()
feats_train.remove_subset()
# prune tree using validation subset
train_labels.add_subset(vsubset)
feats_train.add_subset(vsubset)
c.prune_tree(feats_train,train_labels)
train_labels.remove_subset()
feats_train.remove_subset()
# Classify test data
output=c.apply_multiclass(feats_test).get_labels()
output_certainty=c.get_certainty_vector()
return c,output,output_certainty
if __name__=='__main__':
print('C45ClassifierTree')
multiclass_c45classifiertree_modular(*parameter_list[0])
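# A short hedged sketch of how the returned certainty vector might be used:
# keep only the test predictions the pruned tree is confident about. Purely
# illustrative numpy code; the 0.8 threshold is an arbitrary assumption and
# not part of the example above.
import numpy
def confident_predictions_sketch(output, output_certainty, threshold=0.8):
    output = numpy.asarray(output)
    certainty = numpy.asarray(output_certainty)
    mask = certainty >= threshold     # boolean mask of confident predictions
    return output[mask], mask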
#!/usr/bin/env python
from numpy import array
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
# set both input attributes as not nominal (i.e. continuous)
feattypes = array([False, False])
parameter_list = [[traindat,testdat,label_traindat,feattypes]]
def multiclass_cartree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
try:
from modshogun import RealFeatures, MulticlassLabels, CSVFile, CARTree, PT_MULTICLASS
except ImportError:
print("Could not import Shogun modules")
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(CSVFile(train))
feats_test=RealFeatures(CSVFile(test))
train_labels=MulticlassLabels(CSVFile(labels))
# CART Tree formation with 5 fold cross-validation pruning
c=CARTree(ft,PT_MULTICLASS,5,True)
c.set_labels(train_labels)
c.train(feats_train)
# Classify test data
output=c.apply_multiclass(feats_test).get_labels()
return c,output
if __name__=='__main__':
print('CARTree')
multiclass_cartree_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array, dtype, int32
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
# set both input attributes as continuous (i.e. type 2)
feattypes = array([2, 2],dtype=int32)
parameter_list = [[traindat,testdat,label_traindat,feattypes]]
def multiclass_chaidtree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
try:
from modshogun import RealFeatures, MulticlassLabels, CSVFile, CHAIDTree
except ImportError:
print("Could not import Shogun modules")
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(CSVFile(train))
feats_test=RealFeatures(CSVFile(test))
train_labels=MulticlassLabels(CSVFile(labels))
# CHAID Tree formation with nominal dependent variable
c=CHAIDTree(0,feattypes,10)
c.set_labels(train_labels)
c.train(feats_train)
# Classify test data
output=c.apply_multiclass(feats_test).get_labels()
return c,output
if __name__=='__main__':
print('CHAIDTree')
multiclass_chaidtree_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array
# create data
train_data = array([[1.0, 2.0, 1.0, 3.0, 1.0, 3.0, 2.0, 2.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 2.0],
[2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0],
[3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 2.0, 1.0, 3.0, 1.0, 2.0, 1.0, 3.0, 1.0, 2.0],
[1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0]])
train_labels = array([1.0, 2.0, 1.0, 3.0, 1.0, 2.0, 2.0, 1.0, 3.0, 1.0, 2.0, 1.0, 3.0, 1.0, 2.0])
test_data = array([[2.0, 2.0, 1.0, 3.0, 3.0],
[2.0, 1.0, 2.0, 1.0, 2.0],
[3.0, 2.0, 1.0, 3.0, 2.0],
[1.0, 2.0, 1.0, 2.0, 1.0]])
parameter_list = [[train_data, train_labels, test_data]]
def multiclass_id3classifiertree_modular(train=train_data,labels=train_labels,test=test_data):
try:
from modshogun import RealFeatures, MulticlassLabels, ID3ClassifierTree
except ImportError:
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(train)
feats_test=RealFeatures(test)
feats_labels=MulticlassLabels(labels)
# ID3 Tree formation
id3=ID3ClassifierTree()
id3.set_labels(feats_labels)
id3.train(feats_train)
# Classify test data
output=id3.apply_multiclass(feats_test).get_labels()
return id3,output
if __name__=='__main__':
print('ID3ClassifierTree')
multiclass_id3classifiertree_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
# set both input attributes as not nominal (i.e. continuous)
feattypes = array([False, False])
parameter_list = [[traindat,testdat,label_traindat,feattypes]]
def multiclass_randomforest_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
try:
from modshogun import RealFeatures, MulticlassLabels, CSVFile, RandomForest, MajorityVote
except ImportError:
print("Could not import Shogun modules")
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(CSVFile(train))
feats_test=RealFeatures(CSVFile(test))
train_labels=MulticlassLabels(CSVFile(labels))
# Random Forest formation
rand_forest=RandomForest(feats_train,train_labels,20,1)
rand_forest.set_feature_types(ft)
rand_forest.set_combination_rule(MajorityVote())
rand_forest.train()
# Classify test data
output=rand_forest.apply_multiclass(feats_test).get_labels()
return rand_forest,output
if __name__=='__main__':
print('RandomForest')
multiclass_randomforest_modular(*parameter_list[0])
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')
parameter_list = [[data, 20], [data, 30]]
def preprocessor_dimensionreductionpreprocessor_modular (data, k):
from modshogun import RealFeatures
from modshogun import DimensionReductionPreprocessor
try:
from modshogun import LocallyLinearEmbedding
except ImportError:
print("LocallyLinearEmbedding not available")
exit(0)
features = RealFeatures(data)
converter = LocallyLinearEmbedding()
converter.set_k(k)
preprocessor = DimensionReductionPreprocessor(converter)
preprocessor.init(features)
preprocessor.apply_to_feature_matrix(features)
return features
if __name__=='__main__':
print('DimensionReductionPreprocessor')
preprocessor_dimensionreductionpreprocessor_modular(*parameter_list[0])
#!/usr/bin/env python
from tools.load import LoadMatrix
from modshogun import *
lm=LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')
labels = lm.load_numbers('../data/label_train_multiclass.dat')
parameter_list = [[data, labels, CANVAR_FLDA], [data, labels, CLASSIC_FLDA]]
def preprocessor_fisherlda_modular (data, labels, method):
from modshogun import RealFeatures, MulticlassLabels, CANVAR_FLDA
from modshogun import FisherLda
sg_features = RealFeatures(data)
sg_labels = MulticlassLabels(labels)
preprocessor=FisherLda(method)
preprocessor.fit(sg_features, sg_labels, 1)
yn=preprocessor.apply_to_feature_matrix(sg_features)
return yn
if __name__=='__main__':
print('FisherLda')
preprocessor_fisherlda_modular(*parameter_list[0])
# In this example toy data is being processed using the kernel PCA algorithm
# as described in
#
# Schölkopf, B., Smola, A. J., & Muller, K. R. (1999).
# Kernel Principal Component Analysis.
# Advances in kernel methods support vector learning, 1327(3), 327-352. MIT Press.
# Retrieved from http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.32.8744
#
# A Gaussian kernel is used for the processing.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')
parameter_list = [[data, 0.01, 1.0], [data, 0.05, 2.0]]
def preprocessor_kernelpca_modular (data, threshold, width):
from modshogun import RealFeatures
from modshogun import KernelPCA
from modshogun import GaussianKernel
features = RealFeatures(data)
kernel = GaussianKernel(features,features,width)
preprocessor = KernelPCA(kernel)
preprocessor.init(features)
preprocessor.set_target_dim(2)
preprocessor.apply_to_feature_matrix(features)
return features
if __name__=='__main__':
print('KernelPCA')
preprocessor_kernelpca_modular(*parameter_list[0])
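# For reference, a compact numpy sketch of the computation kernel PCA performs
# internally: double-center the kernel matrix and project onto its top
# eigenvectors. Scaling and sign conventions are assumptions here and may
# differ from Shogun's implementation.
import numpy
def kernel_pca_sketch(K, target_dim=2):
    n = K.shape[0]
    one = numpy.ones((n, n)) / n
    Kc = K - one.dot(K) - K.dot(one) + one.dot(K).dot(one)  # double-centering
    w, V = numpy.linalg.eigh(Kc)                  # ascending eigenvalues
    idx = numpy.argsort(w)[::-1][:target_dim]     # top components
    return V[:, idx] * numpy.sqrt(numpy.maximum(w[idx], 0.0))  # n x target_dim scores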
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# LogPlusOne adds one to a dense real-valued vector and takes the logarithm of
# each component of it. It is most useful in situations where the inputs are
# counts: when comparing small counts, any difference may matter a lot, while
# the same difference between large counts matters little. This is what the
# log transformation controls for.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat+10,testdat+10,1.4,10],[traindat+10,testdat+10,1.5,10]]
def preprocessor_logplusone_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
from modshogun import Chi2Kernel
from modshogun import RealFeatures
from modshogun import LogPlusOne
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=LogPlusOne()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('LogPlusOne')
preprocessor_logplusone_modular(*parameter_list[0])
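# A tiny numpy illustration of the motivation stated above: after log(1+x), a
# difference between small counts stays visible while the same absolute
# difference between large counts nearly vanishes. Not part of the example
# itself.
import numpy
small = numpy.log1p(numpy.array([1.0, 3.0]))         # counts 1 vs 3
large = numpy.log1p(numpy.array([1001.0, 1003.0]))   # counts 1001 vs 1003
#print(small[1]-small[0], large[1]-large[0])         # ~0.69 vs ~0.002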
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# NormOne normalizes vectors to have norm 1.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.4,10],[traindat,testdat,1.5,10]]
def preprocessor_normone_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
from modshogun import Chi2Kernel
from modshogun import RealFeatures
from modshogun import NormOne
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preprocessor=NormOne()
preprocessor.init(feats_train)
feats_train.add_preprocessor(preprocessor)
feats_train.apply_preprocessor()
feats_test.add_preprocessor(preprocessor)
feats_test.apply_preprocessor()
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('NormOne')
preprocessor_normone_modular(*parameter_list[0])
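# For comparison, a one-line numpy equivalent of what NormOne is described to
# do above: scale every feature vector to unit Euclidean norm. This sketch
# assumes columns are examples, as in RealFeatures.
import numpy
def norm_one_sketch(X):
    norms = numpy.linalg.norm(X, axis=0)
    return X / numpy.where(norms == 0, 1.0, norms)   # guard against zero columns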
# In this example toy data is processed using
# Principal Component Analysis.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')
parameter_list = [[data]]
def preprocessor_pca_modular (data):
from modshogun import RealFeatures
from modshogun import PCA
features = RealFeatures(data)
preprocessor = PCA()
preprocessor.init(features)
preprocessor.apply_to_feature_matrix(features)
return features
if __name__=='__main__':
print('PCA')
preprocessor_pca_modular(*parameter_list[0])
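# A compact numpy sketch of classic PCA as the preprocessor is described to
# perform it: center the data, eigendecompose the covariance and project onto
# the top axes. Component count and sign conventions are assumptions here.
import numpy
def pca_sketch(X, n_components=2):
    Xc = X - X.mean(axis=1, keepdims=True)    # center each feature (row)
    C = Xc.dot(Xc.T) / (X.shape[1] - 1)       # feature covariance matrix
    w, V = numpy.linalg.eigh(C)
    idx = numpy.argsort(w)[::-1][:n_components]
    return V[:, idx].T.dot(Xc)                # projected examples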
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# PruneVarSubMean subtracts the mean from each feature and removes features that
# have zero variance.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.5,10],[traindat,testdat,1.5,10]]
def preprocessor_prunevarsubmean_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
from modshogun import Chi2Kernel
from modshogun import RealFeatures
from modshogun import PruneVarSubMean
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=PruneVarSubMean()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('PruneVarSubMean')
preprocessor_prunevarsubmean_modular(*parameter_list[0])
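# A numpy sketch of the PruneVarSubMean behaviour described above: subtract the
# per-feature mean and drop zero-variance features. Rows are features and
# columns are examples, matching RealFeatures; testing for exactly zero
# variance is an assumption.
import numpy
def prune_var_sub_mean_sketch(X):
    Xc = X - X.mean(axis=1, keepdims=True)
    keep = Xc.var(axis=1) > 0                 # features with nonzero variance
    return Xc[keep, :]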
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.5,10],[traindat,testdat,1.5,10]]
from modshogun import Math_init_random;
Math_init_random(12345);
def preprocessor_randomfouriergausspreproc_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
from modshogun import Chi2Kernel
from modshogun import RealFeatures
from modshogun import RandomFourierGaussPreproc
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=RandomFourierGaussPreproc()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('RandomFourierGaussPreproc')
preprocessor_randomfouriergausspreproc_modular(*parameter_list[0])
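# A hedged numpy sketch of the random Fourier feature idea behind this
# preprocessor (Rahimi & Recht): with Gaussian W, the map
# z(x) = sqrt(2/D) * cos(W^T x + b) satisfies z(x).z(y) ~ exp(-||x-y||^2/(2*sigma^2)).
# Shogun's exact parametrization and default dimensionality are not reproduced.
import numpy
def rff_sketch(X, D=100, sigma=1.0, seed=0):
    rng = numpy.random.RandomState(seed)
    W = rng.randn(X.shape[0], D) / sigma          # frequencies ~ N(0, 1/sigma^2)
    b = rng.uniform(0, 2*numpy.pi, size=D)        # random phases
    return numpy.sqrt(2.0/D) * numpy.cos(W.T.dot(X) + b[:, None])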
# In this example a kernel matrix is computed for a given string data set. The
# CommUlongString kernel is used to compute the spectrum kernel from strings that
# have been mapped into unsigned 64bit integers. These 64bit integers correspond
# to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted.
# This is done using the SortUlongString preprocessor, which sorts the individual
# strings in ascending order. The kernel function basically uses the algorithm in
# the unix "comm" command (hence the name). Note that this representation enables
# spectrum kernels of order 8 for 8bit alphabets (like binaries) and order 32 for
# 2-bit alphabets like DNA. For this kernel the linadd speedups are implemented
# (though there is room for improvement here when a whole set of sequences is
# ADDed) using sorted lists.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindna,testdna,4,0,False,False],[traindna,testdna,3,0,False,False]]
def preprocessor_sortulongstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False):
from modshogun import CommUlongStringKernel
from modshogun import StringCharFeatures, StringUlongFeatures, DNA
from modshogun import SortUlongString
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringUlongFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringUlongFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortUlongString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('CommUlongString')
preprocessor_sortulongstring_modular(*parameter_list[0])
# In this example a kernel matrix is computed for a given string data set. The
# CommWordString kernel is used to compute the spectrum kernel from strings that
# have been mapped into unsigned 16bit integers. These 16bit integers correspond
# to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted.
# This is done using the SortWordString preprocessor, which sorts the individual
# strings in ascending order. The kernel function basically uses the algorithm in
# the unix "comm" command (hence the name). Note that this representation is
# especially tuned to small alphabets (like the 2-bit alphabet DNA), for which it
# enables spectrum kernels of order up to 8. For this kernel the linadd speedups
# are quite efficiently implemented using direct maps.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindna,testdna,3,0,False,False],[traindna,testdna,3,0,False,False]]
def preprocessor_sortwordstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False):
from modshogun import CommWordStringKernel
from modshogun import StringCharFeatures, StringWordFeatures, DNA
from modshogun import SortWordString
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('CommWordString')
preprocessor_sortwordstring_modular(*parameter_list[0])
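# To make the "comm"-style computation above concrete, a small pure-Python
# sketch: split strings into sorted k-mer lists, then take the dot product of
# their k-mer count vectors with a two-pointer merge, which is exactly what
# sorting enables. Shogun's integer packing and normalization are omitted.
def sorted_kmers(s, k=3):
    return sorted(s[i:i+k] for i in range(len(s) - k + 1))
def comm_dot(a, b):
    i = j = total = 0
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            kmer, ca, cb = a[i], 0, 0
            while i < len(a) and a[i] == kmer:    # count run in a
                ca += 1; i += 1
            while j < len(b) and b[j] == kmer:    # count run in b
                cb += 1; j += 1
            total += ca * cb
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return total
#print(comm_dot(sorted_kmers("ACGTACGT"), sorted_kmers("ACGTTTTT")))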
#!/usr/bin/env python
from numpy import array
# set the input attribute as not nominal (i.e. continuous)
feattypes = array([False])
parameter_list = [[50,5,15,0.2,feattypes]]
def regression_cartree_modular(num_train=500,num_test=50,x_range=15,noise_var=0.2,ft=feattypes):
try:
from modshogun import RealFeatures, RegressionLabels, CSVFile, CARTree, PT_REGRESSION
from numpy import random
except ImportError:
print("Could not import Shogun and/or numpy modules")
return
random.seed(1)
# form training dataset : y=x with noise
X_train=random.rand(1,num_train)*x_range;
Y_train=X_train+random.randn(num_train)*noise_var
# form test dataset
X_test=array([[float(i)/num_test*x_range for i in range(num_test)]])
# wrap features and labels into Shogun objects
feats_train=RealFeatures(X_train)
feats_test=RealFeatures(X_test)
train_labels=RegressionLabels(Y_train[0])
# CART Tree formation
c=CARTree(ft,PT_REGRESSION,5,True)
c.set_labels(train_labels)
c.train(feats_train)
# Classify test data
output=c.apply_regression(feats_test).get_labels()
return c,output
if __name__=='__main__':
print('CARTree')
regression_cartree_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array, dtype, int32
# set the input attribute as continuous (i.e. type 2)
feattypes = array([2],dtype=int32)
parameter_list = [[500,50,15,0.2,feattypes]]
def regression_chaidtree_modular(num_train=500,num_test=50,x_range=15,noise_var=0.2,ft=feattypes):
try:
from modshogun import RealFeatures, RegressionLabels, CSVFile, CHAIDTree, PT_REGRESSION
from numpy import random
except ImportError:
print("Could not import Shogun and/or numpy modules")
return
random.seed(1)
# form training dataset : y=x with noise
X_train=random.rand(1,num_train)*x_range;
Y_train=X_train+random.randn(num_train)*noise_var
# form test dataset
X_test=array([[float(i)/num_test*x_range for i in range(num_test)]])
# wrap features and labels into Shogun objects
feats_train=RealFeatures(X_train)
feats_test=RealFeatures(X_test)
train_labels=RegressionLabels(Y_train[0])
# CHAID Tree formation
c=CHAIDTree(2,feattypes,50)
c.set_labels(train_labels)
c.train(feats_train)
# Regress on test data
output=c.apply_regression(feats_test).get_labels()
return c,output
if __name__=='__main__':
print('CHAIDTree')
regression_chaidtree_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array, random
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
# set the input attribute as not nominal (i.e. continuous)
feattypes = array([False])
parameter_list = [[500,50,15,0.2,feattypes]]
def regression_randomforest_modular(num_train=500,num_test=50,x_range=15,noise_var=0.2,ft=feattypes):
try:
from modshogun import RealFeatures, RegressionLabels, CSVFile, RandomForest, MeanRule, PT_REGRESSION
except ImportError:
print("Could not import Shogun modules")
return
random.seed(1)
# form training dataset : y=x with noise
X_train=random.rand(1,num_train)*x_range;
Y_train=X_train+random.randn(num_train)*noise_var
# form test dataset
X_test=array([[float(i)/num_test*x_range for i in range(num_test)]])
# wrap features and labels into Shogun objects
feats_train=RealFeatures(X_train)
feats_test=RealFeatures(X_test)
train_labels=RegressionLabels(Y_train[0])
# Random Forest formation
rand_forest=RandomForest(feats_train,train_labels,20,1)
rand_forest.set_feature_types(ft)
rand_forest.set_machine_problem_type(PT_REGRESSION)
rand_forest.set_combination_rule(MeanRule())
rand_forest.train()
# Regress test data
output=rand_forest.apply_regression(feats_test).get_labels()
return rand_forest,output
if __name__=='__main__':
print('RandomForest')
regression_randomforest_modular(*parameter_list[0])
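# A hedged usage sketch: since the toy data above is y=x plus noise, the
# returned predictions can be compared against the noise-free target. The test
# grid is recomputed exactly as inside the example; this assumes the Shogun
# imports succeeded (otherwise the example returns None).
import numpy
def randomforest_mse_sketch(num_test=50, x_range=15):
    rand_forest, output = regression_randomforest_modular()
    truth = numpy.array([float(i)/num_test*x_range for i in range(num_test)])
    return numpy.mean((numpy.asarray(output) - truth)**2)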
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# SVM^light. The SVR is trained with regularization parameter C=1 and a Gaussian
# kernel with width=1.2. The labels of both the train and the test data are
# fetched via svr.apply().get_labels().
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#!/usr/bin/env python
###########################################################################
# svm light based support vector regression
###########################################################################
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,1.2,1,1e-5,1e-2,1],[traindat,testdat,label_traindat,2.3,0.5,1e-5,1e-6,1]]
def regression_svrlight_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat, \
width=1.2,C=1,epsilon=1e-5,tube_epsilon=1e-2,num_threads=3):
from modshogun import RegressionLabels, RealFeatures
from modshogun import GaussianKernel
try:
from modshogun import SVRLight
except ImportError:
print('No support for SVRLight available.')
return
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=RegressionLabels(label_train)
svr=SVRLight(C, epsilon, kernel, labels)
svr.set_tube_epsilon(tube_epsilon)
svr.parallel.set_num_threads(num_threads)
svr.train()
kernel.init(feats_train, feats_test)
out = svr.apply().get_labels()
return out, kernel
if __name__=='__main__':
print('SVRLight')
regression_svrlight_modular(*parameter_list[0])
# In this example serialization of SVM (Support Vector Machine) is shown
#!/usr/bin/env python
parameter_list=[[10,0.3,2, 1.0, 0.1]]
def check_status(status,suffix):
# silent...
assert status, "ERROR reading/writing status:%s/suffix:%s\n" % (status,suffix)
def serialization_complex_example (num=5, dist=1, dim=10, C=2.0, width=10):
import os
from numpy import concatenate, zeros, ones
from numpy.random import randn, seed
from modshogun import RealFeatures, MulticlassLabels
from modshogun import GMNPSVM
from modshogun import GaussianKernel
from modshogun import SerializableHdf5File,SerializableAsciiFile, \
SerializableJsonFile,SerializableXmlFile,MSG_DEBUG
from modshogun import NormOne, LogPlusOne
seed(17)
data=concatenate((randn(dim, num), randn(dim, num) + dist,
randn(dim, num) + 2*dist,
randn(dim, num) + 3*dist), axis=1)
lab=concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))
feats=RealFeatures(data)
#feats.io.set_loglevel(MSG_DEBUG)
#feats.io.enable_file_and_line()
kernel=GaussianKernel(feats, feats, width)
labels=MulticlassLabels(lab)
svm = GMNPSVM(C, kernel, labels)
feats.add_preprocessor(NormOne())
feats.add_preprocessor(LogPlusOne())
feats.set_preprocessed(1)
svm.train(feats)
bias_ref = svm.get_svm(0).get_bias()
#svm.print_serializable()
fstream = SerializableHdf5File("tmp/blaah.h5", "w")
status = svm.save_serializable(fstream)
check_status(status,'h5')
fstream = SerializableAsciiFile("tmp/blaah.asc", "w")
status = svm.save_serializable(fstream)
check_status(status,'asc')
fstream = SerializableJsonFile("tmp/blaah.json", "w")
status = svm.save_serializable(fstream)
check_status(status,'json')
fstream = SerializableXmlFile("tmp/blaah.xml", "w")
status = svm.save_serializable(fstream)
check_status(status,'xml')
fstream = SerializableHdf5File("tmp/blaah.h5", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status,'h5')
new_svm.train()
bias_h5 = new_svm.get_svm(0).get_bias()
fstream = SerializableAsciiFile("tmp/blaah.asc", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status,'asc')
new_svm.train()
bias_asc = new_svm.get_svm(0).get_bias()
fstream = SerializableJsonFile("tmp/blaah.json", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status,'json')
new_svm.train()
bias_json = new_svm.get_svm(0).get_bias()
fstream = SerializableXmlFile("tmp/blaah.xml", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status,'xml')
new_svm.train()
bias_xml = new_svm.get_svm(0).get_bias()
os.unlink("tmp/blaah.h5")
os.unlink("tmp/blaah.asc")
os.unlink("tmp/blaah.json")
os.unlink("tmp/blaah.xml")
return svm,new_svm, bias_ref, bias_h5, bias_asc, bias_json, bias_xml
if __name__=='__main__':
print('Serialization SVMLight')
serialization_complex_example(*parameter_list[0])
# In this example dense toy features are serialized
#!/usr/bin/env python
from modshogun import *
from numpy import array
import os
parameter_list=[[[[1.0,2,3],[4,5,6]]]]
def serialization_matrix_modular (m):
feats=RealFeatures(array(m))
#feats.io.set_loglevel(0)
fstream = SerializableAsciiFile("tmp/foo.asc", "w")
feats.save_serializable(fstream)
l=MulticlassLabels(array([1.0,2,3]))
fstream = SerializableAsciiFile("tmp/foo2.asc", "w")
l.save_serializable(fstream)
os.unlink("tmp/foo.asc")
os.unlink("tmp/foo2.asc")
if __name__=='__main__':
print('Serialization Matrix Modular')
serialization_matrix_modular(*parameter_list[0])
#!/usr/bin/env python
from modshogun import WeightedDegreeStringKernel, WeightedDegreePositionStringKernel, WeightedCommWordStringKernel
from modshogun import LinearKernel, PolyKernel, GaussianKernel, CombinedKernel, WeightedDegreeRBFKernel
from modshogun import StringCharFeatures, StringWordFeatures, RealFeatures, CombinedFeatures, SortWordString
from modshogun import DNA, PROTEIN, Alphabet, Labels, BinaryLabels, CTaxonomy
from modshogun import MSG_DEBUG
try:
from modshogun import SVMLight
except ImportError:
print("SVMLight is not available")
exit(0)
from numpy import concatenate, ones
from numpy.random import randn, seed
import numpy
import sys
import types
import random
import bz2
import pickle
import inspect
###################################################
# Random Data
###################################################
def generate_random_string(length, number):
"""
generate sample over alphabet
"""
dat = []
alphabet = "AGTC"
for i in range(number):
dat.append("".join([random.choice(alphabet) for j in range(length)]))
return dat
def generate_random_data(number):
"""
create random examples and labels
"""
labels = numpy.array([random.choice([-1.0, 1.0]) for i in range(number)])
examples = numpy.array(generate_random_string(22, number))
return examples, labels
def save(filename, myobj):
"""
save object to file using pickle
@param filename: name of destination file
@type filename: str
@param myobj: object to save (has to be pickleable)
@type myobj: obj
"""
try:
f = bz2.BZ2File(filename, 'wb')
except IOError as details:
sys.stderr.write('File ' + filename + ' cannot be written\n')
sys.stderr.write(str(details))
return
pickle.dump(myobj, f, protocol=2)
f.close()
def load(filename):
"""
Load from filename using pickle
@param filename: name of file to load from
@type filename: str
"""
try:
f = bz2.BZ2File(filename, 'rb')
except IOError as details:
sys.stderr.write('File ' + filename + ' cannot be read\n')
sys.stderr.write(str(details))
return
myobj = pickle.load(f)
f.close()
return myobj
def get_spectrum_features(data, order=3, gap=0, reverse=True):
"""
create feature object used by spectrum kernel
"""
charfeat = StringCharFeatures(data, DNA)
feat = StringWordFeatures(charfeat.get_alphabet())
feat.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc = SortWordString()
preproc.init(feat)
feat.add_preprocessor(preproc)
feat.apply_preprocessor()
return feat
def get_wd_features(data, feat_type="dna"):
"""
create feature object for wdk
"""
if feat_type == "dna":
feat = StringCharFeatures(DNA)
elif feat_type == "protein":
feat = StringCharFeatures(PROTEIN)
else:
raise Exception("unknown feature type")
feat.set_features(data)
return feat
def construct_features(features):
"""
build combined features from full strings plus their left and right halves
"""
feat_all = [inst for inst in features]
feat_lhs = [inst[0:15] for inst in features]
feat_rhs = [inst[15:] for inst in features]
feat_wd = get_wd_features(feat_all)
feat_spec_1 = get_spectrum_features(feat_lhs, order=3)
feat_spec_2 = get_spectrum_features(feat_rhs, order=3)
feat_comb = CombinedFeatures()
feat_comb.append_feature_obj(feat_wd)
feat_comb.append_feature_obj(feat_spec_1)
feat_comb.append_feature_obj(feat_spec_2)
return feat_comb
parameter_list = [[200, 1, 100]]
def serialization_string_kernels_modular(n_data, num_shifts, size):
"""
serialize svm with string kernels
"""
##################################################
# set up toy data and svm
train_xt, train_lt = generate_random_data(n_data)
test_xt, test_lt = generate_random_data(n_data)
feats_train = construct_features(train_xt)
feats_test = construct_features(test_xt)
max_len = len(train_xt[0])
kernel_wdk = WeightedDegreePositionStringKernel(size, 5)
shifts_vector = numpy.ones(max_len, dtype=numpy.int32)*num_shifts
kernel_wdk.set_shifts(shifts_vector)
########
# set up spectrum
use_sign = False
kernel_spec_1 = WeightedCommWordStringKernel(size, use_sign)
kernel_spec_2 = WeightedCommWordStringKernel(size, use_sign)
########
# combined kernel
kernel = CombinedKernel()
kernel.append_kernel(kernel_wdk)
kernel.append_kernel(kernel_spec_1)
kernel.append_kernel(kernel_spec_2)
# init kernel
labels = BinaryLabels(train_lt);
svm = SVMLight(1.0, kernel, labels)
#svm.io.set_loglevel(MSG_DEBUG)
svm.train(feats_train)
##################################################
# serialize to file
fn = "serialized_svm.bz2"
#print("serializing SVM to file", fn)
save(fn, svm)
##################################################
# unserialize and sanity check
#print("unserializing SVM")
svm2 = load(fn)
#print("comparing predictions")
out = svm.apply(feats_test).get_labels()
out2 = svm2.apply(feats_test).get_labels()
# assert outputs are close
for i in range(len(out)):
assert abs(out[i] - out2[i]) < 0.000001
#print("all checks passed.")
return out,out2
if __name__=='__main__':
serialization_string_kernels_modular(*parameter_list[0])
# This example shows how to serialize/deserialize an SVMLight object using
# Python's pickle module. Note that this code is in alpha state.
#!/usr/bin/env python
parameter_list=[[10, 1, 2.1, 2.0]]
def serialization_svmlight_modular (num, dist, width, C):
from modshogun import MSG_DEBUG
from modshogun import RealFeatures, BinaryLabels, DNA, Alphabet
from modshogun import WeightedDegreeStringKernel, GaussianKernel
try:
from modshogun import SVMLight
except ImportError:
print("SVMLight not available")
exit(0)
from numpy import concatenate, ones
from numpy.random import randn, seed
import sys
import types
import random
import bz2
import pickle
import inspect
def save(filename, myobj):
"""
save object to file using pickle
@param filename: name of destination file
@type filename: str
@param myobj: object to save (has to be pickleable)
@type myobj: obj
"""
try:
f = bz2.BZ2File(filename, 'wb')
except IOError as details:
sys.stderr.write('File ' + filename + ' cannot be written\n')
sys.stderr.write(str(details))
return
pickle.dump(myobj, f, protocol=2)
f.close()
def load(filename):
"""
Load from filename using pickle
@param filename: name of file to load from
@type filename: str
"""
try:
f = bz2.BZ2File(filename, 'rb')
except IOError as details:
sys.stderr.write('File ' + filename + ' cannot be read\n')
sys.stderr.write(str(details))
return
myobj = pickle.load(f)
f.close()
return myobj
##################################################
# set up toy data and svm
traindata_real = concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
testdata_real = concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1);
trainlab = concatenate((-ones(num), ones(num)));
testlab = concatenate((-ones(num), ones(num)));
feats_train = RealFeatures(traindata_real);
feats_test = RealFeatures(testdata_real);
kernel = GaussianKernel(feats_train, feats_train, width);
#kernel.io.set_loglevel(MSG_DEBUG)
labels = BinaryLabels(trainlab);
svm = SVMLight(C, kernel, labels)
svm.train()
#svm.io.set_loglevel(MSG_DEBUG)
##################################################
# serialize to file
fn = "serialized_svm.bz2"
#print("serializing SVM to file", fn)
save(fn, svm)
##################################################
# unserialize and sanity check
#print("unserializing SVM")
svm2 = load(fn)
#print("comparing objectives")
svm2.train()
#print("objective before serialization:", svm.get_objective())
#print("objective after serialization:", svm2.get_objective())
#print("comparing predictions")
out = svm.apply(feats_test).get_labels()
out2 = svm2.apply(feats_test).get_labels()
# assert outputs are close
for i in range(len(out)):
assert abs(out[i] - out2[i]) < 0.000001
#print("all checks passed.")
return True
if __name__=='__main__':
print('Serialization SVMLight')
serialization_svmlight_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy as np
def gen_data(num_classes,num_samples,dim):
np.random.seed(0)
covs = np.array([[[0., -1. ], [2.5, .7]],
[[3., -1.5], [1.2, .3]],
[[ 2, 0 ], [ .0, 1.5 ]]])
X = np.r_[np.dot(np.random.randn(num_samples, dim), covs[0]) + np.array([0, 10]),
np.dot(np.random.randn(num_samples, dim), covs[1]) + np.array([-10, -10]),
np.dot(np.random.randn(num_samples, dim), covs[2]) + np.array([10, -10])];
Y = np.hstack((np.zeros(num_samples), np.ones(num_samples), 2*np.ones(num_samples)))
return X, Y
# Number of classes
M = 3
# Number of samples of each class
N = 50
# Dimension of the data
dim = 2
traindat, label_traindat = gen_data(M,N,dim)
parameter_list = [[traindat,label_traindat]]
def so_multiclass (fm_train_real=traindat,label_train_multiclass=label_traindat):
try:
from modshogun import RealFeatures
from modshogun import MulticlassModel, MulticlassSOLabels, PrimalMosekSOSVM, RealNumber
except ImportError:
print("Mosek not available")
return
labels = MulticlassSOLabels(label_train_multiclass)
features = RealFeatures(fm_train_real.T)
model = MulticlassModel(features, labels)
sosvm = PrimalMosekSOSVM(model, labels)
sosvm.train()
out = sosvm.apply()
count = 0
for i in range(out.get_num_labels()):
yi_pred = RealNumber.obtain_from_generic(out.get_label(i))
if yi_pred.value == label_train_multiclass[i]:
count = count + 1
print("Correct classification rate: %0.2f" % ( 100.0*count/out.get_num_labels() ))
if __name__=='__main__':
print('SO multiclass')
so_multiclass(*parameter_list[0])
# In this example, HSIC, a kernel-based test for independence, is used to detect
# dependence between a mixture of Gaussians and a rotated version of the same data.
# The HSIC statistic is computed and available methods for computing a threshold
# of the null distribution are used. In addition, p-values of the test are
# computed. Note that these methods require more iterations than used here. A
# Gaussian kernel is selected via the median heuristic.
# See tutorial and Class documentation for more details.
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
import numpy as np
from math import pi
parameter_list = [[150,3,3]]
def statistics_hsic (n, difference, angle):
from modshogun import RealFeatures
from modshogun import DataGenerator
from modshogun import GaussianKernel
from modshogun import HSIC
from modshogun import PERMUTATION, HSIC_GAMMA
from modshogun import EuclideanDistance
from modshogun import Statistics, Math
# for reproducible results (the numpy seed might not be reproducible across
# different OS/Python distributions)
Math.init_random(1)
np.random.seed(1)
# note that HSIC has to store full kernel matrices in memory,
# which upper-bounds the feasible sample size
# use data generator class to produce example data
data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
#plot(data[0], data[1], 'x');show()
# create shogun feature representation
features_x=RealFeatures(np.array([data[0]]))
features_y=RealFeatures(np.array([data[1]]))
# compute the median data distance to use as the Gaussian kernel width
# normally 0.5*median_distance (factor two in the Gaussian kernel)
# However, Shogun's kernel width uses a different parametrization,
# therefore 0.5*2*median_distance^2
# Use only a subset of the data (200 elements); the median is stable
subset=np.random.permutation(features_x.get_num_vectors()).astype(np.int32)
subset=subset[0:200]
features_x.add_subset(subset)
dist=EuclideanDistance(features_x, features_x)
distances=dist.get_distance_matrix()
features_x.remove_subset()
median_distance=np.median(distances)
sigma_x=median_distance**2
features_y.add_subset(subset)
dist=EuclideanDistance(features_y, features_y)
distances=dist.get_distance_matrix()
features_y.remove_subset()
median_distance=np.median(distances)
sigma_y=median_distance**2
#print "median distance for Gaussian kernel on x:", sigma_x
#print "median distance for Gaussian kernel on y:", sigma_y
kernel_x=GaussianKernel(10,sigma_x)
kernel_y=GaussianKernel(10,sigma_y)
hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05 using different methods to approximate
# null-distribution
statistic=hsic.compute_statistic()
#print "HSIC:", statistic
alpha=0.05
#print "computing p-value using sampling null"
hsic.set_null_approximation_method(PERMUTATION)
# normally, at least 250 iterations should be done, but that takes long
hsic.set_num_null_samples(100)
# sampling null allows usage of unbiased or biased statistic
p_value_boot=hsic.compute_p_value(statistic)
thresh_boot=hsic.compute_threshold(alpha)
#print "p_value:", p_value_boot
#print "threshold for 0.05 alpha:", thresh_boot
#print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_boot<alpha
#print "computing p-value using gamma method"
hsic.set_null_approximation_method(HSIC_GAMMA)
p_value_gamma=hsic.compute_p_value(statistic)
thresh_gamma=hsic.compute_threshold(alpha)
#print "p_value:", p_value_gamma
#print "threshold for 0.05 alpha:", thresh_gamma
#print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_gamma<alpha
# sample from null distribution (these may be plotted or whatsoever)
# mean should be close to zero, variance strongly depends on data/kernel
# sampling null, biased statistic
#print "sampling null distribution using sample_null"
hsic.set_null_approximation_method(PERMUTATION)
hsic.set_num_null_samples(100)
null_samples=hsic.sample_null()
#print "null mean:", np.mean(null_samples)
#print "null variance:", np.var(null_samples)
#hist(null_samples, 100); show()
return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
if __name__=='__main__':
print('HSIC')
statistics_hsic(*parameter_list[0])
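# For reference, a numpy sketch of the biased HSIC statistic the test above is
# built on: HSIC_b = (1/n^2) * trace(K H L H) with the centering matrix
# H = I - (1/n) 1 1^T. Shogun's exact scaling conventions are assumptions here.
import numpy as np
def hsic_biased_sketch(K, L):
    n = K.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n       # centering matrix
    return np.trace(K.dot(H).dot(L).dot(H)) / (n * n)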
#!/usr/bin/env python
from numpy import *
from numpy import random
parameter_list = [[10,3]]
def statistics_kmm (n,d):
from modshogun import RealFeatures
from modshogun import DataGenerator
from modshogun import GaussianKernel, MSG_DEBUG
try:
from modshogun import KernelMeanMatching
except ImportError:
print("KernelMeanMatching not available")
exit(0)
from modshogun import Math
# init seed for reproducibility
Math.init_random(1)
random.seed(1);
data = random.randn(d,n)
# create shogun feature representation
features=RealFeatures(data)
# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)
kernel.init(features,features)
kmm = KernelMeanMatching(kernel,array([0,1,2,3,7,8,9],dtype=int32),array([4,5,6],dtype=int32))
w = kmm.compute_weights()
#print w
return w
if __name__=='__main__':
print('KernelMeanMatching')
statistics_kmm(*parameter_list[0])
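# A tiny numpy check of the width convention stated above: GaussianKernel
# (cache, tau) is assumed to compute exp(-||x-y||^2 / tau), so tau=8
# corresponds to sigma=2 in the standard parametrization. A sketch of the
# manual formula on one pair of vectors:
import numpy
def gauss_width_sketch(x, y, tau=8.0):
    return numpy.exp(-numpy.sum((x - y)**2) / tau)
#print(gauss_width_sketch(numpy.zeros(3), numpy.ones(3)))  # exp(-3/8)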
# In this example, the linear time MMD statistic for kernel-based two-sample
# testing is illustrated. It is a streaming-based statistic for large amounts
# of data. The dataset used is a bunch of standard Gaussian vectors where the
# first dimension differs between the two distributions p and q. The test
# statistic is computed and available methods for computing a threshold of
# the null distribution are used. In addition, p-values for the test are
# computed. Note that these methods require more iterations/samples than used
# here. A Gaussian kernel is selected via the median heuristic. There are
# more clever kernel selection methods available.
# See tutorial and Class documentation for more details.
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
from numpy import *
parameter_list = [[1000,2,0.5]]
def statistics_linear_time_mmd (n,dim,difference):
from modshogun import RealFeatures
from modshogun import MeanShiftDataGenerator
from modshogun import GaussianKernel
from modshogun import LinearTimeMMD
from modshogun import PERMUTATION, MMD1_GAUSSIAN
from modshogun import EuclideanDistance
from modshogun import Statistics, Math
# init seed for reproducibility
Math.init_random(1)
# note that the linear time statistic is designed for much larger datasets,
# so increase n to get reasonable results
# streaming data generator for mean shift distributions
gen_p=MeanShiftDataGenerator(0, dim)
gen_q=MeanShiftDataGenerator(difference, dim)
# compute median data distance in order to use for Gaussian kernel width
# normally 0.5*median_distance (the factor two in the Gaussian kernel)
# However, shogun's kernel width differs from the usual parametrization,
# therefore use 0.5*2*median_distance^2
# Use a subset of the data for that, only 200 elements; the median is stable
# Stream examples and merge them in order to compute median on joint sample
features=gen_p.get_streamed_features(100)
features=features.create_merged_copy(gen_q.get_streamed_features(100))
# compute all pairwise distances
dist=EuclideanDistance(features, features)
distances=dist.get_distance_matrix()
# compute median and determine kernel width
median_distance=median(distances)
sigma=median_distance**2
#print "median distance for Gaussian kernel:", sigma
kernel=GaussianKernel(10,sigma)
# mmd instance using streaming features, blocksize of 10000
mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)
# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05
statistic=mmd.compute_statistic()
#print "test statistic:", statistic
# do the same thing using two different ways to approximate the null distribution
# sampling the null and the Gaussian approximation (only for really large samples)
alpha=0.05
#print "computing p-value using sampling null"
mmd.set_null_approximation_method(PERMUTATION)
mmd.set_num_null_samples(50) # normally, far more iterations are needed
p_value_boot=mmd.compute_p_value(statistic)
#print "p_value_boot:", p_value_boot
#print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha
#print "computing p-value using gaussian approximation"
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
p_value_gaussian=mmd.compute_p_value(statistic)
#print "p_value_gaussian:", p_value_gaussian
#print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha
# sample from null distribution (these may be plotted, for example)
# mean should be close to zero; variance strongly depends on data/kernel
mmd.set_null_approximation_method(PERMUTATION)
mmd.set_num_null_samples(10) # normally, far more iterations are needed
null_samples=mmd.sample_null()
#print "null mean:", mean(null_samples)
#print "null variance:", var(null_samples)
# compute type I and type II errors for Gaussian approximation
# number of trials should be larger to compute tight confidence bounds
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
num_trials=5;
alpha=0.05 # test level
typeIerrors=[0 for x in range(num_trials)]
typeIIerrors=[0 for x in range(num_trials)]
for i in range(num_trials):
# this effectively means that p=q - rejecting is a type I error
mmd.set_simulate_h0(True)
typeIerrors[i]=mmd.perform_test()>alpha
mmd.set_simulate_h0(False)
typeIIerrors[i]=mmd.perform_test()>alpha
#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)
return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
if __name__=='__main__':
print('LinearTimeMMD')
statistics_linear_time_mmd(*parameter_list[0])
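# The linear time MMD^2 estimator used above can be sketched in a few lines
# of numpy (an illustrative addition, not shogun API): average the kernel
# h-statistic h((x,x'),(y,y')) = k(x,x') + k(y,y') - k(x,y') - k(x',y) over
# disjoint consecutive pairs, which is what makes it streamable.
import numpy as np

def k_gauss(a, b, tau):
    return np.exp(-np.sum((a - b)**2, axis=0)/tau)

def linear_time_mmd2(X, Y, tau):
    # X, Y: dim x m data matrices; truncate to an even number of columns
    m = min(X.shape[1], Y.shape[1])//2*2
    x1, x2 = X[:, 0:m:2], X[:, 1:m:2]
    y1, y2 = Y[:, 0:m:2], Y[:, 1:m:2]
    h = k_gauss(x1, x2, tau) + k_gauss(y1, y2, tau) \
        - k_gauss(x1, y2, tau) - k_gauss(x2, y1, tau)
    return h.mean()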
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
from numpy import *
#from pylab import *
parameter_list = [[1000,10,5,3,pi/4, "opt"], [1000,10,5,3,pi/4, "l2"]]
def statistics_mmd_kernel_selection_combined(m,distance,stretch,num_blobs,angle,selection_method):
from modshogun import RealFeatures
from modshogun import GaussianBlobsDataGenerator
from modshogun import GaussianKernel, CombinedKernel
from modshogun import LinearTimeMMD
try:
from modshogun import MMDKernelSelectionCombMaxL2
except ImportError:
print("MMDKernelSelectionCombMaxL2 not available")
exit(0)
try:
from modshogun import MMDKernelSelectionCombOpt
except ImportError:
print("MMDKernelSelectionCombOpt not available")
exit(0)
from modshogun import PERMUTATION, MMD1_GAUSSIAN
from modshogun import EuclideanDistance
from modshogun import Statistics, Math
# init seed for reproducibility
Math.init_random(1)
# note that the linear time statistic is designed for much larger datasets
# results for this low number will be bad (unstable, type I error wrong)
# streaming data generator
gen_p=GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
gen_q=GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)
# stream some data and plot
num_plot=1000
features=gen_p.get_streamed_features(num_plot)
features=features.create_merged_copy(gen_q.get_streamed_features(num_plot))
data=features.get_feature_matrix()
#figure()
#subplot(2,2,1)
#grid(True)
#plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
#title('$X\sim p$')
#subplot(2,2,2)
#grid(True)
#plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
#title('$Y\sim q$')
# create combined kernel with Gaussian kernels inside (shogun's Gaussian
# kernel differs from the standard form, see documentation)
sigmas=[2**x for x in range(-3,10)]
widths=[x*x*2 for x in sigmas]
combined=CombinedKernel()
for i in range(len(sigmas)):
combined.append_kernel(GaussianKernel(10, widths[i]))
# mmd instance using streaming features, blocksize of 10000
block_size=10000
mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size)
# kernel selection instance (this can easily be replaced by the other
# methods for selecting combined kernels)
if selection_method=="opt":
selection=MMDKernelSelectionCombOpt(mmd)
elif selection_method=="l2":
selection=MMDKernelSelectionCombMaxL2(mmd)
# perform kernel selection (kernel is automatically set)
kernel=selection.select_kernel()
kernel=CombinedKernel.obtain_from_generic(kernel)
#print "selected kernel weights:", kernel.get_subkernel_weights()
#subplot(2,2,3)
#plot(kernel.get_subkernel_weights())
#title("Kernel weights")
# compute type I and II error (use many more trials). Type I error is only
# estimated to check the MMD1_GAUSSIAN method for estimating the null
# distribution. Note that testing has to happen on different data than
# kernel selection, but the linear time mmd does this implicitly
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
# number of trials should be larger to compute tight confidence bounds
num_trials=5;
alpha=0.05 # test level
typeIerrors=[0 for x in range(num_trials)]
typeIIerrors=[0 for x in range(num_trials)]
for i in range(num_trials):
# this effectively means that p=q - rejecting is a type I error
mmd.set_simulate_h0(True)
typeIerrors[i]=mmd.perform_test()>alpha
mmd.set_simulate_h0(False)
typeIIerrors[i]=mmd.perform_test()>alpha
#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)
return kernel,typeIerrors,typeIIerrors
if __name__=='__main__':
print('MMDKernelSelectionCombined')
statistics_mmd_kernel_selection_combined(*parameter_list[0])
#show()
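# MMDKernelSelectionCombOpt/CombMaxL2 choose convex-combination weights for
# the subkernels via constrained optimization. As a heavily simplified
# stand-in (an assumption for illustration, not shogun's exact objective),
# one can weight subkernels by the positive part of their individual MMD
# estimates, normalized to unit L2 norm:
import numpy as np

def combination_weights(mmd_estimates):
    eta = np.maximum(np.asarray(mmd_estimates, dtype=float), 0.0)
    norm = np.linalg.norm(eta)
    return eta/norm if norm > 0 else np.ones_like(eta)/len(eta)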
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
from numpy import *
#from pylab import *
parameter_list = [[1000,10,5,3,pi/4, "opt"], [1000,10,5,3,pi/4, "max"], [1000,10,5,3,pi/4, "median"]]
def statistics_mmd_kernel_selection_single(m,distance,stretch,num_blobs,angle,selection_method):
from modshogun import RealFeatures
from modshogun import GaussianBlobsDataGenerator
from modshogun import GaussianKernel, CombinedKernel
from modshogun import LinearTimeMMD
from modshogun import MMDKernelSelectionMedian
from modshogun import MMDKernelSelectionMax
from modshogun import MMDKernelSelectionOpt
from modshogun import PERMUTATION, MMD1_GAUSSIAN
from modshogun import EuclideanDistance
from modshogun import Statistics, Math
# init seed for reproducibility
Math.init_random(1)
# note that the linear time statistic is designed for much larger datasets
# results for this low number will be bad (unstable, type I error wrong)
# streaming data generator
gen_p=GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
gen_q=GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)
# stream some data and plot
num_plot=1000
features=gen_p.get_streamed_features(num_plot)
features=features.create_merged_copy(gen_q.get_streamed_features(num_plot))
data=features.get_feature_matrix()
#figure()
#subplot(2,2,1)
#grid(True)
#plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
#title('$X\sim p$')
#subplot(2,2,2)
#grid(True)
#plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
#title('$Y\sim q$')
# create combined kernel with Gaussian kernels inside (shogun's Gaussian
# kernel differs from the standard form, see documentation)
sigmas=[2**x for x in range(-3,10)]
widths=[x*x*2 for x in sigmas]
combined=CombinedKernel()
for i in range(len(sigmas)):
combined.append_kernel(GaussianKernel(10, widths[i]))
# mmd instance using streaming features, blocksize of 1000
block_size=1000
mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size)
# kernel selection instance (this can easily be replaced by the other
# methods for selecting single kernels)
if selection_method=="opt":
selection=MMDKernelSelectionOpt(mmd)
elif selection_method=="max":
selection=MMDKernelSelectionMax(mmd)
elif selection_method=="median":
selection=MMDKernelSelectionMedian(mmd)
# print measures (just for information)
# for Opt: ratios of MMD and standard deviation
# for Max: MMDs for each kernel
# not available for the median method
if selection_method!="median":
ratios=selection.compute_measures()
#print "Measures:", ratios
#subplot(2,2,3)
#plot(ratios)
#title('Measures')
# perform kernel selection
kernel=selection.select_kernel()
kernel=GaussianKernel.obtain_from_generic(kernel)
#print "selected kernel width:", kernel.get_width()
# compute type I and II error (use many more trials). Type I error is only
# estimated to check the MMD1_GAUSSIAN method for estimating the null
# distribution. Note that testing has to happen on different data than
# kernel selection, but the linear time mmd does this implicitly
mmd.set_kernel(kernel)
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
# number of trials should be larger to compute tight confidence bounds
num_trials=5;
alpha=0.05 # test level
typeIerrors=[0 for x in range(num_trials)]
typeIIerrors=[0 for x in range(num_trials)]
for i in range(num_trials):
# this effectively means that p=q - rejecting is a type I error
mmd.set_simulate_h0(True)
typeIerrors[i]=mmd.perform_test()>alpha
mmd.set_simulate_h0(False)
typeIIerrors[i]=mmd.perform_test()>alpha
#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)
return kernel,typeIerrors,typeIIerrors
if __name__=='__main__':
print('MMDKernelSelection')
statistics_mmd_kernel_selection_single(*parameter_list[0])
#show()
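# The "max" selection strategy above has a one-line numpy analogue
# (illustrative only): estimate the MMD for every candidate kernel width and
# keep the maximizer. Any estimator with the signature below can be plugged
# in, e.g. a linear-time MMD estimate.
import numpy as np

def select_width_max(X, Y, taus, mmd_estimator):
    # mmd_estimator(X, Y, tau) -> scalar MMD estimate
    stats = [mmd_estimator(X, Y, tau) for tau in taus]
    return taus[int(np.argmax(stats))]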
# In this example, the quadratic time MMD statistic for kernel-based two-sample
# testing is illustrated. It is a statistic for smaller amounts of data where
# one is interested in computing the best possible test. The used dataset is a
# bunch of standard Gaussian vectors where the first dimension differs in both
# distributions p and q. The test statistic is computed and available methods
# for computing a threshold of the null distribution are used. In addition,
# p-values for the test are computed. Note that these methods require more
# iterations/samples than used here. A Gaussian kernel with a fixed size is
# used. There are more clever kernel selection methods available.
# See tutorial and Class documentation for more details.
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
import numpy as np
parameter_list = [[30,2,0.5]]
def statistics_quadratic_time_mmd (m,dim,difference):
from modshogun import RealFeatures
from modshogun import MeanShiftDataGenerator
from modshogun import GaussianKernel, CustomKernel
from modshogun import QuadraticTimeMMD
from modshogun import PERMUTATION, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, BIASED_DEPRECATED
from modshogun import Statistics, IntVector, RealVector, Math
# for reproducible results (the numpy one might not be reproducible across
# different OS/Python distributions)
Math.init_random(1)
np.random.seed(1)
# number of examples kept low in order to make things fast
# streaming data generator for mean shift distributions
gen_p=MeanShiftDataGenerator(0, dim);
#gen_p.parallel.set_num_threads(1)
gen_q=MeanShiftDataGenerator(difference, dim);
# stream some data from generator
feat_p=gen_p.get_streamed_features(m);
feat_q=gen_q.get_streamed_features(m);
# set kernel a priori. Usually one would do some kernel selection; see
# other examples for this.
width=10;
kernel=GaussianKernel(10, width);
# create quadratic time mmd instance. Note that this constructor
# copies p and q and does not reference them
mmd=QuadraticTimeMMD(kernel, feat_p, feat_q);
# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05
alpha=0.05;
# using permutation (slow and not the most reliable way; consider pre-
# computing the kernel when using it, see below).
# Also, in practice, use at least 250 iterations
mmd.set_null_approximation_method(PERMUTATION);
mmd.set_num_null_samples(3);
p_value_null=mmd.perform_test();
# reject if p-value is smaller than test level
#print "bootstrap: p!=q: ", p_value_null<alpha
# using spectrum method. Use at least 250 samples from null.
# This is consistent but sometimes breaks, always monitor type I error.
# See tutorial for the number of eigenvalues to use.
mmd.set_statistic_type(BIASED);
mmd.set_null_approximation_method(MMD2_SPECTRUM);
mmd.set_num_eigenvalues_spectrum(3);
mmd.set_num_samples_spectrum(250);
p_value_spectrum=mmd.perform_test();
# reject if p-value is smaller than test level
#print "spectrum: p!=q: ", p_value_spectrum<alpha
# using gamma method. This is a quick hack, which works most of the time
# but is NOT guaranteed to. See tutorial for details.
# Only works with BIASED_DEPRECATED statistic
mmd.set_statistic_type(BIASED_DEPRECATED);
mmd.set_null_approximation_method(MMD2_GAMMA);
p_value_gamma=mmd.perform_test();
# reject if p-value is smaller than test level
#print "gamma: p!=q: ", p_value_gamma<alpha
# compute type I and II error (use many more trials in practice).
# Type I error is not necessary if one uses permutation. We do it here
# anyway, but note that this is an efficient way of computing it.
# Also note that testing has to happen on
# different data than kernel selection, but the linear time mmd does this
# implicitly and we used a fixed kernel here.
mmd.set_statistic_type(BIASED);
mmd.set_null_approximation_method(PERMUTATION);
mmd.set_num_null_samples(5);
num_trials=5;
type_I_errors=np.zeros(num_trials)
type_II_errors=np.zeros(num_trials)
inds=np.array([x for x in range(2*m)], dtype=np.int32)
p_and_q=mmd.get_p_and_q();
# use a precomputed kernel to be faster
kernel.init(p_and_q, p_and_q);
precomputed=CustomKernel(kernel);
mmd.set_kernel(precomputed);
for i in range(num_trials):
# this effectively means that p=q - rejecting is a type I error
inds=np.random.permutation(inds) # numpy permutation
precomputed.add_row_subset(inds);
precomputed.add_col_subset(inds);
type_I_errors[i]=mmd.perform_test()>alpha;
precomputed.remove_row_subset();
precomputed.remove_col_subset();
# on normal data, this gives type II error
type_II_errors[i]=mmd.perform_test()>alpha;
return type_I_errors,type_II_errors,p_value_null,p_value_spectrum,p_value_gamma
if __name__=='__main__':
print('QuadraticTimeMMD')
statistics_quadratic_time_mmd(*parameter_list[0])
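# A minimal numpy sketch (illustrative addition) of the biased quadratic-time
# MMD^2 statistic and the permutation null approximation used above: permute
# the joint sample, recompute the statistic, and report the fraction of null
# statistics at least as large as the observed one.
import numpy as np

def mmd2_biased(K, m):
    # K: kernel matrix of the stacked sample [p q], first m columns from p
    Kxx, Kyy, Kxy = K[:m, :m], K[m:, m:], K[:m, m:]
    return Kxx.mean() + Kyy.mean() - 2*Kxy.mean()

def permutation_p_value(K, m, num_samples=250):
    stat = mmd2_biased(K, m)
    null = np.zeros(num_samples)
    for i in range(num_samples):
        perm = np.random.permutation(K.shape[0])
        null[i] = mmd2_biased(K[np.ix_(perm, perm)], m)
    return np.mean(null >= stat)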
#!/usr/bin/env python
import numpy as np
traindat = '../../../data/uci/housing/fm_housing.dat'
label_traindat = '../../../data/uci/housing/housing_label.dat'
# mark each input attribute as nominal (True) or continuous (False)
feat_types=np.array([False,False,False,True,False,False,False,False,False,False,False,False,False])
parameter_list = [[traindat,label_traindat,feat_types]]
def stochasticgbmachine_modular(train=traindat,train_labels=label_traindat,ft=feat_types):
try:
from modshogun import RealFeatures, RegressionLabels, CSVFile, CARTree, StochasticGBMachine, SquaredLoss
except ImportError:
print("Could not import Shogun modules")
return
# wrap features and labels into Shogun objects
feats=RealFeatures(CSVFile(train))
labels=RegressionLabels(CSVFile(train_labels))
# divide into training (90%) and test dataset (10%)
p=np.random.permutation(labels.get_num_labels())
num=int(labels.get_num_labels()*0.9)
cart=CARTree()
cart.set_feature_types(ft)
cart.set_max_depth(1)
loss=SquaredLoss()
s=StochasticGBMachine(cart,loss,500,0.01,0.6)
# train
feats.add_subset(np.int32(p[0:num]))
labels.add_subset(np.int32(p[0:num]))
s.set_labels(labels)
s.train(feats)
feats.remove_subset()
labels.remove_subset()
# apply
feats.add_subset(np.int32(p[num:len(p)]))
labels.add_subset(np.int32(p[num:len(p)]))
output=s.apply_regression(feats)
feats.remove_subset()
labels.remove_subset()
return s,output
if __name__=='__main__':
print('StochasticGBMachine')
stochasticgbmachine_modular(*parameter_list[0])
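# StochasticGBMachine above fits 500 depth-1 CART trees on squared loss with
# learning rate 0.01 and a 0.6 subsample fraction. A toy numpy sketch of that
# boosting loop on a single feature (an illustrative assumption, not shogun
# code): each round fits a stump to the current residuals on a random subset.
import numpy as np

def fit_stump(x, r):
    # best single-feature threshold split minimizing squared error
    best = (np.inf, x.min(), r.mean(), r.mean())
    for t in np.unique(x)[:-1]:
        left, right = r[x <= t], r[x > t]
        sse = ((left - left.mean())**2).sum() + ((right - right.mean())**2).sum()
        if sse < best[0]:
            best = (sse, t, left.mean(), right.mean())
    return best[1:]

def gbm_fit(x, y, num_iter=500, rate=0.01, subset_frac=0.6):
    f0 = y.mean()
    f = np.full(len(y), f0)
    stumps = []
    for _ in range(num_iter):
        sub = np.random.rand(len(y)) < subset_frac  # stochastic subsample
        if sub.sum() < 2:
            continue
        t, cl, cr = fit_stump(x[sub], y[sub] - f[sub])  # fit the residuals
        f += rate*np.where(x <= t, cl, cr)
        stumps.append((t, cl, cr))
    return f0, stumps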
#!/usr/bin/env python
from modshogun import StreamingVwFile
from modshogun import StreamingVwCacheFile
from modshogun import T_SVMLIGHT
from modshogun import StreamingVwFeatures
from modshogun import VowpalWabbit
parameter_list=[['../data/fm_train_sparsereal.dat']]
def streaming_vw_createcache_modular (fname):
# First create a binary cache from an ASCII data file,
# then train using the StreamingVwCacheFile as input
# Open the input file as a StreamingVwFile
input_file = StreamingVwFile(fname)
# Default file name will be vw_cache.dat.cache
input_file.set_write_to_cache(True)
# Tell VW that the file is in SVMLight format
# Supported types are T_DENSE, T_SVMLIGHT and T_VW
input_file.set_parser_type(T_SVMLIGHT)
## Create a StreamingVwFeatures object, `True' indicating the examples are labelled
#features = StreamingVwFeatures(input_file, True, 1024)
## Create a VW object from the features
#vw = VowpalWabbit(features)
#vw.set_no_training(True)
## Train (in this case does nothing but run over all examples)
#vw.train()
##Finally Train using the generated cache file
## Open the input cache file as a StreamingVwCacheFile
#input_file = StreamingVwCacheFile("vw_cache.dat.cache");
## The rest is exactly as for normal input
#features = StreamingVwFeatures(input_file, True, 1024);
#vw = VowpalWabbit(features)
#vw.train()
##return vw
if __name__ == "__main__":
streaming_vw_createcache_modular(*parameter_list[0])
#!/usr/bin/env python
from modshogun import StreamingVwFile
from modshogun import T_SVMLIGHT
from modshogun import StreamingVwFeatures
from modshogun import VowpalWabbit
parameter_list=[[None]]
def streaming_vw_modular (dummy):
"""Runs the VW algorithm on a toy dataset in SVMLight format."""
# Open the input file as a StreamingVwFile
input_file = StreamingVwFile("../data/fm_train_sparsereal.dat")
# Tell VW that the file is in SVMLight format
# Supported types are T_DENSE, T_SVMLIGHT and T_VW
input_file.set_parser_type(T_SVMLIGHT)
## Create a StreamingVwFeatures object, `True' indicating the examples are labelled
#features = StreamingVwFeatures(input_file, True, 1024)
## Create a VW object from the features
#vw = VowpalWabbit(features)
## Train
#vw.train()
##return vw
if __name__ == "__main__":
streaming_vw_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
import scipy
from scipy import io
data_dict = scipy.io.loadmat('../data/hmsvm_data_large_integer.mat', struct_as_record=False)
parameter_list=[[data_dict]]
def structure_discrete_hmsvm_bmrm (m_data_dict=data_dict):
from modshogun import RealMatrixFeatures, SequenceLabels, HMSVMModel, Sequence, TwoStateModel
from modshogun import StructuredAccuracy, SMT_TWO_STATE
try:
from modshogun import DualLibQPBMSOSVM
except ImportError:
print("DualLibQPBMSOSVM not available")
exit(0)
labels_array = m_data_dict['label'][0]
idxs = numpy.nonzero(labels_array == -1)
labels_array[idxs] = 0
labels = SequenceLabels(labels_array, 250, 500, 2)
features = RealMatrixFeatures(m_data_dict['signal'].astype(float), 250, 500)
num_obs = 4 # given by the data file used
model = HMSVMModel(features, labels, SMT_TWO_STATE, num_obs)
sosvm = DualLibQPBMSOSVM(model, labels, 5000.0)
sosvm.train()
#print sosvm.get_w()
predicted = sosvm.apply(features)
evaluator = StructuredAccuracy()
acc = evaluator.evaluate(predicted, labels)
#print('Accuracy = %.4f' % acc)
if __name__ == '__main__':
print("Discrete HMSVM BMRM")
structure_discrete_hmsvm_bmrm(*parameter_list[0])
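# The HMSVM model above scores label sequences with a two-state Markov model;
# prediction amounts to Viterbi decoding. A generic numpy Viterbi sketch
# over assumed per-position state scores and transition scores (an
# illustrative addition, not the shogun implementation):
import numpy as np

def viterbi(emission, transition):
    # emission: T x S matrix of per-position state scores
    # transition: S x S matrix of transition scores
    T, S = emission.shape
    score = np.zeros((T, S))
    back = np.zeros((T, S), dtype=int)
    score[0] = emission[0]
    for t in range(1, T):
        cand = score[t-1][:, None] + transition  # cand[i, j]: from state i to j
        back[t] = cand.argmax(axis=0)
        score[t] = cand.max(axis=0) + emission[t]
    path = [int(score[-1].argmax())]
    for t in range(T - 1, 0, -1):
        path.append(int(back[t][path[-1]]))
    return list(reversed(path))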
#!/usr/bin/env python
import numpy
import scipy
from scipy import io
data_dict = scipy.io.loadmat('../data/hmsvm_data_large_integer.mat', struct_as_record=False)
parameter_list=[[data_dict]]
def structure_discrete_hmsvm_mosek (m_data_dict=data_dict):
from modshogun import RealMatrixFeatures, SequenceLabels, HMSVMModel, Sequence, TwoStateModel
from modshogun import StructuredAccuracy, SMT_TWO_STATE
try:
from modshogun import PrimalMosekSOSVM
except ImportError:
print("Mosek not available")
return
labels_array = m_data_dict['label'][0]
idxs = numpy.nonzero(labels_array == -1)
labels_array[idxs] = 0
labels = SequenceLabels(labels_array, 250, 500, 2)
features = RealMatrixFeatures(m_data_dict['signal'].astype(float), 250, 500)
num_obs = 4 # given by the data file used
model = HMSVMModel(features, labels, SMT_TWO_STATE, num_obs)
sosvm = PrimalMosekSOSVM(model, labels)
sosvm.train()
#print(sosvm.get_w())
predicted = sosvm.apply()
evaluator = StructuredAccuracy()
acc = evaluator.evaluate(predicted, labels)
#print('Accuracy = %.4f' % acc)
if __name__ == '__main__':
print("Discrete HMSVM Mosek")
structure_discrete_hmsvm_mosek(*parameter_list[0])
# In this example we use the dynamic programming implementation with a
# gene-finding-specific model. The model and the training parameters
# are stored in a file and are used to create a gene prediction on
# an example sequence.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
parameter_list=[['../data/DynProg_example_py.pickle.gz']]
from modshogun import *
import numpy
from numpy import array,Inf,float64,matrix,frompyfunc,zeros
#from IPython.Shell import IPShellEmbed
#ipshell = IPShellEmbed()
import gzip
import scipy
from scipy.io import loadmat
import pickle
try:
from StringIO import StringIO
except ImportError:
from io import BytesIO as StringIO
def get_ver(ver_str):
scipy_ver=[int(i) for i in ver_str.split('.')]
v=0
for i in range(len(scipy_ver)):
v+=10**(len(scipy_ver)-i)*scipy_ver[i]
return v
if get_ver(scipy.__version__) >= get_ver('0.7.0'):
renametable = {
'scipy.io.mio5': 'scipy.io.matlab.mio5',
'scipy.sparse.sparse' : 'scipy.sparse',
}
else:
renametable = {}
def mapname(name):
if name in renametable:
return renametable[name]
return name
# scipy compatibility class
class mat_struct(object):
pass
def mapped_load_global(self):
module = mapname(self.readline()[:-1])
name = mapname(self.readline()[:-1])
if name=='mat_struct':
klass=mat_struct
else:
klass = self.find_class(module, name)
self.append(klass)
def loads(str):
file = StringIO(str)
unpickler = pickle.Unpickler(file)
unpickler.dispatch[pickle.GLOBAL] = mapped_load_global
return unpickler.load()
def structure_dynprog_modular (fname):
import sys
# pickled data is not compatible between Python 2 and 3
if sys.version_info[0]>2:
return
data_dict = loads(gzip.GzipFile(fname).read())
#data_dict = loadmat('../data/DynProg_example_py.dat.mat', appendmat=False, struct_as_record=False)
#print(data_dict)
#print(len(data_dict['penalty_array'][0][0][0][0].limits[0]))
num_plifs,num_limits = len(data_dict['penalty_array']),len(data_dict['penalty_array'][0].limits)
pm = PlifMatrix()
pm.create_plifs(num_plifs,num_limits)
ids = numpy.array(list(range(num_plifs)),dtype=numpy.int32)
min_values = numpy.array(list(range(num_plifs)),dtype=numpy.float64)
max_values = numpy.array(list(range(num_plifs)),dtype=numpy.float64)
all_use_cache = numpy.array(list(range(num_plifs)),dtype=numpy.bool)
all_use_svm = numpy.array(list(range(num_plifs)),dtype=numpy.int32)
all_limits = zeros((num_plifs,num_limits))
all_penalties = zeros((num_plifs,num_limits))
all_names = ['']*num_plifs
all_transforms = ['']*num_plifs
for plif_idx in range(num_plifs):
ids[plif_idx] = data_dict['penalty_array'][plif_idx].id-1
min_values[plif_idx] = data_dict['penalty_array'][plif_idx].min_value
max_values[plif_idx] = data_dict['penalty_array'][plif_idx].max_value
all_use_cache[plif_idx] = data_dict['penalty_array'][plif_idx].use_cache
all_use_svm[plif_idx] = data_dict['penalty_array'][plif_idx].use_svm
all_limits[plif_idx] = data_dict['penalty_array'][plif_idx].limits
all_penalties[plif_idx] = data_dict['penalty_array'][plif_idx].penalties
all_names[plif_idx] = str(data_dict['penalty_array'][plif_idx].name)
all_transforms[plif_idx] = str(data_dict['penalty_array'][plif_idx].transform)
if all_transforms[plif_idx] == '[]':
all_transforms[plif_idx] = 'linear'
pm.set_plif_ids(ids)
pm.set_plif_min_values(min_values)
pm.set_plif_max_values(max_values)
pm.set_plif_use_cache(all_use_cache)
pm.set_plif_use_svm(all_use_svm)
pm.set_plif_limits(all_limits)
pm.set_plif_penalties(all_penalties)
#pm.set_plif_names(all_names)
#pm.set_plif_transform_type(all_transforms)
transition_ptrs = data_dict['model'].transition_pointers
transition_ptrs = transition_ptrs[:,:,0:2]
transition_ptrs = transition_ptrs.astype(numpy.float64)
pm.compute_plif_matrix(transition_ptrs)
# init_dyn_prog
num_svms = 8
dyn = DynProg(num_svms)
orf_info = data_dict['model'].orf_info
orf_info = orf_info.astype(numpy.int32)
num_states = orf_info.shape[0]
dyn.set_num_states(num_states)
block = data_dict['block']
seq_len = len(block.seq)
seq = str(block.seq)
gene_string = array([elem for elem in seq])
# precompute_content_svms
pos = block.all_pos-1
pos = pos.astype(numpy.int32)
snd_pos = pos
dyn.set_pos(pos)
dyn.set_gene_string(gene_string)
dyn.create_word_string()
dyn.precompute_stop_codons()
dyn.init_content_svm_value_array(num_svms)
dict_weights = data_dict['content_weights']
dict_weights = dict_weights.reshape(8,1).astype(numpy.float64)
dict_weights = zeros((8,5440))
dyn.set_dict_weights(dict_weights.T)
dyn.precompute_content_values()
dyn.init_mod_words_array(data_dict['model'].mod_words.astype(numpy.int32))
pm.compute_signal_plifs(data_dict['state_signals'].astype(numpy.int32))
dyn.set_orf_info(orf_info)
#
p = data_dict['model'].p
q = data_dict['model'].q
dyn.set_p_vector(p)
dyn.set_q_vector(q)
a_trans = data_dict['a_trans']
a_trans = a_trans.astype(float64)
dyn.set_a_trans_matrix(a_trans)
dyn.check_svm_arrays()
features = data_dict['block'].features
dyn.set_observation_matrix(features)
dyn.set_content_type_array(data_dict['seg_path'].astype(numpy.float64))
dyn.best_path_set_segment_loss(data_dict['loss'].astype(numpy.float64))
use_orf = True
feat_dims = [25,201,2]
dyn.set_plif_matrices(pm);
#dyn.compute_nbest_paths(features.shape[2], use_orf, 1,True,False)
## fetch results
#states = dyn.get_states()
##print(states)
#scores = dyn.get_scores()
##print(scores)
#positions = dyn.get_positions()
##print(positions)
#return states, scores, positions
if __name__ == '__main__':
print("Structure")
structure_dynprog_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy as np
from modshogun import TableFactorType
# create the factor type with GT parameters
tid = 0
cards = np.array([2,2], np.int32)
w_gt = np.array([0.3,0.5,1.0,0.2,0.05,0.6,-0.2,0.75])
fac_type = TableFactorType(tid, cards, w_gt)
tid_u = 1
cards_u = np.array([2], np.int32)
w_gt_u = np.array([0.5,0.8,1.0,-0.3])
fac_type_u = TableFactorType(tid_u, cards_u, w_gt_u)
tid_b = 2
cards_b = np.array([2], np.int32)
w_gt_b = np.array([0.8, -0.8])
fac_type_b = TableFactorType(tid_b, cards_b, w_gt_b)
def gen_data(ftype, num_samples, show_data = False):
from modshogun import Math
from modshogun import FactorType, Factor, TableFactorType, FactorGraph
from modshogun import FactorGraphObservation, FactorGraphLabels, FactorGraphFeatures
from modshogun import MAPInference, TREE_MAX_PROD
Math.init_random(17)
samples = FactorGraphFeatures(num_samples)
labels = FactorGraphLabels(num_samples)
for i in range(num_samples):
vc = np.array([2,2,2], np.int32)
fg = FactorGraph(vc)
data1 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)])
vind1 = np.array([0,1], np.int32)
fac1 = Factor(ftype[0], vind1, data1)
fg.add_factor(fac1)
data2 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)])
vind2 = np.array([1,2], np.int32)
fac2 = Factor(ftype[0], vind2, data2)
fg.add_factor(fac2)
data3 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)])
vind3 = np.array([0], np.int32)
fac3 = Factor(ftype[1], vind3, data3)
fg.add_factor(fac3)
data4 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)])
vind4 = np.array([1], np.int32)
fac4 = Factor(ftype[1], vind4, data4)
fg.add_factor(fac4)
data5 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)])
vind5 = np.array([2], np.int32)
fac5 = Factor(ftype[1], vind5, data5)
fg.add_factor(fac5)
data6 = np.array([1.0])
vind6 = np.array([0], np.int32)
fac6 = Factor(ftype[2], vind6, data6)
fg.add_factor(fac6)
data7 = np.array([1.0])
vind7 = np.array([2], np.int32)
fac7 = Factor(ftype[2], vind7, data7)
fg.add_factor(fac7)
samples.add_sample(fg)
fg.connect_components()
fg.compute_energies()
infer_met = MAPInference(fg, TREE_MAX_PROD)
infer_met.inference()
fg_obs = infer_met.get_structured_outputs()
labels.add_label(fg_obs)
if show_data:
state = fg_obs.get_data()
print(state)
return samples, labels
w_all = [w_gt,w_gt_u,w_gt_b]
ftype_all = [fac_type,fac_type_u,fac_type_b]
num_samples = 10
samples, labels = gen_data(ftype_all, num_samples)
parameter_list = [[samples,labels,w_all,ftype_all]]
def structure_factor_graph_model(tr_samples = samples, tr_labels = labels, w = w_all, ftype = ftype_all):
from modshogun import SOSVMHelper, LabelsFactory
from modshogun import FactorGraphModel, MAPInference, TREE_MAX_PROD
from modshogun import StochasticSOSVM, FWSOSVM
try:
from modshogun import DualLibQPBMSOSVM
except ImportError:
print("DualLibQPBMSOSVM not available")
exit(0)
# create model
model = FactorGraphModel(tr_samples, tr_labels, TREE_MAX_PROD, False)
w_truth = [w[0].copy(), w[1].copy(), w[2].copy()]
w[0] = np.zeros(8)
w[1] = np.zeros(4)
w[2] = np.zeros(2)
ftype[0].set_w(w[0])
ftype[1].set_w(w[1])
ftype[2].set_w(w[2])
model.add_factor_type(ftype[0])
model.add_factor_type(ftype[1])
model.add_factor_type(ftype[2])
# --- training with BMRM ---
bmrm = DualLibQPBMSOSVM(model, tr_labels, 0.01)
#bmrm.set_verbose(True)
bmrm.train()
#print 'learned weights:'
#print bmrm.get_w()
#print 'ground truth weights:'
#print w_truth
# evaluation
lbs_bmrm = bmrm.apply()
acc_loss = 0.0
ave_loss = 0.0
for i in range(num_samples):
y_pred = lbs_bmrm.get_label(i)
y_truth = tr_labels.get_label(i)
acc_loss = acc_loss + model.delta_loss(y_truth, y_pred)
ave_loss = acc_loss / num_samples
#print('BMRM: Average training error is %.4f' % ave_loss)
# show primal objs and dual objs
#hbm = bmrm.get_helper()
#print hbm.get_primal_values()
#print hbm.get_eff_passes()
#print hbm.get_train_errors()
# --- training with SGD ---
sgd = StochasticSOSVM(model, tr_labels)
#sgd.set_verbose(True)
sgd.set_lambda(0.01)
sgd.train()
# evaluation
#print('SGD: Average training error is %.4f' % SOSVMHelper.average_loss(sgd.get_w(), model))
#hp = sgd.get_helper()
#print hp.get_primal_values()
#print hp.get_eff_passes()
#print hp.get_train_errors()
# --- training with FW ---
fw = FWSOSVM(model, tr_labels)
#fw.set_verbose(True)
fw.set_lambda(0.01)
fw.set_gap_threshold(0.01)
fw.train()
# evaluation
#print('FW: Average training error is %.4f' % SOSVMHelper.average_loss(fw.get_w(), model))
#hp = fw.get_helper()
#print hp.get_primal_values()
#print hp.get_dual_values()
#print hp.get_eff_passes()
#print hp.get_train_errors()
if __name__ == '__main__':
print("Factor Graph Model")
structure_factor_graph_model(*parameter_list[0])
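# MAPInference with TREE_MAX_PROD finds the minimum-energy assignment of the
# chain graphs above. For graphs this small, brute-force enumeration yields
# the same answer; a short sketch (illustrative, with hypothetical energy
# callables standing in for shogun's table factors):
import itertools

def map_brute_force(cards, factors):
    # cards: cardinality per variable
    # factors: list of (variable_indices, energy_fn) pairs
    best_state, best_energy = None, float('inf')
    for state in itertools.product(*[range(c) for c in cards]):
        energy = sum(fn(tuple(state[i] for i in inds)) for inds, fn in factors)
        if energy < best_energy:
            best_state, best_energy = state, energy
    return best_state, best_energy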
#!/usr/bin/env python
import numpy as np
import itertools
from modshogun import Factor, TableFactorType, FactorGraph
from modshogun import FactorGraphObservation, FactorGraphLabels, FactorGraphFeatures
from modshogun import FactorGraphModel, GRAPH_CUT
from modshogun import GraphCut
from modshogun import StochasticSOSVM
def generate_data(num_train_samples, len_label, len_feat):
""" Generate synthetic dataset
Generate random data following [1]:
Each example has exactly one label on.
Each label has 40 related binary features.
For an example, if label i is on, 4i randomly chosen features are set to 1
[1] Finley, Thomas, and Thorsten Joachims.
"Training structural SVMs when exact inference is intractable."
Proceedings of the 25th international conference on Machine learning. ACM, 2008.
Args:
num_train_samples: number of samples
len_label: label length (10)
len_feat: feature length (40)
Returns:
feats: generated feature matrix
labels: generated label matrix
"""
labels = np.zeros((num_train_samples, len_label), np.int32)
feats = np.zeros((num_train_samples, len_feat), np.int32)
for k in range(num_train_samples):
i = k % len_label
labels[k, i] = 1
inds_one = np.random.permutation(range(len_feat))
inds_one = inds_one[:4*(i+1)]
for j in inds_one:
feats[k, j] = 1
return (labels, feats)
def define_factor_types(num_vars, len_feat, edge_table):
""" Define factor types
Args:
num_vars: number of variables in factor graph
len_feat: length of the feature vector
edge_table: edge table defining pair-wise node indices
Returns:
v_factor_types: list of all unary and pair-wise factor types
"""
n_stats = 2 # for binary states
v_factor_types = {}
n_edges = edge_table.shape[0]
# unary factors
cards_u = np.array([n_stats], np.int32)
w_u = np.zeros(n_stats*len_feat)
for i in range(num_vars):
v_factor_types[i] = TableFactorType(i, cards_u, w_u)
# pair-wise factors
cards_pw = np.array([n_stats, n_stats], np.int32)
w_pw = np.zeros(n_stats*n_stats)
for j in range(n_edges):
v_factor_types[j + num_vars] = TableFactorType(j + num_vars, cards_pw, w_pw)
return v_factor_types
def build_factor_graph_model(labels, feats, factor_types, edge_table, infer_alg = GRAPH_CUT):
""" Build factor graph model
Args:
labels: matrix of labels [num_train_samples*len_label]
feats: matrix of features [num_train_samples*len_feat]
factor_types: vector of all factor types
edge_table: matrix of pairwise edges, each row is a pair of node indices
infer_alg: inference algorithm (GRAPH_CUT)
Returns:
labels_fg: matrix of labels in factor graph format
feats_fg: matrix of features in factor graph format
"""
labels = labels.astype(np.int32)
num_train_samples = labels.shape[0]
num_vars = labels.shape[1]
num_edges = edge_table.shape[0]
n_stats = 2
feats_fg = FactorGraphFeatures(num_train_samples)
labels_fg = FactorGraphLabels(num_train_samples)
for i in range(num_train_samples):
cardinalities = np.array([n_stats]*num_vars, np.int32)
fg = FactorGraph(cardinalities)
# add unary factors
for u in range(num_vars):
data_u = np.array(feats[i,:], np.float64)
inds_u = np.array([u], np.int32)
factor_u = Factor(factor_types[u], inds_u, data_u)
fg.add_factor(factor_u)
# add pairwise factors
for v in range(num_edges):
data_p = np.array([1.0])
inds_p = np.array(edge_table[v, :], np.int32)
factor_p = Factor(factor_types[v + num_vars], inds_p, data_p)
fg.add_factor(factor_p)
# add factor graph
feats_fg.add_sample(fg)
# add corresponding label
loss_weights = np.array([1.0/num_vars]*num_vars)
fg_obs = FactorGraphObservation(labels[i,:], loss_weights)
labels_fg.add_label(fg_obs)
return (labels_fg, feats_fg)
def evaluation(labels_pr, labels_gt, model):
""" Evaluation
Args:
labels_pr: predicted label
labels_gt: ground truth label
model: factor graph model
Returns:
ave_loss: average loss
"""
num_train_samples = labels_pr.get_num_labels()
acc_loss = 0.0
ave_loss = 0.0
for i in range(num_train_samples):
y_pred = labels_pr.get_label(i)
y_truth = labels_gt.get_label(i)
acc_loss = acc_loss + model.delta_loss(y_truth, y_pred)
ave_loss = acc_loss / num_train_samples
return ave_loss
def graphcuts_sosvm(num_train_samples = 10, len_label = 5, len_feat = 20, num_test_samples = 5):
""" Graph cuts as approximate inference in structured output SVM framework.
Args:
num_train_samples: number of training samples
len_label: number of classes, i.e., size of label space
len_feat: the dimension of the feature vector
num_test_samples: number of testing samples
"""
import time
# generate synthetic dataset
(labels_train, feats_train) = generate_data(num_train_samples, len_label, len_feat)
# compute fully-connected edge table
full = np.vstack([x for x in itertools.combinations(range(len_label), 2)])
# define factor types
factor_types = define_factor_types(len_label, len_feat, full)
# create features and labels for factor graph model
(labels_fg, feats_fg) = build_factor_graph_model(labels_train, feats_train, factor_types, full, GRAPH_CUT)
# create model and register factor types
model = FactorGraphModel(feats_fg, labels_fg, GRAPH_CUT)
for i in range(len(factor_types)):
model.add_factor_type(factor_types[i])
# Training
# the 3rd parameter is do_weighted_averaging; turning this on
# may achieve a faster convergence rate.
# the 4th parameter controls the output of verbose training information
sgd = StochasticSOSVM(model, labels_fg, True, True)
sgd.set_num_iter(150)
sgd.set_lambda(0.0001)
# train
t0 = time.time()
sgd.train()
t1 = time.time()
w_sgd = sgd.get_w()
#print "SGD took", t1 - t0, "seconds."
# training error
labels_pr = sgd.apply()
ave_loss = evaluation(labels_pr, labels_fg, model)
#print('SGD: Average training error is %.4f' % ave_loss)
# testing error
# generate synthetic testing dataset
(labels_test, feats_test) = generate_data(num_test_samples, len_label, len_feat)
# create features and labels for factor graph model
(labels_fg_test, feats_fg_test) = build_factor_graph_model(labels_test, feats_test, factor_types, full, GRAPH_CUT)
# set features and labels to sgd
sgd.set_features(feats_fg_test)
sgd.set_labels(labels_fg_test)
# test
labels_pr = sgd.apply()
ave_loss = evaluation(labels_pr, labels_fg_test, model)
#print('SGD: Average testing error is %.4f' % ave_loss)
def graphcuts_general():
""" Graph cuts for general s-t graph optimization.
"""
num_nodes = 5
num_edges = 6
g = GraphCut(num_nodes, num_edges)
# add terminal-connected edges
# i.e., SOURCE->node_i and node_i->SINK
g.add_tweights(0, 4, 0)
g.add_tweights(1, 2, 0)
g.add_tweights(2, 8, 0)
g.add_tweights(2, 0, 4)
g.add_tweights(3, 0, 7)
g.add_tweights(4, 0, 5)
# add node to node edges
g.add_edge(0, 2, 5, 0)
g.add_edge(0, 3, 2, 0)
g.add_edge(1, 2, 6, 0)
g.add_edge(1, 4, 9, 0)
g.add_edge(2, 3, 1, 0)
g.add_edge(2, 4, 3, 0)
# initialize max-flow algorithm
g.init_maxflow()
# compute max flow
flow = g.compute_maxflow()
#print("Flow = %f" % flow)
# print assignment
#for i in xrange(num_nodes):
# print("\nNode %d = %d" % (i, g.get_assignment(i)))
test_general = True
test_sosvm = True
parameter_list = [[test_general, test_sosvm]]
def structure_graphcuts(test_general=True, test_sosvm=True):
""" Test graph cuts.
Args:
test_general: test graph cuts for general s-t graph optimization
test_sosvm: test graph cuts for structured output svm
"""
if test_general:
graphcuts_general()
if test_sosvm:
graphcuts_sosvm()
if __name__ == '__main__':
print("Graph cuts")
structure_graphcuts(*parameter_list[0])
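# GraphCut above solves an s-t max-flow/min-cut problem. A compact
# Edmonds-Karp (BFS augmenting paths) sketch on the same small graph, with
# explicit source/sink nodes appended as indices 5 and 6 (an illustrative
# addition; capacities mirror the add_tweights/add_edge calls above):
import numpy as np
from collections import deque

def max_flow(cap, s, t):
    cap = cap.astype(float).copy()
    n, flow = cap.shape[0], 0.0
    while True:
        parent = [-1]*n
        parent[s] = s
        q = deque([s])
        while q and parent[t] < 0:
            u = q.popleft()
            for v in range(n):
                if parent[v] < 0 and cap[u, v] > 0:
                    parent[v] = u
                    q.append(v)
        if parent[t] < 0:
            return flow
        v, bottleneck = t, np.inf
        while v != s:  # find the bottleneck along the augmenting path
            bottleneck = min(bottleneck, cap[parent[v], v])
            v = parent[v]
        v = t
        while v != s:  # push flow, updating residual capacities
            cap[parent[v], v] -= bottleneck
            cap[v, parent[v]] += bottleneck
            v = parent[v]
        flow += bottleneck

cap = np.zeros((7, 7))
s, t = 5, 6
cap[s, 0], cap[s, 1], cap[s, 2] = 4, 2, 8   # source-side terminal weights
cap[2, t], cap[3, t], cap[4, t] = 4, 7, 5   # sink-side terminal weights
cap[0, 2], cap[0, 3], cap[1, 2] = 5, 2, 6   # node-to-node edges
cap[1, 4], cap[2, 3], cap[2, 4] = 9, 1, 3
#print("Flow = %f" % max_flow(cap, s, t))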
#!/usr/bin/env python
"""
This example shows how to use HierarchicalMultilabelModel for hierarchical
multi-label classification. The data used:
[1] Image CLEF 2007 competition for annotation of X-Ray images.
http://kt.ijs.si/DragiKocev/PhD/resources/doku.php?id=hmc_classification#imageclef07d
"""
from modshogun import MultilabelSOLabels, HierarchicalMultilabelModel
from modshogun import RealFeatures
from modshogun import StochasticSOSVM
from modshogun import StructuredAccuracy, LabelsFactory
import numpy as np
import time
train_file_name = '../../../data/multilabel/image_clef_train.arff'
test_file_name = '../../../data/multilabel/image_clef_test.arff'
parameter_list = [[train_file_name, test_file_name]]
def get_taxonomy(labels):
"""
Converting the labels to shogun compatible format
(i.e. 0, 1, ... num_classes - 1) and getting taxonomy of the labels
"""
labels = labels.split(',')
num_labels = len(labels)
# taking the root label into consideration
num_labels += 1
shogun_labels = dict()
taxonomy = np.zeros(num_labels, dtype=np.int32)
# considering the root_label node index to be 0
taxonomy[0] = -1
for i, label in enumerate(labels):
shogun_labels[label] = i + 1
try:
parent_label = label[:-2]
parent_idx = labels.index(parent_label) + 1
taxonomy[i + 1] = parent_idx
except ValueError:
taxonomy[i + 1] = 0
return shogun_labels, taxonomy
def get_data_sample(data_sample, shogun_labels):
"""
Extracting features and labels from a single row of data
"""
data = data_sample.split(',')
features = np.array(data[:-1], dtype=np.float64)
labs = data[-1].split('@')
# adding the root label
labels = np.zeros(len(labs) + 1, dtype=np.int32)
labels[0] = 0
for i, label in enumerate(labs):
labels[i + 1] = shogun_labels[label]
labels.sort()
return features, labels
def get_data(data, shogun_labels):
"""
Creating features and labels from the data samples
"""
num_samples = len(data)
# considering the root label
num_classes = len(shogun_labels) + 1
labels = MultilabelSOLabels(num_samples, num_classes)
for i, data_sample in enumerate(data):
feats, labs = get_data_sample(data_sample, shogun_labels)
try:
features = np.c_[features, feats]
except NameError:
features = feats
labels.set_sparse_label(i, labs)
return RealFeatures(features), labels
def get_features_labels(input_file):
"""
Creating features and labels from the input file (train/test file)
"""
train_file_lines = list(map(lambda x: x.strip(), input_file.readlines()))
all_labels = list(filter(lambda x: 'hierarchical' in x.strip(),
train_file_lines))[0].split()[-1]
shogun_labels, taxonomy = get_taxonomy(all_labels)
data_index = train_file_lines.index('@DATA')
features, labels = get_data(train_file_lines[data_index + 1:],
shogun_labels)
return features, labels, taxonomy
def structure_hierarchical_multilabel_classification(train_file_name,
test_file_name):
train_file = open(train_file_name)
test_file = open(test_file_name)
train_features, train_labels, train_taxonomy = get_features_labels(
train_file)
model = HierarchicalMultilabelModel(train_features, train_labels,
train_taxonomy)
sgd = StochasticSOSVM(model, train_labels)
t1 = time.time()
sgd.train()
print('>>> Took %f time for training' % (time.time() - t1))
test_features, test_labels, test_taxonomy = get_features_labels(test_file)
assert (test_taxonomy == train_taxonomy).all()
evaluator = StructuredAccuracy()
outlabel = LabelsFactory.to_structured(sgd.apply(test_features))
print('>>> Accuracy of classification = %f' % evaluator.evaluate(
outlabel, test_labels))
if __name__ == '__main__':
print('Hierarchical Multilabel Classification')
structure_hierarchical_multilabel_classification(*parameter_list[0])
#!/usr/bin/env python
import numpy as np
def gen_data(num_classes,num_samples,dim):
np.random.seed(0)
covs = np.array([[[0., -1. ], [2.5, .7]],
[[3., -1.5], [1.2, .3]],
[[ 2, 0 ], [ .0, 1.5 ]]])
X = np.r_[np.dot(np.random.randn(num_samples, dim), covs[0]) + np.array([0, 10]),
np.dot(np.random.randn(num_samples, dim), covs[1]) + np.array([-10, -10]),
np.dot(np.random.randn(num_samples, dim), covs[2]) + np.array([10, -10])];
Y = np.hstack((np.zeros(num_samples), np.ones(num_samples), 2*np.ones(num_samples)))
return X, Y
# Number of classes
M = 3
# Number of samples of each class
N = 50
# Dimension of the data
dim = 2
traindat, label_traindat = gen_data(M,N,dim)
parameter_list = [[traindat,label_traindat]]
def structure_multiclass_bmrm(fm_train_real=traindat,label_train_multiclass=label_traindat):
from modshogun import MulticlassSOLabels, LabelsFactory
from modshogun import RealFeatures
from modshogun import SOSVMHelper
try:
from modshogun import BMRM, PPBMRM, P3BMRM, DualLibQPBMSOSVM
except ImportError:
print("At least one of BMRM, PPBMRM, P3BMRM, DualLibQPBMSOSVM not available")
exit(0)
from modshogun import MulticlassModel, RealNumber
labels = MulticlassSOLabels(label_train_multiclass)
features = RealFeatures(fm_train_real.T)
model = MulticlassModel(features, labels)
sosvm = DualLibQPBMSOSVM(model, labels, 1.0)
# BMRM
sosvm.set_solver(BMRM)
sosvm.set_verbose(True)
sosvm.train()
bmrm_out = LabelsFactory.to_multiclass_structured(sosvm.apply())
count = 0
for i in range(bmrm_out.get_num_labels()):
yi_pred = RealNumber.obtain_from_generic(bmrm_out.get_label(i))
if yi_pred.value == label_train_multiclass[i]:
count = count + 1
#print("BMRM: Correct classification rate: %0.2f" % ( 100.0*count/bmrm_out.get_num_labels() ))
#hp = sosvm.get_helper()
#print hp.get_primal_values()
#print hp.get_train_errors()
# PPBMRM
w = np.zeros(model.get_dim())
sosvm.set_w(w)
sosvm.set_solver(PPBMRM)
sosvm.set_verbose(True)
sosvm.train()
ppbmrm_out = LabelsFactory.to_multiclass_structured(sosvm.apply())
count = 0
for i in range(ppbmrm_out.get_num_labels()):
yi_pred = RealNumber.obtain_from_generic(ppbmrm_out.get_label(i))
if yi_pred.value == label_train_multiclass[i]:
count = count + 1
#print("PPBMRM: Correct classification rate: %0.2f" % ( 100.0*count/ppbmrm_out.get_num_labels() ))
# P3BMRM
w = np.zeros(model.get_dim())
sosvm.set_w(w)
sosvm.set_solver(P3BMRM)
sosvm.set_verbose(True)
sosvm.train()
p3bmrm_out = LabelsFactory.to_multiclass_structured(sosvm.apply())
count = 0
for i in range(p3bmrm_out.get_num_labels()):
yi_pred = RealNumber.obtain_from_generic(p3bmrm_out.get_label(i))
if yi_pred.value == label_train_multiclass[i]:
count = count + 1
#print("P3BMRM: Correct classification rate: %0.2f" % ( 100.0*count/p3bmrm_out.get_num_labels() ))
return bmrm_out, ppbmrm_out, p3bmrm_out
if __name__=='__main__':
print('SO multiclass model with bundle methods')
a,b,c=structure_multiclass_bmrm(*parameter_list[0])
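# The bundle-method solvers above repeatedly call a max oracle: for each
# example, find the most violating class under the current w. A numpy sketch
# for a multiclass model whose joint feature puts x into the block of class y
# (an illustrative assumption of the same construction):
import numpy as np

def multiclass_max_oracle(W, x, y_true):
    # W: num_classes x dim weight matrix (w reshaped), x: feature vector
    scores = W.dot(x)
    loss = np.ones(len(scores))
    loss[y_true] = 0.0                       # 0/1 label loss
    y_star = int(np.argmax(scores + loss))   # loss-augmented argmax
    violation = scores[y_star] + loss[y_star] - scores[y_true]
    return y_star, violation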
#!/usr/bin/env python
parameter_list=[[50, 125, 10, 2]]
def structure_plif_hmsvm_bmrm (num_examples, example_length, num_features, num_noise_features):
from modshogun import RealMatrixFeatures, TwoStateModel, StructuredAccuracy
try:
from modshogun import DualLibQPBMSOSVM
except ImportError:
print("DualLibQPBMSOSVM not available")
exit(0)
model = TwoStateModel.simulate_data(num_examples, example_length, num_features, num_noise_features)
sosvm = DualLibQPBMSOSVM(model, model.get_labels(), 5000.0)
sosvm.set_store_train_info(False)
sosvm.train()
#print sosvm.get_w()
predicted = sosvm.apply(model.get_features())
evaluator = StructuredAccuracy()
acc = evaluator.evaluate(predicted, model.get_labels())
#print('Accuracy = %.4f' % acc)
if __name__ == '__main__':
print("PLiF HMSVM BMRM")
structure_plif_hmsvm_bmrm(*parameter_list[0])
#!/usr/bin/env python
parameter_list=[[100, 250, 10, 2]]
def structure_plif_hmsvm_mosek (num_examples, example_length, num_features, num_noise_features):
from modshogun import RealMatrixFeatures, TwoStateModel, StructuredAccuracy
try:
from modshogun import PrimalMosekSOSVM
except ImportError:
print("Mosek not available")
return
model = TwoStateModel.simulate_data(num_examples, example_length, num_features, num_noise_features)
sosvm = PrimalMosekSOSVM(model, model.get_labels())
sosvm.train()
#print(sosvm.get_w())
predicted = sosvm.apply(model.get_features())
evaluator = StructuredAccuracy()
acc = evaluator.evaluate(predicted, model.get_labels())
#print('Accuracy = %.4f' % acc)
if __name__ == '__main__':
print("PLiF HMSVM Mosek")
structure_plif_hmsvm_mosek(*parameter_list[0])
#!/usr/bin/env python
parameter_list=[[10,7,0,False]]
def tests_check_commwordkernel_memleak_modular (num, order, gap, reverse):
import gc
from modshogun import Alphabet,StringCharFeatures,StringWordFeatures,DNA
from modshogun import SortWordString, MSG_DEBUG
from modshogun import CommWordStringKernel, IdentityKernelNormalizer
from numpy import mat
POS=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT']
NEG=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT']
for i in range(10):
alpha=Alphabet(DNA)
traindat=StringCharFeatures(alpha)
traindat.set_features(POS+NEG)
trainudat=StringWordFeatures(traindat.get_alphabet());
trainudat.obtain_from_char(traindat, order-1, order, gap, reverse)
#trainudat.io.set_loglevel(MSG_DEBUG)
pre = SortWordString()
#pre.io.set_loglevel(MSG_DEBUG)
pre.init(trainudat)
trainudat.add_preprocessor(pre)
trainudat.apply_preprocessor()
spec = CommWordStringKernel(10, False)
spec.set_normalizer(IdentityKernelNormalizer())
spec.init(trainudat, trainudat)
K=spec.get_kernel_matrix()
del POS
del NEG
del order
del gap
del reverse
return K
if __name__=='__main__':
print('Leak Check Comm Word Kernel')
tests_check_commwordkernel_memleak_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array,hstack,sin,cos
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def transfer_multitask_clustered_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG
try:
from modshogun import MultitaskClusteredLogisticRegression
except ImportError:
print("MultitaskClusteredLogisticRegression not available")
exit()
features = RealFeatures(hstack((fm_train,sin(fm_train),cos(fm_train))))
labels = BinaryLabels(hstack((label_train,label_train,label_train)))
n_vectors = features.get_num_vectors()
task_one = Task(0,n_vectors//3)
task_two = Task(n_vectors//3,2*n_vectors//3)
task_three = Task(2*n_vectors//3,n_vectors)
task_group = TaskGroup()
task_group.append_task(task_one)
task_group.append_task(task_two)
task_group.append_task(task_three)
mtlr = MultitaskClusteredLogisticRegression(1.0,100.0,features,labels,task_group,2)
#mtlr.io.set_loglevel(MSG_DEBUG)
mtlr.set_tolerance(1e-3) # use 1e-3 tolerance
mtlr.set_max_iter(100)
mtlr.train()
mtlr.set_current_task(0)
#print mtlr.get_w()
out = mtlr.apply_regression().get_labels()
return out
if __name__=='__main__':
print('TransferMultitaskClusteredLogisticRegression')
transfer_multitask_clustered_logistic_regression(*parameter_list[0])
#!/usr/bin/env python
from numpy import array,hstack
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def transfer_multitask_l12_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup
try:
from modshogun import MultitaskL12LogisticRegression
except ImportError:
print("MultitaskL12LogisticRegression not available")
exit(0)
features = RealFeatures(hstack((fm_train,fm_train)))
labels = BinaryLabels(hstack((label_train,label_train)))
n_vectors = features.get_num_vectors()
task_one = Task(0,n_vectors//2)
task_two = Task(n_vectors//2,n_vectors)
task_group = TaskGroup()
task_group.append_task(task_one)
task_group.append_task(task_two)
mtlr = MultitaskL12LogisticRegression(0.1,0.1,features,labels,task_group)
mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
mtlr.set_max_iter(10)
mtlr.train()
mtlr.set_current_task(0)
out = mtlr.apply_regression().get_labels()
return out
if __name__=='__main__':
print('TransferMultitaskL12LogisticRegression')
transfer_multitask_l12_logistic_regression(*parameter_list[0])
#!/usr/bin/env python
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def transfer_multitask_leastsquares_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup
try:
from modshogun import MultitaskLeastSquaresRegression
except ImportError:
print("MultitaskLeastSquaresRegression not available")
exit(0)
features = RealFeatures(fm_train)
labels = RegressionLabels(label_train)
n_vectors = features.get_num_vectors()
task_one = Task(0,n_vectors//2)
task_two = Task(n_vectors//2,n_vectors)
task_group = TaskGroup()
task_group.append_task(task_one)
task_group.append_task(task_two)
mtlsr = MultitaskLeastSquaresRegression(0.1,features,labels,task_group)
mtlsr.set_regularization(1) # use regularization ratio
mtlsr.set_tolerance(1e-2) # use 1e-2 tolerance
mtlsr.train()
mtlsr.set_current_task(0)
out = mtlsr.apply_regression().get_labels()
return out
if __name__=='__main__':
print('TransferMultitaskLeastSquaresRegression')
transfer_multitask_leastsquares_regression(*parameter_list[0])
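# The example above never uses fm_test. A minimal sketch (illustrative only)
# of applying a trained multitask machine to held-out data of the same
# dimensionality, passing the features to apply_regression directly, in the
# same way the variational example below passes them to apply_binary:
from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup
from modshogun import MultitaskLeastSquaresRegression
features = RealFeatures(traindat)
labels = RegressionLabels(label_traindat)
n_vectors = features.get_num_vectors()
task_group = TaskGroup()
task_group.append_task(Task(0, n_vectors//2))
task_group.append_task(Task(n_vectors//2, n_vectors))
mtlsr = MultitaskLeastSquaresRegression(0.1, features, labels, task_group)
mtlsr.set_regularization(1)
mtlsr.train()
mtlsr.set_current_task(0)
print(mtlsr.apply_regression(RealFeatures(testdat)).get_labels()[:5])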
#!/usr/bin/env python
from numpy import array,hstack
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def transfer_multitask_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup
    try:
        from modshogun import MultitaskLogisticRegression
    except ImportError:
        print("MultitaskLogisticRegression not available")
        exit(0)

    # duplicate the data to obtain two identical tasks
    features = RealFeatures(hstack((fm_train,fm_train)))
    labels = BinaryLabels(hstack((label_train,label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0,n_vectors//2)
    task_two = Task(n_vectors//2,n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskLogisticRegression(0.1,features,labels,task_group)
    mtlr.set_regularization(1) # use regularization ratio
    mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply().get_labels()

    return out

if __name__=='__main__':
    print('TransferMultitaskLogisticRegression')
    transfer_multitask_logistic_regression(*parameter_list[0])
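# Unlike its neighbours, this example calls apply() rather than
# apply_regression(), so the returned labels are already thresholded to +/-1.
# A quick training-error check (illustrative only, not part of the original
# example):
from numpy import hstack, mean
out = transfer_multitask_logistic_regression(*parameter_list[0])
truth = hstack((label_traindat, label_traindat))
print('training error: %.4f' % mean(out != truth))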
#!/usr/bin/env python
from numpy import array,hstack
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def transfer_multitask_trace_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup
    try:
        from modshogun import MultitaskTraceLogisticRegression
    except ImportError:
        print("MultitaskTraceLogisticRegression not available")
        exit(0)

    # duplicate the data to obtain two identical tasks
    features = RealFeatures(hstack((fm_train,fm_train)))
    labels = BinaryLabels(hstack((label_train,label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0,n_vectors//2)
    task_two = Task(n_vectors//2,n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskTraceLogisticRegression(0.1,features,labels,task_group)
    mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
    mtlr.set_max_iter(10)
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply_regression().get_labels()

    return out

if __name__=='__main__':
    print('TransferMultitaskTraceLogisticRegression')
    transfer_multitask_trace_logistic_regression(*parameter_list[0])
#!/usr/bin/env python
#
# Copyright (c) The Shogun Machine Learning Toolbox
# Written (w) 2014 Wu Lin
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are those
# of the authors and should not be interpreted as representing official policies,
# either expressed or implied, of the Shogun Development Team.
#
#
path='../data'
traindat = '%s/fm_train_real.dat'%path
testdat = '%s/fm_test_real.dat'%path
label_binary_traindat = '%s/label_train_twoclass.dat'%path
try:
    from modshogun import GaussianProcessClassification
except ImportError:
    print("GaussianProcessClassification is not available")
    exit(0)

from modshogun import *

parameter_list=[
    [KLCholeskyInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0],
    [KLCovarianceInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0],
    [KLDiagonalInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0],
    [KLDualInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0],
    [SingleLaplaceInferenceMethod,traindat,testdat,label_binary_traindat,0,0],
]
def variational_classifier_modular(kl_inference,train_fname=traindat,test_fname=testdat,
        label_fname=label_binary_traindat,kernel_log_sigma=0,kernel_log_scale=0,noise_factor=1e-5,
        min_coeff_kernel=1e-2,max_attempt=0):
    from math import exp
    features_train=RealFeatures(CSVFile(train_fname))
    labels_train=BinaryLabels(CSVFile(label_fname))

    likelihood=LogitDVGLikelihood()
    error_eval=ErrorRateMeasure()
    mean_func=ConstMean()
    kernel_sigma=2*exp(2*kernel_log_sigma)
    kernel_func=GaussianKernel(10, kernel_sigma)

    inf=kl_inference(kernel_func, features_train, mean_func, labels_train, likelihood)
    try:
        # these settings exist only on the KL inference methods; the
        # SingleLaplaceInferenceMethod entry in parameter_list skips them
        inf.set_noise_factor(noise_factor)
        inf.set_min_coeff_kernel(min_coeff_kernel)
        inf.set_max_attempt(max_attempt)
    except:
        pass
    inf.set_scale(exp(kernel_log_scale))

    gp=GaussianProcessClassification(inf)
    gp.train()
    pred_labels_train=gp.apply_binary(features_train)
    error_train=error_eval.evaluate(pred_labels_train, labels_train)
    #print("\nInference name:%s"%inf.get_name())
    #print("marginal likelihood:%.10f"%inf.get_negative_log_marginal_likelihood())
    #print("Training error %.4f"%error_train)

    return pred_labels_train, gp, pred_labels_train.get_labels()

if __name__=="__main__":
    print("variational_classifier")
    for parameter in parameter_list:
        variational_classifier_modular(*parameter)
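# The parameter list above exercises four KL-based variational inference
# methods plus the (non-variational) SingleLaplaceInferenceMethod. A sketch of
# running a single method and printing the quantity the commented-out lines
# refer to; this assumes get_inference_method() is exposed on the GP machine,
# which may differ across Shogun versions:
pred, gp, raw = variational_classifier_modular(*parameter_list[0])
inf = gp.get_inference_method()
print('%s negative log marginal likelihood: %.10f'
      % (inf.get_name(), inf.get_negative_log_marginal_likelihood()))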