This page lists ready-to-run shogun examples for the Python Modular interface.
To run an example, issue

python name_of_example.py
# This example shows how to use a custom-defined kernel function for training a
# two-class Support Vector Machine (SVM) classifier on randomly generated
# examples. The SVM regularization constant is set to C=1.
parameter_list = [[1,7],[2,8]]
def classifier_custom_kernel_modular(C=1,dim=7):
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import CustomKernel
from shogun.Classifier import LibSVM
from numpy import diag,ones,sign
from numpy.random import rand,seed
seed((C,dim))
lab=sign(2*rand(dim) - 1)
data=rand(dim, dim)
symdata=data*data.T + diag(ones(dim))
kernel=CustomKernel()
# use the symmetric matrix built above as the precomputed kernel
kernel.set_full_kernel_matrix_from_full(symdata)
labels=Labels(lab)
svm=LibSVM(C, kernel, labels)
svm.train()
predictions=svm.classify()
out=predictions.get_labels()
return svm,out
if __name__=='__main__':
print 'custom_kernel'
classifier_custom_kernel_modular(*parameter_list[0])
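# As a quick sanity check, the matrix stored in a CustomKernel can be read back
# with get_kernel_matrix() and compared against the matrix that was set. This is
# a minimal sketch; note the loose tolerance, since CustomKernel may store the
# matrix in single precision:
from numpy import diag, ones, allclose
from numpy.random import rand
from shogun.Kernel import CustomKernel

data = rand(7, 7)
symdata = data*data.T + diag(ones(7))   # symmetric by construction
kernel = CustomKernel()
kernel.set_full_kernel_matrix_from_full(symdata)
print allclose(kernel.get_kernel_matrix(), symdata, atol=1e-4)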
# In this example we demonstrate how to use SVMs in a domain adaptation
# scenario. Here, we assume that we have two problem domains, one with
# an abundance of training data (source domain) and one with only a few
# training examples (target domain). These domains are assumed to be
# different but related enough to transfer information between them.
# Thus, we first train an SVM on the source domain and subsequently pass this
# previously trained SVM object to the DASVM, which we then train on the
# target domain. The DASVM internally computes a custom linear term
# (for the underlying quadratic program of the dual formulation of the SVM)
# based on the support vectors of the source SVM and the training examples
# of the target SVM. Finally, it can be used for prediction just as any other
# SVM object.
#
import numpy
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
from shogun.Classifier import SVMLight, DomainAdaptationSVM, MSG_DEBUG
traindna = ['CGCACGTACGTAGCTCGAT',
'CGACGTAGTCGTAGTCGTA',
'CGACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACCACAGTTATATAGTA',
'CGACGTAGTCGTAGTCGTA',
'CGACGTAGTTTTTTTCGTA',
'CGACGTAGTCGTAGCCCCA',
'CAAAAAAAAAAAAAAAATA',
'CGACGGGGGGGGGGGCGTA']
label_traindna = numpy.array(5*[-1.0] + 5*[1.0])
testdna = ['AGCACGTACGTAGCTCGAT',
'AGACGTAGTCGTAGTCGTA',
'CAACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGAACACAGTTATATAGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACGTGGGGTTTTTCGTA',
'CGACGTAGTCCCAGCCCCA',
'CAAAAAAAAAAAACCAATA',
'CGACGGCCGGGGGGGCGTA']
label_testdna = numpy.array(5*[-1.0] + 5*[1.0])
traindna2 = ['AGACAGTCAGTCGATAGCT',
'AGCAGTCGTAGTCGTAGTC',
'AGCAGGGGGGGGGGTAGTC',
'AGCAATCGTAGTCGTAGTC',
'AGCAACACGTTCTCTCGTC',
'AGCAGTCGTAGTCGTAGTC',
'AGCAGTCGTTTTTTTAGTC',
'AGCAGTCGTAGTCGAAAAC',
'ACCCCCCCCCCCCCCCCTC',
'AGCAGGGGGGGGGGGAGTC']
label_traindna2 = numpy.array(5*[-1.0] + 5*[1.0])
testdna2 = ['CGACAGTCAGTCGATAGCT',
'CGCAGTCGTAGTCGTAGTC',
'ACCAGGGGGGGGGGTAGTC',
'AGCAATCGTAGTCGTAGTC',
'AGCCACACGTTCTCTCGTC',
'AGCAATCGTAGTCGTAGTC',
'AGCAGTGGGGTTTTTAGTC',
'AGCAGTCGTAAACGAAAAC',
'ACCCCCCCCCCCCAACCTC',
'AGCAGGAAGGGGGGGAGTC']
label_testdna2 = numpy.array(5*[-1.0] + 5*[1.0])
parameter_list = [[traindna,testdna,label_traindna,label_testdna,traindna2,label_traindna2, \
testdna2,label_testdna2,1,3],[traindna,testdna,label_traindna,label_testdna,traindna2,label_traindna2, \
testdna2,label_testdna2,2,5]]
def classifier_domainadaptationsvm_modular(fm_train_dna=traindna,fm_test_dna=testdna, \
label_train_dna=label_traindna, \
label_test_dna=label_testdna,fm_train_dna2=traindna2,fm_test_dna2=testdna2, \
label_train_dna2=label_traindna2,label_test_dna2=label_testdna2,C=1,degree=3):
feats_train = StringCharFeatures(fm_train_dna, DNA)
feats_test = StringCharFeatures(fm_test_dna, DNA)
kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels = Labels(label_train_dna)
svm = SVMLight(C, kernel, labels)
svm.train()
#svm.io.set_loglevel(MSG_DEBUG)
#####################################
#print "obtaining DA SVM from previously trained SVM"
feats_train2 = StringCharFeatures(fm_train_dna, DNA)
feats_test2 = StringCharFeatures(fm_test_dna, DNA)
kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
labels2 = Labels(label_train_dna)
# we regularize against the previously obtained solution
dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
dasvm.train()
out = dasvm.classify(feats_test2).get_labels()
return out #,dasvm TODO
if __name__=='__main__':
print 'SVMLight'
classifier_domainadaptationsvm_modular(*parameter_list[0])
# In this example a multi-class support vector machine is trained on a toy data
# set and the trained classifier is then used to predict labels of test
# examples. The training algorithm is based on the BSVM formulation (L2-soft margin
# and the bias added to the objective function), which is solved by the Improved
# Mitchell-Demyanov-Malozemov algorithm. The training algorithm uses the Gaussian
# kernel of width 2.1 and the regularization constant C=1. The solver stops if the
# relative duality gap falls below 1e-5.
#
# For more details on the used SVM solver see
# V. Franc: Optimization Algorithms for Kernel Methods. Research report.
# CTU-CMP-2005-22. CTU FEL Prague. 2005.
# ftp://cmp.felk.cvut.cz/pub/cmp/articles/franc/Franc-PhD.pdf .
#
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_multiclass.dat')
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]
def classifier_gmnpsvm_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import GMNPSVM
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=Labels(label_train_multiclass)
svm=GMNPSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train(feats_train)
#kernel.init(feats_train, feats_test)
out=svm.classify(feats_test).get_labels()
return out,kernel
if __name__=='__main__':
print 'GMNPSVM'
classifier_gmnpsvm_modular(*parameter_list[0])
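# For intuition, shogun's GaussianKernel is commonly documented as computing
# k(x,y) = exp(-||x-y||^2 / width), i.e. the 'width' argument acts as tau rather
# than the standard deviation. A minimal NumPy sketch of one kernel entry under
# that assumption:
import numpy

def gaussian_kernel_entry(x, y, width=2.1):
    # squared Euclidean distance scaled by the width parameter
    return numpy.exp(-numpy.sum((x - y)**2)/width)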
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is then used to predict labels of test
# examples. As training algorithm the Gradient Projection Decomposition Technique
# (GPDT) is used with SVM regularization parameter C=1 and a Gaussian
# kernel of width 2.1. The solver returns an epsilon-precise (epsilon=1e-5) solution.
#
# For more details on GPDT solver see http://dm.unife.it/gpdt .
#
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]
def classifier_gpbtsvm_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_twoclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import GPBTSVM
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=Labels(label_train_twoclass)
svm=GPBTSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
kernel.init(feats_train, feats_test)
predictions = svm.classify()
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print 'GPBTSVM'
classifier_gpbtsvm_modular(*parameter_list[0])
# This example shows usage of a k-nearest neighbor (KNN) classification rule on
# a toy data set. The number of the nearest neighbors is set to k=3 and the distances
# are measured by the Euclidean metric. Finally, the KNN rule is applied to predict
# labels of test examples.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_multiclass.dat')
parameter_list = [[traindat,testdat,label_traindat,3],[traindat,testdat,label_traindat,3]]
def classifier_knn_modular(fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat, k=3 ):
from shogun.Features import RealFeatures, Labels
from shogun.Classifier import KNN
from shogun.Distance import EuclidianDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclidianDistance(feats_train, feats_train)
labels=Labels(label_train_multiclass)
knn=KNN(k, distance, labels)
knn_train = knn.train()
output=knn.classify(feats_test).get_labels()
return knn,knn_train,output
if __name__=='__main__':
print 'KNN'
classifier_knn_modular(*parameter_list[0])
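# The KNN rule itself is simple to sketch directly in NumPy: compute the
# distances from a test point to all training points, pick the k smallest, and
# take a majority vote. This is only an illustration of the rule (ties broken
# arbitrarily), not shogun's implementation:
import numpy

def knn_predict(train, labels, x, k=3):
    # train: d x n matrix with one example per column, as in the example above
    dists = numpy.sqrt(((train - x[:, None])**2).sum(axis=0))
    nearest = numpy.argsort(dists)[:k]
    values, counts = numpy.unique(labels[nearest], return_counts=True)
    return values[numpy.argmax(counts)]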
# In this example a multi-class support vector machine classifier is trained on a
# toy data set and the trained classifier is then used to predict labels of test
# examples. As training algorithm the LaRank algorithm is used with the SVM
# regularization parameter C (0.9 in the first parameter set), a Gaussian kernel
# of width 2.1, and a precision set to epsilon=1e-5.
#
# For more details on LaRank see
# Bordes, A. and Bottou, L. and Gallinari, P. and Weston, J.
# Solving MultiClass Support Vector Machines with LaRank. ICML 2007.
#
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_multiclass.dat')
parameter_list = [[traindat,testdat,label_traindat,0.9,1,6],[traindat,testdat,label_traindat,0.8,1,5]]
def classifier_larank_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,C=0.9,num_threads=1,num_iter=5):
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import LaRank
from shogun.Library import Math_init_random
Math_init_random(17)
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
epsilon=1e-5
labels=Labels(label_train_multiclass)
svm=LaRank(C, kernel, labels)
#svm.set_tau(1e-3)
#svm.set_batch_mode(False)
#svm.io.enable_progress()
svm.set_epsilon(epsilon)
svm.train()
predictions = svm.classify(feats_train)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print 'LaRank'
classifier_larank_modular(*parameter_list[0])
# In this example a two-class linear classifier based on the Linear Discriminant
# Analysis (LDA) is trained on a toy data set and then the trained classifier is
# used to predict test examples. The regularization parameter, which corresponds
# to the weight of an identity matrix added to the covariance matrix, is set to
# gamma=3.
#
# For more details on the LDA see e.g.
# http://en.wikipedia.org/wiki/Linear_discriminant_analysis
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,3,1],[traindat,testdat,label_traindat,4,1]]
def classifier_lda_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_twoclass=label_traindat,gamma=3,num_threads=1):
from shogun.Features import RealFeatures, Labels
from shogun.Classifier import LDA
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
labels=Labels(label_train_twoclass)
lda=LDA(gamma, feats_train, labels)
lda.train()
bias = lda.get_bias()
w = lda.get_w()
lda.set_features(feats_test)
out = lda.classify().get_labels()
return lda, out
if __name__=='__main__':
print 'LDA'
classifier_lda_modular(*parameter_list[0])
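# For intuition, the regularized LDA weight vector can be written down directly:
# w is proportional to inv(S + gamma*I)(mu_+ - mu_-), with S the pooled
# covariance matrix and gamma the regularization weight. A minimal NumPy sketch
# of this formula (an illustration, not shogun's exact estimator):
import numpy

def lda_weights(X_pos, X_neg, gamma=3.0):
    # X_pos, X_neg: d x n matrices, one example per column
    mu_p = X_pos.mean(axis=1)
    mu_n = X_neg.mean(axis=1)
    S = numpy.cov(numpy.hstack((X_pos, X_neg)))   # pooled covariance estimate
    w = numpy.linalg.solve(S + gamma*numpy.eye(S.shape[0]), mu_p - mu_n)
    return w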
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is then used to predict labels of
# test examples. As training algorithm the LIBLINEAR solver is used with the SVM
# regularization parameter C=0.9, the bias in the classification rule switched
# on, and the precision parameter epsilon=1e-5.
#
# For more details on LIBLINEAR see
# http://www.csie.ntu.edu.tw/~cjlin/liblinear/
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,0.9,1e-3],[traindat,testdat,label_traindat,0.8,1e-2]]
def classifier_liblinear_modular(fm_train_real, fm_test_real,
label_train_twoclass, C, epsilon):
from shogun.Features import RealFeatures, SparseRealFeatures, Labels
from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC_DUAL
from shogun.Library import Math_init_random
Math_init_random(17)
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
labels=Labels(label_train_twoclass)
svm=LibLinear(C, feats_train, labels)
svm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)
svm.set_epsilon(epsilon)
svm.set_bias_enabled(True)
svm.train()
svm.set_features(feats_test)
predictions = svm.classify()
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print 'LibLinear'
classifier_liblinear_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# 2-dimensional randomly generated data set and the trained classifier is used to
# predict labels of test examples. As training algorithm the LIBSVM solver is used
# with SVM regularization parameter C=1 and a Gaussian kernel of width 2.1.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
from numpy import concatenate, ones, sign, mean
from numpy.random import randn
from shogun.Features import RealFeatures, Labels
from shogun.Classifier import LibSVM
from shogun.Kernel import GaussianKernel
num=1000
dist=1
width=2.1
C=1
traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
trainlab=concatenate((-ones(num), ones(num)))
testlab=concatenate((-ones(num), ones(num)))
feats_train=RealFeatures(traindata_real)
feats_test=RealFeatures(testdata_real)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=Labels(trainlab)
svm=LibSVM(C, kernel, labels)
svm.train()
kernel.init(feats_train, feats_test)
out=svm.classify().get_labels()
testerr=mean(sign(out)!=testlab)
print testerr
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the LIBSVM solver is used with SVM
# regularization parameter C=1 and a Gaussian kernel of width 2.1 and the
# precision parameter epsilon=1e-5. The example also shows how to retrieve the
# support vectors from the train SVM model.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]
def classifier_libsvm_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_twoclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import LibSVM
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=Labels(label_train_twoclass)
svm=LibSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
kernel.init(feats_train, feats_test)
sv_idx=svm.get_support_vectors()
alphas=svm.get_alphas()
predictions = svm.classify()
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print 'LibSVM'
classifier_libsvm_modular(*parameter_list[0])
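# The alphas and support vector indices retrieved above suffice to evaluate the
# SVM decision function by hand: f(x) = sum_i alpha_i k(x_i, x) + b. A sketch of
# this reconstruction, assuming get_alphas() returns the signed coefficients
# alpha_i*y_i and that kernel.init(feats_train, feats_test) has been called so
# that get_kernel_matrix() yields the train-by-test matrix:
import numpy

def decision_values(svm, kernel):
    K = kernel.get_kernel_matrix()   # num_train x num_test kernel matrix
    sv = svm.get_support_vectors()   # indices into the training set
    alphas = svm.get_alphas()        # signed coefficients of the support vectors
    return numpy.dot(alphas, K[sv, :]) + svm.get_bias()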
# In this example a multi-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the LIBSVM solver is used with SVM
# regularization parameter C=1 and a Gaussian kernel of width 2.1 and the
# precision parameter epsilon=1e-5.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_multiclass.dat')
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]
def classifier_libsvmmulticlass_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import LibSVMMultiClass
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=Labels(label_train_multiclass)
svm=LibSVMMultiClass(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
kernel.init(feats_train, feats_test)
predictions = svm.classify()
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print 'LibSVMMultiClass'
classifier_libsvmmulticlass_modular(*parameter_list[0])
# In this example a one-class support vector machine classifier is trained on a
# toy data set. The training algorithm finds a hyperplane in the RKHS which
# separates the training data from the origin. The one-class classifier is
# typically used to estimate the support of a high-dimensional distribution.
# For more details see e.g.
# B. Schoelkopf et al. Estimating the support of a high-dimensional
# distribution. Neural Computation, 13, 2001, 1443-1471.
#
# In the example, the one-class SVM is trained by the LIBSVM solver with the
# regularization parameter C=1, a Gaussian kernel of width 2.1, and the
# precision parameter epsilon=1e-5.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,2.2,1,1e-7],[traindat,testdat,2.1,1,1e-5]]
def classifier_libsvmoneclass_modular (fm_train_real=traindat,fm_test_real=testdat,width=2.1,C=1,epsilon=1e-5):
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import LibSVMOneClass
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
svm=LibSVMOneClass(C, kernel)
svm.set_epsilon(epsilon)
svm.train()
kernel.init(feats_train, feats_test)
predictions = svm.classify()
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print 'LibSVMOneClass'
classifier_libsvmoneclass_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Minimal Primal Dual SVM is used with SVM
# regularization parameter C=1, a Gaussian kernel of width 2.1, and the
# precision parameter epsilon=1e-5.
#
# For more details on the MPD solver see
# Kienzle, W. and B. Schölkopf: Training Support Vector Machines with Multiple
# Equality Constraints. Machine Learning: ECML 2005, 182-193. (Eds.) Carbonell,
# J. G., J. Siekmann, Springer, Berlin, Germany (November 2005)
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,1,1e-5],[traindat,testdat,label_traindat,0.9,1e-5]]
def classifier_mpdsvm_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_twoclass=label_traindat,C=1,epsilon=1e-5):
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Classifier import MPDSVM
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
labels=Labels(label_train_twoclass)
svm=MPDSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
kernel.init(feats_train, feats_test)
predictions = svm.classify()
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print 'MPDSVM'
classifier_mpdsvm_modular(*parameter_list[0])
# This example shows usage of the Perceptron algorithm for training a two-class
# linear classifier, i.e. y = sign( <x,w>+b). The Perceptron algorithm works by
# iteratively passing through the training examples and applying the update rule to
# those examples which are misclassified by the current classifier. The Perceptron
# update rule reads
#
# w(t+1) = w(t) + alpha * y_t * x_t
# b(t+1) = b(t) + alpha * y_t
#
# where (x_t,y_t) are the feature vector and label (+1/-1) of the misclassified example,
# (w(t),b(t)) are the current parameters of the linear classifier,
# (w(t+1),b(t+1)) are the updated parameters, and
# alpha is the learning rate; in this example alpha=1.
#
# The Perceptron algorithm iterates until all training examples are correctly
# classified or the prescribed maximal number of iterations, in this example
# max_iter=1000, is reached.
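# A minimal NumPy sketch of the update rule above (an illustration only; the
# shogun example follows). X holds one example per column, y the +1/-1 labels:
import numpy

def perceptron_train(X, y, alpha=1.0, max_iter=1000):
    w = numpy.zeros(X.shape[0])
    b = 0.0
    for _ in xrange(max_iter):
        errors = 0
        for t in xrange(X.shape[1]):
            if y[t]*(numpy.dot(w, X[:, t]) + b) <= 0:   # misclassified
                w += alpha*y[t]*X[:, t]
                b += alpha*y[t]
                errors += 1
        if errors == 0:   # all training examples correctly classified
            break
    return w, b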
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,1.,1000,1],[traindat,testdat,label_traindat,1.,1000,1]]
def classifier_perceptron_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_twoclass=label_traindat,learn_rate=1.,max_iter=1000,num_threads=1):
from shogun.Features import RealFeatures, Labels
from shogun.Classifier import Perceptron
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
labels=Labels(label_train_twoclass)
perceptron=Perceptron(feats_train, labels)
perceptron.set_learn_rate(learn_rate)
perceptron.set_max_iter(max_iter)
# only guaranteed to converge for separable data
perceptron.train()
perceptron.set_features(feats_test)
out_labels = perceptron.classify().get_labels()
return perceptron, out_labels
if __name__=='__main__':
print 'Perceptron'
classifier_perceptron_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the steepest descent subgradient algorithm is
# used. The SVM regularization parameter is set to C=0.9 and the bias in the
# classification rule is switched off. The solver iterates until it finds an
# epsilon-precise solution (epsilon=1e-3) or the maximal training time
# max_train_time=1 (seconds) is exceeded.
#
# Note that this solver often does not converge because the steepest descent
# subgradient algorithm is oversensitive to rounding errors. Note also that this
# is unpublished work which was a predecessor of the OCAS solver (see
# classifier_svmocas).
from tools.load import LoadMatrix
lm=LoadMatrix()
train=lm.load_numbers('../data/fm_train_real.dat')
test=lm.load_numbers('../data/fm_test_real.dat')
labels=lm.load_labels('../data/label_train_twoclass.dat')
parameter_list=[[train,test,labels,5,1e-3,3.0], [train,test,labels,0.9,1e-2,1.0]]
def classifier_subgradientsvm_modular(fm_train_real, fm_test_real,
label_train_twoclass, C, epsilon, max_train_time):
from shogun.Features import RealFeatures, SparseRealFeatures, Labels
from shogun.Classifier import SubGradientSVM
realfeat=RealFeatures(fm_train_real)
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(fm_test_real)
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
labels=Labels(label_train_twoclass)
svm=SubGradientSVM(C, feats_train, labels)
svm.set_epsilon(epsilon)
svm.set_max_train_time(max_train_time)
svm.train()
svm.set_features(feats_test)
labels=svm.classify().get_labels()
return labels, svm
if __name__=='__main__':
print 'SubGradientSVM'
classifier_subgradientsvm_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with the SVM
# regularization parameter C, the Weighted Degree kernel of degree 20, and
# a precision parameter epsilon. The LINADD trick is used to speed up
# training.
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel and the LINADD trick see
# Sonnenburg, S. and Rätsch, G. and Rieck, K. Large Scale Learning with String
# Kernels. In Bottou, Leon and Chapelle, Olivier and DeCoste, Dennis and Weston,
# Jason, editor, In Large Scale Kernel Machines, pages 73-103, MIT Press,
# Cambridge, MA. 2007.
#
from tools.load import LoadMatrix
lm=LoadMatrix()
train_dna=lm.load_dna('../data/fm_train_dna.dat')
test_dna=lm.load_dna('../data/fm_test_dna.dat')
label=lm.load_labels('../data/label_train_dna.dat')
parameter_list=[[train_dna, test_dna, label, 20, 0.9, 1e-3, 1],
[train_dna, test_dna, label, 20, 2.3, 1e-5, 4]]
def classifier_svmlight_batch_linadd_modular(fm_train_dna, fm_test_dna,
label_train_dna, degree, C, epsilon, num_threads):
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel, MSG_DEBUG
try:
from shogun.Classifier import SVMLight
except ImportError:
print 'No support for SVMLight available.'
return
feats_train=StringCharFeatures(DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels=Labels(label_train_dna)
svm=SVMLight(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()
kernel.init(feats_train, feats_test)
#print 'SVMLight Objective: %f num_sv: %d' % \
# (svm.get_objective(), svm.get_num_support_vectors())
svm.set_batch_computation_enabled(False)
svm.set_linadd_enabled(False)
# classify once without batch computation and LINADD ...
out_nobatch = svm.classify().get_labels()
# ... then once more with batch computation enabled
svm.set_batch_computation_enabled(True)
labels = svm.classify().get_labels()
return labels, svm
if __name__=='__main__':
print 'SVMlight batch'
classifier_svmlight_batch_linadd_modular(*parameter_list[0])
# This example demonstrates how to train an SVMLight classifier
# using a custom linear term. This is used in the class DASVM that
# pre-computes this linear term using a previously trained SVM.
#
import numpy
traindna=['CGCACGTACGTAGCTCGAT',
'CGACGTAGTCGTAGTCGTA',
'CGACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACCACAGTTATATAGTA',
'CGACGTAGTCGTAGTCGTA',
'CGACGTAGTTTTTTTCGTA',
'CGACGTAGTCGTAGCCCCA',
'CAAAAAAAAAAAAAAAATA',
'CGACGGGGGGGGGGGCGTA']
label_traindna=numpy.array(5*[-1.0] + 5*[1.0])
testdna=['AGCACGTACGTAGCTCGAT',
'AGACGTAGTCGTAGTCGTA',
'CAACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGAACACAGTTATATAGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACGTGGGGTTTTTCGTA',
'CGACGTAGTCCCAGCCCCA',
'CAAAAAAAAAAAACCAATA',
'CGACGGCCGGGGGGGCGTA']
label_test_dna=numpy.array(5*[-1.0] + 5*[1.0])
parameter_list = [[traindna,testdna,label_traindna,3,10,1e-5,1],[traindna,testdna,label_traindna,3,10,1e-5,1]]
def classifier_svmlight_linear_term_modular(fm_train_dna=traindna,fm_test_dna=testdna, \
label_train_dna=label_traindna,degree=3, \
C=10,epsilon=1e-5,num_threads=1):
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
from shogun.Classifier import SVMLight
feats_train=StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels=Labels(label_train_dna)
svm=SVMLight(C, kernel, labels)
svm.set_qpsize(3)
svm.set_linear_term(-numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double))
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()
kernel.init(feats_train, feats_test)
out = svm.classify().get_labels()
return out,kernel
if __name__=='__main__':
print 'SVMLight'
classifier_svmlight_linear_term_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=1.2 and the Weighted Degree kernel of degree 20 and
# the precision parameter epsilon=1e-5.
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel see
# G. Rätsch, S. Sonnenburg, and B. Schoelkopf. RASE: recognition of alternatively
# spliced exons in C. elegans. Bioinformatics, 21:369-377, June 2005.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list = [[traindat,testdat,label_traindat,1.1,1e-5,1],[traindat,testdat,label_traindat,1.2,1e-5,1]]
def classifier_svmlight_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,C=1.2,epsilon=1e-5,num_threads=1):
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
try:
from shogun.Classifier import SVMLight
except ImportError:
print 'No support for SVMLight available.'
return
feats_train=StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)
degree=20
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels=Labels(label_train_dna)
svm=SVMLight(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()
kernel.init(feats_train, feats_test)
out = svm.classify().get_labels()
return out,kernel
if __name__=='__main__':
print 'SVMLight'
classifier_svmlight_modular(*parameter_list[0])
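# For intuition, the Weighted Degree kernel compares two equal-length strings by
# counting matching substrings of length 1..degree at corresponding positions:
# k(x,y) = sum_d beta_d sum_i I(x[i:i+d] == y[i:i+d]), with the standard weights
# beta_d = 2*(degree-d+1)/(degree*(degree+1)). A simplified sketch; shogun's
# implementation additionally supports normalization and position weights:
def wd_kernel(x, y, degree=3):
    value = 0.0
    for d in range(1, degree+1):
        beta = 2.0*(degree - d + 1)/(degree*(degree + 1))
        for i in range(len(x) - d + 1):
            if x[i:i+d] == y[i:i+d]:
                value += beta
    return value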
# In this example a two-class linear support vector machine classifier (SVM) is
# trained on a toy data set and the trained classifier is used to predict labels
# of test examples. As training algorithm the SVMLIN solver is used with the SVM
# regularization parameter C=0.9, the bias in the classification rule switched
# on, and the precision parameter epsilon=1e-5. The example also shows how to
# retrieve the parameters (vector w and bias b) of the trained linear classifier.
#
# For more details on the SVMLIN solver see
# V. Sindhwani, S.S. Keerthi. Newton Methods for Fast Solution of Semi-supervised
# Linear SVMs. Large Scale Kernel Machines MIT Press (Book Chapter), 2007
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,0.9,1e-5,1],[traindat,testdat,label_traindat,0.8,1e-5,1]]
def classifier_svmlin_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_twoclass=label_traindat,C=0.9,epsilon=1e-5,num_threads=1):
from shogun.Features import RealFeatures, SparseRealFeatures, Labels
from shogun.Classifier import SVMLin
realfeat=RealFeatures(fm_train_real)
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(fm_test_real)
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
labels=Labels(label_train_twoclass)
svm=SVMLin(C, feats_train, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.set_bias_enabled(True)
svm.train()
svm.set_features(feats_test)
bias = svm.get_bias()
w = svm.get_w()
predictions = svm.classify()
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print 'SVMLin'
classifier_svmlin_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the OCAS solver is used with the SVM
# regularization parameter C=0.9, the bias term in the classification rule
# switched off, and the precision parameter epsilon=1e-5 (duality gap).
#
# For more details on the OCAS solver see
# V. Franc, S. Sonnenburg. Optimized Cutting Plane Algorithm for Large-Scale Risk
# Minimization. The Journal of Machine Learning Research, vol. 10,
# pp. 2157--2192. October 2009.
#
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,0.9,1e-5,1],[traindat,testdat,label_traindat,0.8,1e-5,1]]
def classifier_svmocas_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_twoclass=label_traindat,C=0.9,epsilon=1e-5,num_threads=1):
from shogun.Features import RealFeatures, SparseRealFeatures, Labels
from shogun.Classifier import SVMOcas
realfeat=RealFeatures(fm_train_real)
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(fm_test_real)
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
labels=Labels(label_train_twoclass)
svm=SVMOcas(C, feats_train, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.set_bias_enabled(False)
svm.train()
svm.set_features(feats_test)
predictions = svm.classify()
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print 'SVMOcas'
classifier_svmocas_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Stochastic Gradient Descent (SGD) solver is
# used with the SVM regularization parameter C=0.9. The number of iterations, i.e.
# passes through all training examples, is set to num_iter=5.
#
# For more details on the SGD solver see
# L. Bottou, O. Bousquet. The tradeoff of large scale learning. In NIPS 20. MIT
# Press. 2008.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,0.9,1,6],[traindat,testdat,label_traindat,0.8,1,5]]
def classifier_svmsgd_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_twoclass=label_traindat,C=0.9,num_threads=1,num_iter=5):
from shogun.Features import RealFeatures, SparseRealFeatures, Labels
from shogun.Classifier import SVMSGD
realfeat=RealFeatures(fm_train_real)
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(fm_test_real)
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
labels=Labels(label_train_twoclass)
svm=SVMSGD(C, feats_train, labels)
svm.set_epochs(num_iter)
#svm.io.set_loglevel(0)
svm.train()
svm.set_features(feats_test)
predictions = svm.classify()
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print 'SVMSGD'
classifier_svmsgd_modular(*parameter_list[0])
# In this example an agglomerative hierarchical single linkage clustering method
# is used to cluster a given toy data set. Starting with each object assigned
# to its own cluster, clusters are iteratively merged. In each step, the two
# clusters whose closest elements are nearest to each other (minimum distance,
# here measured via the Euclidean distance object) are merged.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
parameter_list = [[traindat,3],[traindat,4]]
def clustering_hierarchical_modular (fm_train=traindat,merges=3):
from shogun.Distance import EuclidianDistance
from shogun.Features import RealFeatures
from shogun.Clustering import Hierarchical
feats_train=RealFeatures(fm_train)
distance=EuclidianDistance(feats_train, feats_train)
hierarchical=Hierarchical(merges, distance)
hierarchical.train()
out_distance = hierarchical.get_merge_distances()
out_cluster = hierarchical.get_cluster_pairs()
return hierarchical,out_distance,out_cluster
if __name__=='__main__':
print 'Hierarchical'
clustering_hierarchical_modular(*parameter_list[0])
# In this example the k-means clustering method is used to cluster a given toy
# data set. In k-means clustering one tries to partition n observations into k
# clusters in which each observation belongs to the cluster with the nearest mean.
# The algorithm class constructor takes the number of clusters and a distance to
# be used as input. The distance used in this example is Euclidean distance.
# After training one can fetch the result of clustering by obtaining the cluster
# centers and their radii.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
parameter_list = [[traindat,3],[traindat,4]]
def clustering_kmeans_modular (fm_train=traindat,k=3):
from shogun.Distance import EuclidianDistance
from shogun.Features import RealFeatures
from shogun.Clustering import KMeans
from shogun.Library import Math_init_random
Math_init_random(17)
feats_train=RealFeatures(fm_train)
distance=EuclidianDistance(feats_train, feats_train)
kmeans=KMeans(k, distance)
kmeans.train()
out_centers = kmeans.get_cluster_centers()
kmeans.get_radiuses()
return out_centers, kmeans
if __name__=='__main__':
print 'KMeans'
clustering_kmeans_modular(*parameter_list[0])
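# Once trained, new points can be assigned to their nearest cluster center,
# which is the k-means decision rule. A small NumPy sketch, assuming the centers
# are stored one per column as returned by get_cluster_centers() above:
import numpy

def assign_clusters(centers, X):
    # squared Euclidean distances from each point (column of X) to each center
    d2 = ((X[:, None, :] - centers[:, :, None])**2).sum(axis=0)
    return d2.argmin(axis=0)   # index of the nearest center for every point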
# The approach applied below shows how to process input data loaded from a
# file, a crucial step when writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# accessed via 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CBrayCurtisDistance.html.
#
# Obviously, using the Bray Curtis distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_braycurtis_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures
from shogun.Distance import BrayCurtisDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=BrayCurtisDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'BrayCurtisDistance'
distance_braycurtis_modular(*parameter_list[0])
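# The Bray-Curtis dissimilarity of two vectors is commonly defined as
# d(x,y) = sum_i |x_i - y_i| / sum_i |x_i + y_i|. A one-entry NumPy check of the
# matrices above, assuming shogun uses this definition:
import numpy

def braycurtis(x, y):
    return numpy.abs(x - y).sum()/numpy.abs(x + y).sum()

# e.g. braycurtis(traindat[:,0], traindat[:,1]) should match dm_train[0,1]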
# The approach applied below shows how to process input data loaded from a
# file, a crucial step when writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (dissimilarity ratio) matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# accessed via 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CCanberraMetric.html.
#
# Obviously, using the Canberra distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_canberra_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures
from shogun.Distance import CanberraMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=CanberraMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'CanberraMetric'
distance_canberra_modular(*parameter_list[0])
# This example shows how to compute the Canberra Word Distance.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindna,testdna,3,0,False],[traindna,testdna,3,0,False]]
def distance_canberraword_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False):
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
from shogun.PreProc import SortWordString
from shogun.Distance import CanberraWordDistance
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
distance=CanberraWordDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'CanberraWordDistance'
distance_canberraword_modular(*parameter_list[0])
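# Conceptually, the StringWordFeatures pipeline above turns each DNA string into
# overlapping substrings of length 'order' (k-mers), which the word distance then
# compares. A rough sketch of that preprocessing step, ignoring shogun's
# gap/reverse options and its integer word encoding:
def kmers(s, order=3):
    return [s[i:i+order] for i in range(len(s) - order + 1)]

# e.g. kmers('CGCACG', 3) -> ['CGC', 'GCA', 'CAC', 'ACG']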
# The approach applied below shows how to process input data loaded from a
# file, a crucial step when writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (maximum of absolute feature dimension differences) matrix is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (maximum of absolute feature dimension differences) matrix between these
# two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# accessed via 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CChebyshewMetric.html.
#
# Obviously, using the Chebyshev distance (shogun's ChebyshewMetric) is not
# limited to this showcase example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_chebyshew_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures
from shogun.Distance import ChebyshewMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=ChebyshewMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'ChebyshewMetric'
distance_chebyshew_modular(*parameter_list[0])
# The approach applied below shows how to process input data loaded from a
# file, a crucial step when writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# accessed via 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CChiSquareDistance.html.
#
# Obviously, using the ChiSquare distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_chisquare_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures
from shogun.Distance import ChiSquareDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=ChiSquareDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'ChiSquareDistance'
distance_chisquare_modular(*parameter_list[0])
# The approach applied below shows how to process input data loaded from a
# file, a crucial step when writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# accessed via 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CCosineDistance.html.
#
# Obviously, using the Cosine distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_cosine_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures
from shogun.Distance import CosineDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=CosineDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'CosineDistance'
distance_cosine_modular(*parameter_list[0])
# The approach applied below shows how to process input data loaded from a
# file, a crucial step when writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# accessed via 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CEuclidianDistance.html.
#
# Obviously, using the Euclidean distance (shogun's EuclidianDistance) is not
# limited to this showcase example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_euclidian_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures
from shogun.Distance import EuclidianDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclidianDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'EuclidianDistance'
distance_euclidian_modular(*parameter_list[0])
# The approach applied below shows how to process input data loaded from a
# file, a crucial step when writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a
# pairwise distance (shortest path on a sphere) matrix is computed
# by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (shortest path on a sphere) matrix between these two data sets is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# accessed via 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CGeodesicMetric.html.
#
# Obviously, using the Geodesic distance is not limited to this showcase
# example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_geodesic_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures
from shogun.Distance import GeodesicMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=GeodesicMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'GeodesicMetric'
distance_geodesic_modular(*parameter_list[0])
# This example shows how to compute the Hamming Word Distance for string features.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
testdat = lm.load_labels('../data/fm_test_real.dat')
parameter_list = [[traindna,testdna,testdat,4,0,False,False],
[traindna,testdna,testdat,3,0,False,False]]
def distance_hammingword_modular (fm_train_dna=traindna,fm_test_dna=testdna,
fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False):
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
from shogun.PreProc import SortWordString
from shogun.Distance import HammingWordDistance
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
distance=HammingWordDistance(feats_train, feats_train, use_sign)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'HammingWordDistance'
distance_hammingword_modular(*parameter_list[0])
# The approach shown below, processing input data loaded from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (divergence measure based on the Kullback-Leibler divergence) matrix
# is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (divergence measure based on the Kullback-Leibler divergence) matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CJensenMetric.html.
#
# Obviously, using the Jensen-Shannon distance/divergence is not limited to
# this showcase example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_jensen_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures
from shogun.Distance import JensenMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=JensenMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'JensenMetric'
distance_jensen_modular(*parameter_list[0])
# This example shows how to compute the Manhattan Distance.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_manhatten_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures
from shogun.Distance import ManhattanMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=ManhattanMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'ManhattanMetric'
distance_manhatten_modular(*parameter_list[0])
# This example shows how to compute the Manhattan Distance for string features.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindna,testdna,testdat,3,0,False],[traindna,testdna,testdat,4,0,False]]
def distance_manhattenword_modular (fm_train_dna=traindna ,fm_test_dna=testdna,fm_test_real=testdat,order=3,gap=0,reverse=False):
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
from shogun.PreProc import SortWordString
from shogun.Distance import ManhattanWordDistance
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
distance=ManhattanWordDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return dm_train,dm_test
if __name__=='__main__':
print 'ManhattanWordDistance'
distance_manhattenword_modular(*parameter_list[0])
# The approach shown below, processing input data loaded from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) and norm 'k' controls the processing of the given data points,
# where a pairwise distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CMinkowskiMetric.html.
#
# Obviously, using the Minkowski metric is not limited to this showcase
# example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,3],[traindat,testdat,4]]
def distance_minkowski_modular (fm_train_real=traindat,fm_test_real=testdat,k=3):
from shogun.Features import RealFeatures
from shogun.Distance import MinkowskiMetric
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=MinkowskiMetric(feats_train, feats_train, k)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'MinkowskiMetric'
distance_minkowski_modular(*parameter_list[0])
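# Aside (not part of the shipped examples): for intuition, the Minkowski distance
# with norm 'k' between two vectors x and y is (sum_i |x_i - y_i|^k)^(1/k). A
# minimal NumPy sketch of the pairwise matrix between the columns of X and Y:
def minkowski_distance_matrix_sketch(X, Y, k):
    from numpy import newaxis
    # |x_i - y_i| for all column pairs via broadcasting
    diff=abs(X[:,:,newaxis]-Y[:,newaxis,:])
    return (diff**k).sum(axis=0)**(1.0/k)
# The following example computes the squared Euclidean distance; the square
# root is disabled via 'set_disable_sqrt'.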
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_normsquared_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures
from shogun.Distance import EuclidianDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclidianDistance(feats_train, feats_train)
distance.set_disable_sqrt(True)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'EuclidianDistance - NormSquared'
distance_normsquared_modular(*parameter_list[0])
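# This example shows how to compute the Euclidian Distance on sparse real-valued
# features; the dense input matrices are first converted to 'SparseRealFeatures'.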
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_sparseeuclidean_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures, SparseRealFeatures
from shogun.Distance import SparseEuclidianDistance
realfeat=RealFeatures(fm_train_real)
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(fm_test_real)
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
distance=SparseEuclidianDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'SparseEuclidianDistance'
distance_sparseeuclidean_modular(*parameter_list[0])
# The approach shown below, processing input data loaded from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (extended Jaccard coefficient) matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (extended Jaccard coefficient) matrix between these two data sets is computed
# by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CTanimotoDistance.html.
#
# Obviously, using the Tanimoto distance/coefficient is not limited to
# this showcase example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_tanimoto_modular (fm_train_real=traindat,fm_test_real=testdat):
from shogun.Features import RealFeatures
from shogun.Distance import TanimotoDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=TanimotoDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print 'TanimotoDistance'
distance_tanimoto_modular(*parameter_list[0])
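# Aside (not part of the shipped examples): the extended Jaccard (Tanimoto)
# coefficient of two vectors x and y is commonly defined as
# <x,y> / (<x,x> + <y,y> - <x,y>); a minimal sketch for two single vectors,
# assuming TanimotoDistance is derived from this coefficient:
def tanimoto_coefficient_sketch(x, y):
    from numpy import dot
    xy=dot(x,y)
    return xy/(dot(x,x)+dot(y,y)-xy)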
# In this example the Histogram algorithm object computes a histogram over all
# 16-bit unsigned integers in the features.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
parameter_list = [[traindna,3,0,False],[traindna,4,0,False]]
def distribution_histogram_modular (fm_dna=traindna,order=3,gap=0,reverse=False):
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
from shogun.Distribution import Histogram
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_dna)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
histo=Histogram(feats)
histo.train()
histo.get_histogram()
num_examples=feats.get_num_vectors()
num_param=histo.get_num_model_parameters()
#for i in xrange(num_examples):
# for j in xrange(num_param):
# histo.get_log_derivative(j, i)
out_likelihood = histo.get_log_likelihood()
out_sample = histo.get_log_likelihood_sample()
return histo,out_sample,out_likelihood
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
print 'Histogram'
distribution_histogram_modular(*parameter_list[0])
# In this example a hidden Markov model with 3 states and 6 transitions is trained
# on a string data set. After calling the constructor of the HMM class, specifying
# the number of states and transitions, the model is trained via the Baum-Welch
# algorithm, which estimates the optimal transition and emission probabilities.
# The best path, i.e. the path with the highest probability given the model, can
# then be computed via best_path and get_best_path_state.
from tools.load import LoadMatrix
lm=LoadMatrix()
data=lm.load_cubes('../data/fm_train_cube.dat')
parameter_list=[[data, 1, 64, 1e-5, 2, 0, False, 5], [data, 3, 6, 1e-1, 1, 0, False, 2]]
def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
from shogun.Features import StringWordFeatures, StringCharFeatures, CUBE
from shogun.Distribution import HMM, BW_NORMAL
charfeat=StringCharFeatures(CUBE)
charfeat.set_features(fm_cube)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
hmm=HMM(feats, N, M, pseudo)
hmm.train()
hmm.baum_welch_viterbi_train(BW_NORMAL)
num_examples=feats.get_num_vectors()
num_param=hmm.get_num_model_parameters()
for i in xrange(num_examples):
for j in xrange(num_param):
hmm.get_log_derivative(j, i)
best_path=0
best_path_state=0
for i in xrange(num_examples):
best_path+=hmm.best_path(i)
for j in xrange(N):
best_path_state+=hmm.get_best_path_state(i, j)
lik_example = hmm.get_log_likelihood()
lik_sample = hmm.get_log_likelihood_sample()
return lik_example, lik_sample, hmm
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
print 'HMM'
distribution_hmm_modular(*parameter_list[0])
# Trains an inhomogeneous Markov chain of order 3 on a DNA string data set. Due to
# its structure this Markov chain is very similar to an HMM with just one
# chain of connected hidden states - that is why we termed it a linear HMM.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
parameter_list = [[traindna,3,0,False],[traindna,4,0,False]]
def distribution_linearhmm_modular (fm_dna=traindna,order=3,gap=0,reverse=False):
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
from shogun.Distribution import LinearHMM
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_dna)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
hmm=LinearHMM(feats)
hmm.train()
hmm.get_transition_probs()
num_examples=feats.get_num_vectors()
num_param=hmm.get_num_model_parameters()
for i in xrange(num_examples):
for j in xrange(num_param):
hmm.get_log_derivative(j, i)
out_likelihood = hmm.get_log_likelihood()
out_sample = hmm.get_log_likelihood_sample()
return hmm,out_likelihood ,out_sample
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
print 'LinearHMM'
distribution_linearhmm_modular(*parameter_list[0])
# This example shows how to read and write plain ASCII files, binary files and
# HDF5 datasets.
#
# For ASCII files it shows how to obtain shogun's RealFeatures
# (a simple feature matrix of doubles with 1 column == 1 example, nr_columns ==
# number of examples) and also sparse features in SVM light format.
#
# Binary files use a custom native format, and datasets can be read from /
# written to HDF5 files under an arbitrary group / path.
from tools.load import LoadMatrix
lm=LoadMatrix()
data=lm.load_numbers('../data/fm_train_real.dat')
label=lm.load_numbers('../data/label_train_twoclass.dat')
parameter_list=[[data,label]]
def features_io_modular(fm_train_real, label_train_twoclass):
import numpy
from shogun.Features import SparseRealFeatures, RealFeatures, Labels
from shogun.Kernel import GaussianKernel
from shogun.Library import AsciiFile, BinaryFile, HDF5File
feats=SparseRealFeatures(fm_train_real)
feats2=SparseRealFeatures()
f=BinaryFile("fm_train_sparsereal.bin","w")
feats.save(f)
f=AsciiFile("fm_train_sparsereal.ascii","w")
feats.save(f)
f=BinaryFile("fm_train_sparsereal.bin")
feats2.load(f)
f=AsciiFile("fm_train_sparsereal.ascii")
feats2.load(f)
feats=RealFeatures(fm_train_real)
feats2=RealFeatures()
f=BinaryFile("fm_train_real.bin","w")
feats.save(f)
f=HDF5File("fm_train_real.h5","w", "/data/doubles")
feats.save(f)
f=AsciiFile("fm_train_real.ascii","w")
feats.save(f)
f=BinaryFile("fm_train_real.bin")
feats2.load(f)
#print "diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))
f=AsciiFile("fm_train_real.ascii")
feats2.load(f)
#print "diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))
lab=Labels(numpy.array([1.0,2.0,3.0]))
lab2=Labels()
f=AsciiFile("label_train_twoclass.ascii","w")
lab.save(f)
f=BinaryFile("label_train_twoclass.bin","w")
lab.save(f)
f=HDF5File("label_train_real.h5","w", "/data/labels")
lab.save(f)
f=AsciiFile("label_train_twoclass.ascii")
lab2.load(f)
f=BinaryFile("label_train_twoclass.bin")
lab2.load(f)
f=HDF5File("fm_train_real.h5","r", "/data/doubles")
feats2.load(f)
#print feats2.get_feature_matrix()
f=HDF5File("label_train_real.h5","r", "/data/labels")
lab2.load(f)
#print lab2.get_labels()
#clean up
import os
for f in ['fm_train_sparsereal.bin','fm_train_sparsereal.ascii',
'fm_train_real.bin','fm_train_real.h5','fm_train_real.ascii',
'label_train_real.h5', 'label_train_twoclass.ascii','label_train_twoclass.bin']:
os.unlink(f)
return feats, feats2, lab, lab2
if __name__=='__main__':
print 'Features IO'
features_io_modular(*parameter_list[0])
# This example demonstrates how to read and write data in the SVMLight format
# with shogun.
#
parameter_list=[['../data/train_sparsereal.light']]
def features_read_svmlight_format_modular(fname):
import os
from shogun.Features import SparseRealFeatures
f=SparseRealFeatures()
lab=f.load_svmlight_file(fname)
f.write_svmlight_file('testwrite.light', lab)
os.unlink('testwrite.light')
if __name__=='__main__':
print 'Reading SVMLIGHT format'
features_read_svmlight_format_modular(*parameter_list[0])
# This example demonstrates how to encode small positive natural numbers
# (up to 255) in shogun using ByteFeatures.
import numpy
# create dense matrix A
A=numpy.array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=numpy.uint8)
parameter_list=[[A]]
def features_simple_byte_modular(A):
    from shogun.Features import ByteFeatures
    # create dense features a
    # ... of type Byte
    a=ByteFeatures(A)
    # print some statistics about a
    #print a.get_num_vectors()
    #print a.get_num_features()
    # get first feature vector and set it
    #print a.get_feature_vector(0)
    a.set_feature_vector(numpy.array([1,4,0,0,0,9], dtype=numpy.uint8), 0)
    # get matrix
    a_out = a.get_feature_matrix()
    #print type(a_out), a_out.dtype
    #print a_out
    assert(numpy.all(a_out==A))
    return a_out,a
if __name__=='__main__':
    print 'ByteFeatures'
    features_simple_byte_modular(*parameter_list[0])
# This example demonstrates how to encode features composed of 64-bit integers
# in Shogun using LongIntFeatures.
from shogun.Features import LongIntFeatures
from numpy import array, int64, all
# create dense matrix A
matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
parameter_list = [[matrix]]
# ... of type LongInt
def features_simple_longint_modular(A=matrix):
a=LongIntFeatures(A)
# get first feature vector and set it
a.set_feature_vector(array([1,4,0,0,0,9], dtype=int64), 0)
# get matrix
a_out = a.get_feature_matrix()
assert(all(a_out==A))
return a_out
if __name__=='__main__':
print 'simple_longint'
features_simple_longint_modular(*parameter_list[0])
# This example shows how to encode features that live in various vector spaces
# using the appropriate shogun objects. We demonstrate how to use
# three types of features: ByteFeatures (small integer values),
# LongIntFeatures (large integer values) and finally RealFeatures
# (real-valued vectors).
from shogun.Features import RealFeatures, LongIntFeatures, ByteFeatures
from numpy import array, float64, int64, uint8, all
# create dense matrices A,B,C
matrixA=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
matrixB=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
matrixC=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8)
# ... of type Real, LongInt and Byte
parameter_list = [[matrixA,matrixB,matrixC]]
def features_simple_modular(A=matrixA,B=matrixB,C=matrixC):
a=RealFeatures(A)
b=LongIntFeatures(B)
c=ByteFeatures(C)
# or 16bit wide ...
#feat1 = f.ShortFeatures(N.zeros((10,5),N.short))
#feat2 = f.WordFeatures(N.zeros((10,5),N.uint16))
# print some statistics about a
# get first feature vector and set it
a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)
# get matrices
a_out = a.get_feature_matrix()
b_out = b.get_feature_matrix()
c_out = c.get_feature_matrix()
assert(all(a_out==A))
assert(all(b_out==B))
assert(all(c_out==C))
return a_out,b_out,c_out,a,b,c
if __name__=='__main__':
print 'simple'
features_simple_modular(*parameter_list[0])
# This example demonstrates how to encode real-valued features in Shogun
# using RealFeatures.
from shogun.Features import RealFeatures
from numpy import array, float64, all
# create dense matrix A
matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
parameter_list = [[matrix]]
# ... of type Real
def features_simple_real_modular(A=matrix):
a=RealFeatures(A)
# print some statistics about a
#print a.get_num_vectors()
#print a.get_num_features()
# get first feature vector and set it
#print a.get_feature_vector(0)
a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)
# get matrix
a_out = a.get_feature_matrix()
assert(all(a_out==A))
return a_out
if __name__=='__main__':
print 'simple_real'
features_simple_real_modular(*parameter_list[0])
# Creates features similar to the feature space of the SNP kernel. Useful when
# working with linear methods.
parameter_list=[['../data/snps.dat']]
def features_snp_modular(fname):
    from shogun.Features import StringByteFeatures, SNPFeatures, SNP
    sf=StringByteFeatures(SNP)
    sf.load_ascii_file(fname, False, SNP, SNP)
    #print sf.get_features()
    snps=SNPFeatures(sf)
    #print snps.get_feature_matrix()
    #print snps.get_minor_base_string()
    #print snps.get_major_base_string()
if __name__=='__main__':
    print 'SNP Features'
    features_snp_modular(*parameter_list[0])
# This example demonstrates how to encode sparse (most entries zero),
# real-valued features in shogun using SparseRealFeatures.
import numpy
# create dense matrix A
A=numpy.array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=numpy.float64)
parameter_list=[[A]]
def features_sparse_modular(A):
    from scipy.sparse import csc_matrix
    from shogun.Features import SparseRealFeatures
    from numpy import array, float64, all
    # sparse representation X of dense matrix A
    # note, will work with types other than float64 too,
    # but requires recent scipy.sparse
    X=csc_matrix(A)
    #print A
    # create sparse shogun features from dense matrix A
    a=SparseRealFeatures(A)
    a_out=a.get_full_feature_matrix()
    #print a_out
    assert(all(a_out==A))
    # set sparse shogun features from sparse matrix X
    a.set_sparse_feature_matrix(X)
    a_out=a.get_full_feature_matrix()
    #print a_out
    assert(all(a_out==A))
    # create sparse shogun features from sparse matrix X
    a=SparseRealFeatures(X)
    a_out=a.get_full_feature_matrix()
    #print a_out
    assert(all(a_out==A))
    # obtain (data,row,indptr) csc arrays of sparse shogun features
    z=csc_matrix(a.get_sparse_feature_matrix())
    z_out=z.todense()
    #print z_out
    assert(all(z_out==A))
if __name__=='__main__':
    print 'Sparse Features'
    features_sparse_modular(*parameter_list[0])
# This example demonstrates how to use compressed strings with shogun.
# We currently support reading and writing compressed files using
# LZO, GZIP, BZIP2 and LZMA. Furthermore, we demonstrate how to extract
# compressed streams on-the-fly in order to fit data sets into
# memory that would be too large, otherwise.
#
parameter_list = [['features_string_char_compressed_modular.py']]
def features_string_char_compressed_modular(fname):
from shogun.Features import StringCharFeatures, StringFileCharFeatures, RAWBYTE
from shogun.Library import UNCOMPRESSED,LZO,GZIP,BZIP2,LZMA, MSG_DEBUG
from shogun.PreProc import DecompressCharString
f=StringFileCharFeatures(fname, RAWBYTE)
#print "original strings", f.get_features()
#uncompressed
f.save_compressed("foo_uncompressed.str", UNCOMPRESSED, 1)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_uncompressed.str", True)
#print "uncompressed strings", f2.get_features()
#print
# load compressed data and uncompress on load
#lzo
f.save_compressed("foo_lzo.str", LZO, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzo.str", True)
#print "lzo strings", f2.get_features()
#print
##gzip
f.save_compressed("foo_gzip.str", GZIP, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_gzip.str", True)
#print "gzip strings", f2.get_features()
#print
#bzip2
f.save_compressed("foo_bzip2.str", BZIP2, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_bzip2.str", True)
#print "bzip2 strings", f2.get_features()
#print
#lzma
f.save_compressed("foo_lzma.str", LZMA, 9)
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzma.str", True)
#print "lzma strings", f2.get_features()
#print
# load compressed data and uncompress via preprocessor
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzo.str", False)
f2.add_preproc(DecompressCharString(LZO))
f2.apply_preproc()
#print "lzo strings", f2.get_features()
#print
# load compressed data and uncompress on-the-fly via preprocessor
f2=StringCharFeatures(RAWBYTE);
f2.load_compressed("foo_lzo.str", False)
#f2.io.set_loglevel(MSG_DEBUG)
f2.add_preproc(DecompressCharString(LZO))
f2.enable_on_the_fly_preprocessing()
#print "lzo strings", f2.get_features()
#print
#clean up
import os
for f in ['foo_uncompressed.str', 'foo_lzo.str', 'foo_gzip.str',
'foo_bzip2.str', 'foo_lzma.str']:
if os.path.exists(f):
os.unlink(f)
##########################################################################################
# some perfectly compressible stuff follows
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
if __name__=='__main__':
print 'Compressing StringCharFileFeatures'
features_string_char_compressed_modular(*parameter_list[0])
# This example demonstrates how to encode ASCII-strings (255 symbols) in shogun.
strings=['hey','guys','i','am','a','string']
parameter_list=[[strings]]
def features_string_char_modular(strings):
    from shogun.Features import StringCharFeatures, RAWBYTE
    from numpy import array
    #create string features
    f=StringCharFeatures(strings, RAWBYTE)
    #and output several stats
    #print "max string length", f.get_max_vector_length()
    #print "number of strings", f.get_num_vectors()
    #print "length of first string", f.get_vector_length(0)
    #print "string[5]", ''.join(f.get_feature_vector(5))
    #print "strings", f.get_features()
    #replace string 0
    f.set_feature_vector(array(['t','e','s','t']), 0)
    #print "strings", f.get_features()
    return f.get_features(), f
if __name__=='__main__':
    print 'StringCharFeatures'
    features_string_char_modular(*parameter_list[0])
# This example demonstrates how to load ASCII features from a file into shogun.
parameter_list = [['features_string_file_char_modular.py']]
def features_string_file_char_modular(fname):
from shogun.Features import StringFileCharFeatures, RAWBYTE
f = StringFileCharFeatures(fname, RAWBYTE)
#print "strings", f.get_features()
return f
if __name__=='__main__':
print 'StringFileCharFeatures'
features_string_file_char_modular(*parameter_list[0])
# This example demonstrates how to load string features from files.
# We cover two cases: First, we show how to obtain StringCharFeatues
# from a directory of text files (particularly useful in computational biology)
# and second, we demonstrate how to load StringCharFeatues from one (multi-line) file.
#
parameter_list=[[".", "features_string_char_modular.py"]]
def features_string_file_modular(directory, fname):
from shogun.Features import StringCharFeatures, RAWBYTE
from shogun.Library import AsciiFile
# load features from directory
f=StringCharFeatures(RAWBYTE)
f.load_from_directory(directory)
#and output several stats
#print "max string length", f.get_max_vector_length()
#print "number of strings", f.get_num_vectors()
#print "length of first string", f.get_vector_length(0)
#print "str[0,0:3]", f.get_feature(0,0), f.get_feature(0,1), f.get_feature(0,2)
#print "len(str[0])", f.get_vector_length(0)
#print "str[0]", f.get_feature_vector(0)
#or load features from file (one string per line)
fil=AsciiFile(fname)
f.load(fil)
#print f.get_features()
#or load fasta file
#f.load_fasta('fasta.fa')
#print f.get_features()
return f.get_features(), f
if __name__=='__main__':
print 'StringCharFeatures from file'
features_string_file_modular(*parameter_list[0])
# This creates a HashedWDFeatures object, i.e. an approximation to the Weighted
# Degree kernel feature space via hashes. These features can be particularly fast
# in linear SVM solvers.
from shogun.Features import LongIntFeatures
from numpy import array, int64, all
# create dense matrix A
matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
parameter_list = [[matrix,3,1,2],[matrix,3,1,2]]
# ... of type LongInt
def features_string_hashed_wd_modular(A=matrix,order=3,start_order=1,hash_bits=2):
a=LongIntFeatures(A)
from numpy import array, uint8
from shogun.Features import StringByteFeatures, HashedWDFeatures, RAWDNA
from shogun.Library import MSG_DEBUG
x=[array([0,1,2,3,0,1,2,3,3,2,2,1,1],dtype=uint8)]
from_order=order
f=StringByteFeatures(RAWDNA)
#f.io.set_loglevel(MSG_DEBUG)
f.set_features(x)
y=HashedWDFeatures(f,start_order,order,from_order,hash_bits)
fm=y.get_feature_matrix()
return fm
if __name__=='__main__':
print 'string_hashed_wd'
features_string_hashed_wd_modular(*parameter_list[0])
# In this example, we demonstrate how to obtain string features
# by using a sliding window in a memory-efficient way. Instead of copying
# the string for each position of the sliding window, we only store a reference
# with respect to the complete string. This is particularly useful when working
# with genomic data, where storing all explicitly copied strings in memory
# quickly becomes infeasible. In addition to a sliding window (of a particular
# length) over all positions, we also support defining a custom position
# list.
# create string features with a single string
s=10*'A' + 10*'C' + 10*'G' + 10*'T'
parameter_list=[[s]]
def features_string_sliding_window_modular(strings):
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Library import DynamicIntArray
    f=StringCharFeatures([strings], DNA)
    # slide a window of length 5 over features
    # (memory efficient, does not copy strings)
    f.obtain_by_sliding_window(5,1)
    #print f.get_num_vectors()
    #print f.get_vector_length(0)
    #print f.get_vector_length(1)
    #print f.get_features()
    # slide a window of length 4 over features
    # (memory efficient, does not copy strings)
    f.obtain_by_sliding_window(4,1)
    #print f.get_num_vectors()
    #print f.get_vector_length(0)
    #print f.get_vector_length(1)
    #print f.get_features()
    # extract string-windows at positions 0,6,16,25 of window size 4
    # (memory efficient, does not copy strings)
    f.set_features([s])
    positions=DynamicIntArray()
    positions.append_element(0)
    positions.append_element(6)
    positions.append_element(16)
    positions.append_element(25)
    f.obtain_by_position_list(4,positions)
    #print f.get_features()
    # now extract windows of size 8 from the same position list
    f.obtain_by_position_list(8,positions)
    #print f.get_features()
    return f
if __name__=='__main__':
    print 'Sliding Window'
    features_string_sliding_window_modular(*parameter_list[0])
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C}
# using 1 char = 1 byte per symbol would be wasteful, as we
# can encode 4 symbols using 2 bits only.
# Here, this is done in chunks of 64 bit (ulong).
parameter_list = [[0,2,0,False],[0,3,0,False]]
def features_string_ulong_modular(start=0,order=2,gap=0,rev=False):
from shogun.Features import StringCharFeatures, StringUlongFeatures, RAWBYTE
from numpy import array, uint64
#create string features
cf=StringCharFeatures(['hey','guys','string'], RAWBYTE)
uf=StringUlongFeatures(RAWBYTE)
uf.obtain_from_char(cf, start,order,gap,rev)
#replace string 0
uf.set_feature_vector(array([1,2,3,4,5], dtype=uint64), 0)
return uf.get_features(),uf.get_feature_vector(2), uf.get_num_vectors()
if __name__=='__main__':
print 'StringUlongFeatures'
features_string_ulong_modular(*parameter_list[0])
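# Aside (not part of the shipped examples): the bit-packing idea described above,
# sketched in plain Python. Four DNA symbols need two bits each, so up to 32
# symbols fit into a single 64-bit word:
def pack_dna_sketch(s):
    code={'A':0,'C':1,'G':2,'T':3}
    word=0
    for c in s:
        # shift previous symbols left and append the 2-bit code of c
        word=(word<<2)|code[c]
    return word
# pack_dna_sketch('ACGT') == 0b00011011 == 27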
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C}
# using 1 char = 1 byte per symbol would be wasteful, as we
# can encode 4 symbols using 2 bits only.
# Here, this is done in chunks of 16 bit (word).
strings=['hey','guys','string']
parameter_list=[[strings,0,2,0,False]]
def features_string_word_modular(strings, start, order, gap, rev):
from shogun.Features import StringCharFeatures, StringWordFeatures, RAWBYTE
from numpy import array, uint16
#create string features
cf=StringCharFeatures(strings, RAWBYTE)
wf=StringWordFeatures(RAWBYTE)
wf.obtain_from_char(cf, start, order, gap, rev)
#and output several stats
#print "max string length", wf.get_max_vector_length()
#print "number of strings", wf.get_num_vectors()
#print "length of first string", wf.get_vector_length(0)
#print "string[2]", wf.get_feature_vector(2)
#print "strings", wf.get_features()
#replace string 0
wf.set_feature_vector(array([1,2,3,4,5], dtype=uint16), 0)
#print "strings", wf.get_features()
return wf.get_features(), wf
if __name__=='__main__':
print 'StringWordFeatures'
features_string_word_modular(*parameter_list[0])
# This example demonstrates the use of the AUC Kernel.
###########################################################################
# kernel can be used to maximize AUC instead of margin in SVMs
###########################################################################
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
traindat = double(lm.load_numbers('../data/fm_train_real.dat'))
testdat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,1.7], [traindat,testdat,1.6]]
def kernel_auc_modular(fm_train_real=traindat,label_train_real=testdat,width=1.7):
from shogun.Kernel import GaussianKernel, AUCKernel
from shogun.Features import RealFeatures, Labels
feats_train=RealFeatures(fm_train_real)
subkernel=GaussianKernel(feats_train, feats_train, width)
kernel=AUCKernel(0, subkernel)
kernel.setup_auc_maximization( Labels(label_train_real) )
km_train=kernel.get_kernel_matrix()
return kernel
if __name__=='__main__':
print 'AUC'
kernel_auc_modular(*parameter_list[0])
# This is an example for the initialization of the chi2-kernel on real data, where
# each column of the matrices corresponds to one training/test example.
###########################################################################
# chi2 kernel
###########################################################################
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
traindat = double(lm.load_numbers('../data/fm_train_real.dat'))
testdat = double(lm.load_numbers('../data/fm_test_real.dat'))
parameter_list = [[traindat,testdat,1.4,10], [traindat,testdat,1.5,10]]
def kernel_chi2_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4, size_cache=10):
from shogun.Kernel import Chi2Kernel
from shogun.Features import RealFeatures
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'Chi2'
kernel_chi2_modular(*parameter_list[0])
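# Aside (not part of the shipped examples): a sketch of the chi2 kernel value for
# two single vectors with strictly positive entries (e.g. histograms), assuming
# the common form k(x,y) = exp(-1/width * sum_i (x_i - y_i)^2 / (x_i + y_i)) on
# which, to our understanding, the Chi2Kernel is based:
def chi2_kernel_sketch(x, y, width):
    from numpy import exp
    return exp(-((x-y)**2/(x+y)).sum()/width)
# In the next example a combined kernel is constructed from a custom kernel
# (a precomputed polynomial kernel matrix) and a polynomial kernel; an SVM is
# trained on the combined kernel and then used for prediction on the test data.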
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list= [[traindat,testdat,label_traindat],[traindat,testdat,label_traindat]]
def kernel_combined_custom_poly_modular(fm_train_real = traindat,fm_test_real = testdat,fm_label_twoclass=label_traindat):
from shogun.Features import CombinedFeatures, RealFeatures, Labels
from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
from shogun.Classifier import LibSVM
kernel = CombinedKernel()
feats_train = CombinedFeatures()
tfeats = RealFeatures(fm_train_real)
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, tfeats)
K = tkernel.get_kernel_matrix()
kernel.append_kernel(CustomKernel(K))
subkfeats_train = RealFeatures(fm_train_real)
feats_train.append_feature_obj(subkfeats_train)
subkernel = PolyKernel(10,2)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
labels = Labels(fm_label_twoclass)
svm = LibSVM(1.0, kernel, labels)
svm.train()
kernel = CombinedKernel()
feats_pred = CombinedFeatures()
pfeats = RealFeatures(fm_test_real)
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, pfeats)
K = tkernel.get_kernel_matrix()
kernel.append_kernel(CustomKernel(K))
subkfeats_test = RealFeatures(fm_test_real)
feats_pred.append_feature_obj(subkfeats_test)
subkernel = PolyKernel(10, 2)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_pred)
svm.set_kernel(kernel)
svm.classify()
km_train=kernel.get_kernel_matrix()
return km_train,kernel
if __name__=='__main__':
kernel_combined_custom_poly_modular(*parameter_list[0])
# This is an example for the initialization of a combined kernel, which is a weighted sum of,
# in this case, three kernels on real-valued and DNA string data. The sub-kernel weights are all set to 1.
#
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
traindat = double(lm.load_numbers('../data/fm_train_real.dat'))
testdat = double(lm.load_numbers('../data/fm_test_real.dat'))
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,traindna,testdna],[traindat,testdat,traindna,testdna]]
def kernel_combined_modular(fm_train_real=traindat,fm_test_real=testdat,fm_train_dna=traindna,fm_test_dna=testdna ):
from shogun.Kernel import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
from shogun.Features import RealFeatures, StringCharFeatures, CombinedFeatures, DNA
kernel=CombinedKernel()
feats_train=CombinedFeatures()
feats_test=CombinedFeatures()
subkfeats_train=RealFeatures(fm_train_real)
subkfeats_test=RealFeatures(fm_test_real)
subkernel=GaussianKernel(10, 1.1)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
degree=3
subkernel=FixedDegreeStringKernel(10, degree)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
subkernel=LocalAlignmentStringKernel(10)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'Combined'
kernel_combined_modular(*parameter_list[0])
# This is an example for the initialization of the CommUlongString-kernel. This kernel
# sums over k-mer matches (k='order'). For efficient computation a preprocessor is used
# that extracts and sorts all k-mers. If 'use_sign' is set to True, each k-mer is counted
# only once.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat =lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3,0,False ],[traindat,testdat,4,0,False]]
def kernel_comm_ulong_string_modular (fm_train_dna=traindat,fm_test_dna=testdat, order=3, gap=0, reverse = False):
from shogun.Kernel import CommUlongStringKernel
from shogun.Features import StringUlongFeatures, StringCharFeatures, DNA
from shogun.PreProc import SortUlongString
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringUlongFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortUlongString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringUlongFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
use_sign=False
kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'CommUlongString'
kernel_comm_ulong_string_modular(*parameter_list[0])
# This is an example for the initialization of the CommWordString-kernel (aka
# Spectrum or n-gram kernel; its name is derived from the unix command comm). This kernel
# sums over k-mer matches (k='order'). For efficient computation a preprocessor is used
# that extracts and sorts all k-mers. If 'use_sign' is set to True, each k-mer is counted
# only once.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,4,0,False, False],[traindat,testdat,4,0,False,False]]
def kernel_comm_word_string_modular (fm_train_dna=traindat, fm_test_dna=testdat, order=3, gap=0, reverse = False, use_sign = False):
from shogun.Kernel import CommWordStringKernel
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
from shogun.PreProc import SortWordString
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'CommWordString'
kernel_comm_word_string_modular(*parameter_list[0])
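# Aside (not part of the shipped examples): the spectrum kernel idea in plain
# Python. The kernel value is the dot product of the k-mer count vectors of two
# strings (or, with use_sign=True, the number of shared k-mers); shogun's
# preprocessor-based implementation computes this efficiently on sorted k-mers:
def spectrum_kernel_sketch(s, t, k, use_sign=False):
    def kmer_counts(u):
        c={}
        for i in xrange(len(u)-k+1):
            c[u[i:i+k]]=c.get(u[i:i+k],0)+1
        return c
    cs,ct=kmer_counts(s),kmer_counts(t)
    if use_sign:
        return len(set(cs)&set(ct))
    return sum(cs[w]*ct.get(w,0) for w in cs)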
# The constant kernel gives a trivial kernel matrix with all entries set to
# the same value defined by the argument 'c'.
#
parameter_list=[[23],[24]]
def kernel_const_modular (c=23):
    from shogun.Features import DummyFeatures
    from shogun.Kernel import ConstKernel
    feats_train=DummyFeatures(10)
    feats_test=DummyFeatures(17)
    kernel=ConstKernel(feats_train, feats_train, c)
    km_train=kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
if __name__=='__main__':
    print 'Const'
    kernel_const_modular(*parameter_list[0])
# A user defined custom kernel is assigned in this example, for which either only
# the lower triangle may be given (set_triangle_kernel_matrix_from_triangle), or
# a full matrix (set_full_kernel_matrix_from_full), or a full matrix which is then
# internally stored as a triangle (set_triangle_kernel_matrix_from_full). The
# kernel matrix is set once from float64 data and once from float32 data.
#
from numpy.random import seed
seed(42)
parameter_list=[[7],[8]]
def kernel_custom_modular (dim=7):
    from numpy.random import rand, seed
    from numpy import array, float32
    from shogun.Features import RealFeatures
    from shogun.Kernel import CustomKernel
    seed(17)
    data=rand(dim, dim)
    feats=RealFeatures(data)
    symdata=data+data.T
    lowertriangle=array([symdata[(x,y)] for x in xrange(symdata.shape[1]) for y in xrange(symdata.shape[0]) if y<=x])
    kernel=CustomKernel()
    # once with float64's
    kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
    km_triangletriangle=kernel.get_kernel_matrix()
    kernel.set_triangle_kernel_matrix_from_full(symdata)
    km_fulltriangle=kernel.get_kernel_matrix()
    kernel.set_full_kernel_matrix_from_full(data)
    km_fullfull=kernel.get_kernel_matrix()
    # now once with float32's
    data=array(data,dtype=float32)
    kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
    km_triangletriangle=kernel.get_kernel_matrix()
    kernel.set_triangle_kernel_matrix_from_full(symdata)
    km_fulltriangle=kernel.get_kernel_matrix()
    kernel.set_full_kernel_matrix_from_full(data)
    km_fullfull=kernel.get_kernel_matrix()
    return km_fullfull,kernel
if __name__=='__main__':
    print 'Custom'
    kernel_custom_modular(*parameter_list[0])
# This is an example for the initialization of the diag-kernel.
# The diag kernel has all kernel matrix entries but those on
# the main diagonal set to zero.
parameter_list=[[23],[24]]
def kernel_diag_modular (diag=23):
    from shogun.Features import DummyFeatures
    from shogun.Kernel import DiagKernel
    feats_train=DummyFeatures(10)
    feats_test=DummyFeatures(17)
    kernel=DiagKernel(feats_train, feats_train, diag)
    km_train=kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
if __name__=='__main__':
    print 'Diag'
    kernel_diag_modular(*parameter_list[0])
# With the distance kernel one can use any of the following distance metrics:
# BrayCurtisDistance()
# CanberraMetric()
# CanberraWordDistance()
# ChebyshewMetric()
# ChiSquareDistance()
# CosineDistance()
# Distance()
# EuclidianDistance()
# GeodesicMetric()
# HammingWordDistance()
# JensenMetric()
# ManhattanMetric()
# ManhattanWordDistance()
# MinkowskiMetric()
# RealDistance()
# SimpleDistance()
# SparseDistance()
# SparseEuclidianDistance()
# StringDistance()
# TanimotoDistance()
#
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
traindat = double(lm.load_numbers('../data/fm_train_real.dat'))
testdat = double(lm.load_numbers('../data/fm_test_real.dat'))
parameter_list=[[traindat,testdat,1.7],[traindat,testdat,1.8]]
def kernel_distance_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.7):
from shogun.Kernel import DistanceKernel
from shogun.Features import RealFeatures
from shogun.Distance import EuclidianDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclidianDistance()
kernel=DistanceKernel(feats_train, feats_train, width, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'Distance'
kernel_distance_modular(*parameter_list[0])
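# Aside (not part of the shipped examples): to our understanding, the distance
# kernel plugs the chosen distance d into k(x,y) = exp(-d(x,y)/width). A minimal
# sketch for two single vectors using the Euclidean distance:
def distance_kernel_sketch(x, y, width):
    from numpy import exp, sqrt
    return exp(-sqrt(((x-y)**2).sum())/width)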
# The class FKFeatures implements Fisher kernel features obtained from
# two Hidden Markov models.
#
# It was used in
#
# K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new
# discriminative kernel from probabilistic models. Neural Computation,
# 14:2397-2414, 2002.
#
# which also has the details.
#
# Note that FK-features are computed on the fly, so to be effective feature
# caching should be enabled.
#
# It inherits its functionality from CSimpleFeatures, which should be
# consulted for further reference.
#
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list = [[traindat,testdat,label_traindat,1,4,1e-1,1,0,False,[1,False,True]],[traindat,testdat,label_traindat,3,4,1e-1,1,0,False,[1,False,True]]]
fm_hmm_pos=[ traindat[i] for i in where([label_traindat==1])[1] ]
fm_hmm_neg=[ traindat[i] for i in where([label_traindat==-1])[1] ]
def kernel_fisher_modular(fm_train_dna=traindat, fm_test_dna=testdat,
label_train_dna=label_traindat,
N=1,M=4,pseudo=1e-1,order=1,gap=0,reverse=False,
kargs=[1,False,True]):
from shogun.Features import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
from shogun.Kernel import PolyKernel
from shogun.Distribution import HMM, BW_NORMAL#, MSG_DEBUG
# train HMM for positive class
charfeat=StringCharFeatures(fm_hmm_pos, DNA)
#charfeat.io.set_loglevel(MSG_DEBUG)
hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
pos=HMM(hmm_pos_train, N, M, pseudo)
pos.baum_welch_viterbi_train(BW_NORMAL)
# train HMM for negative class
charfeat=StringCharFeatures(fm_hmm_neg, DNA)
hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
neg=HMM(hmm_neg_train, N, M, pseudo)
neg.baum_welch_viterbi_train(BW_NORMAL)
# Kernel training data
charfeat=StringCharFeatures(fm_train_dna, DNA)
wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
# Kernel testing data
charfeat=StringCharFeatures(fm_test_dna, DNA)
wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
# get kernel on training data
pos.set_observations(wordfeats_train)
neg.set_observations(wordfeats_train)
feats_train=FKFeatures(10, pos, neg)
feats_train.set_opt_a(-1) #estimate prior
kernel=PolyKernel(feats_train, feats_train, *kargs)
km_train=kernel.get_kernel_matrix()
# get kernel on testing data
pos_clone=HMM(pos)
neg_clone=HMM(neg)
pos_clone.set_observations(wordfeats_test)
neg_clone.set_observations(wordfeats_test)
feats_test=FKFeatures(10, pos_clone, neg_clone)
feats_test.set_a(feats_train.get_a()) #use prior from training data
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print "Fisher Kernel"
kernel_fisher_modular(*parameter_list[0])
# The FixedDegree String kernel takes as input two strings of the same size and counts the number of matching substrings of length d.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat, testdat,3],[traindat,testdat,4]]
def kernel_fixed_degree_string_modular (fm_train_dna=traindat, fm_test_dna=testdat,degree=3):
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import FixedDegreeStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=FixedDegreeStringKernel(feats_train, feats_train, degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'FixedDegreeString'
kernel_fixed_degree_string_modular(*parameter_list[0])
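# Aside (not part of the shipped examples): a plain-Python sketch of the counting
# the FixedDegreeStringKernel performs, assuming it counts position-wise matching
# substrings of length 'degree' in two equally long strings:
def fixed_degree_kernel_sketch(s, t, degree):
    assert len(s)==len(t)
    return sum(1 for i in xrange(len(s)-degree+1) if s[i:i+degree]==t[i:i+degree])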
# The well known Gaussian kernel (swiss army knife for SVMs) on dense real valued features.
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat, 1.3],[traindat,testdat, 1.4]]
def kernel_gaussian_modular (fm_train_real=traindat,fm_test_real=testdat, width=1.3):
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'Gaussian'
kernel_gaussian_modular(*parameter_list[0])
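# Aside (not part of the shipped examples): shogun's GaussianKernel is, to our
# understanding, parameterized directly by 'width', i.e.
# k(x,y) = exp(-||x-y||^2 / width). A minimal sketch for two single vectors:
def gaussian_kernel_sketch(x, y, width):
    from numpy import exp
    return exp(-((x-y)**2).sum()/width)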
# An experimental kernel inspired by the WeightedDegreePositionStringKernel and the Gaussian kernel.
# The idea is to shift the dimensions of the input vectors against each other. 'shift_step' is the
# step size of the shifts and 'max_shift' is the maximal shift.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,1.8,2,1],[traindat,testdat,1.9,2,1]]
def kernel_gaussian_shift_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.8,max_shift=2,shift_step=1):
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianShiftKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=GaussianShiftKernel(feats_train, feats_train, width, max_shift, shift_step)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'GaussianShift'
kernel_gaussian_shift_modular(*parameter_list[0])
# The HistogramWordString kernel computes the TOP kernel on inhomogeneous Markov chains.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list=[[traindat,testdat,label_traindat,3,0,False],[traindat,testdat,label_traindat,3,0,False]]
def kernel_histogram_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,order=3,gap=0,reverse=False):
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
from shogun.Kernel import HistogramWordStringKernel
from shogun.Classifier import PluginEstimate#, MSG_DEBUG
charfeat=StringCharFeatures(DNA)
#charfeat.io.set_loglevel(MSG_DEBUG)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
pie=PluginEstimate()
labels=Labels(label_train_dna)
pie.set_labels(labels)
pie.set_features(feats_train)
pie.train()
kernel=HistogramWordStringKernel(feats_train, feats_train, pie)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
pie.set_features(feats_test)
pie.classify().get_labels()
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'PluginEstimate w/ HistogramWord'
kernel_histogram_word_string_modular(*parameter_list[0])
# This example shows how to save a computed kernel (matrix) to a file.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,1.9],[traindat,testdat,1.7]]
def kernel_io_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.9):
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Library import AsciiFile, BinaryFile
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
f=AsciiFile("gaussian_train.ascii","w")
kernel.save(f)
del f
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
f=AsciiFile("gaussian_test.ascii","w")
kernel.save(f)
del f
#clean up
import os
os.unlink("gaussian_test.ascii")
os.unlink("gaussian_train.ascii")
return km_train, km_test, kernel
if __name__=='__main__':
print 'Gaussian'
kernel_io_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on raw byte
# data.
###########################################################################
# linear kernel on byte features
###########################################################################
from tools.load import LoadMatrix
from numpy import ubyte
lm=LoadMatrix()
traindat = ubyte(lm.load_numbers('../data/fm_train_byte.dat'))
testdat = ubyte(lm.load_numbers('../data/fm_test_byte.dat'))
parameter_list=[[traindat,testdat],[traindat,testdat]]
def kernel_linear_byte_modular(fm_train_byte=traindat,fm_test_byte=testdat):
from shogun.Kernel import LinearKernel
from shogun.Features import ByteFeatures
feats_train=ByteFeatures(fm_train_byte)
feats_test=ByteFeatures(fm_test_byte)
kernel=LinearKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'LinearByte'
kernel_linear_byte_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on real-valued
# data using scaling factor 1.2.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.4]]
def kernel_linear_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.2):
from shogun.Features import RealFeatures
from shogun.Kernel import LinearKernel, AvgDiagKernelNormalizer
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=LinearKernel()
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'Linear'
kernel_linear_modular(*parameter_list[0])
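# A minimal numpy cross-check, under the assumption that AvgDiagKernelNormalizer
# with an explicit scale simply divides every raw dot product by that constant:
from numpy import dot
km_train,km_test,kernel=kernel_linear_modular(*parameter_list[0])
raw=dot(traindat.T, traindat)
print abs(km_train-raw/1.2).max()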
# This is an example for the initialization of a linear kernel on string data. The
# strings are all of the same length and consist of the characters 'ACGT' corresponding
# to the DNA-alphabet. Each column of the matrices of type char corresponds to
# one training/test example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]
def kernel_linear_string_modular (fm_train_dna=traindat,fm_test_dna=testdat):
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import LinearStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=LinearStringKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'LinearString'
kernel_linear_string_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on word (2-byte)
# data.
from tools.load import LoadMatrix
from numpy import ushort
lm=LoadMatrix()
traindat = ushort(lm.load_numbers('../data/fm_train_word.dat'))
testdat = ushort(lm.load_numbers('../data/fm_test_word.dat'))
parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.2]]
def kernel_linear_word_modular (fm_train_word=traindat,fm_test_word=testdat,scale=1.2):
from shogun.Kernel import LinearKernel, AvgDiagKernelNormalizer
from shogun.Features import WordFeatures
feats_train=WordFeatures(fm_train_word)
feats_test=WordFeatures(fm_test_word)
kernel=LinearKernel(feats_train, feats_train)
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'LinearWord'
kernel_linear_word_modular(*parameter_list[0])
# This is an example for the initialization of the local alignment kernel on
# DNA sequences, where each column of the matrices of type char corresponds to
# one training/test example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]
def kernel_local_alignment_string_modular(fm_train_dna=traindat,fm_test_dna=testdat):
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import LocalAlignmentStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=LocalAlignmentStringKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'LocalAlignmentString'
kernel_local_alignment_string_modular(*parameter_list[0])
# The LocalityImprovedString kernel is inspired by the polynomial kernel.
# Comparing neighboring characters it puts emphasis on local features.
#
# It can be defined as
# K({\bf x},{\bf x'})=\left(\sum_{i=0}^{T-1}\left(\sum_{j=-l}^{+l}w_jI_{i+j}({\bf x},{\bf x'})\right)^{d_1}\right)^{d_2},
# where
# I_i({\bf x},{\bf x'})=1 if x_i=x'_i and 0 otherwise.
#
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat,testdat,5,5,7],[traindat,testdat,5,5,7]]
def kernel_locality_improved_string_modular(fm_train_dna=traindat,fm_test_dna=testdat,length=5,inner_degree=5,outer_degree=7):
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import LocalityImprovedStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=LocalityImprovedStringKernel(
feats_train, feats_train, length, inner_degree, outer_degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'LocalityImprovedString'
kernel_locality_improved_string_modular(*parameter_list[0])
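# A tiny pure-Python sketch of the kernel formula above on two toy strings with
# toy parameters, assuming uniform window weights w_j=1 (shogun's actual
# weighting may differ); it is only meant to make the nested sums concrete:
x='ACGTACGT'; y='ACGAACGT'
l=1; d1=2; d2=2
T=len(x)
inner=[sum(1 for j in range(-l,l+1) if 0<=i+j<T and x[i+j]==y[i+j])**d1 for i in range(T)]
print sum(inner)**d2
# This is an example for the initialization of the MatchWordString kernel on DNA
# sequences mapped to word (k-mer) features; matching words of the two strings
# are counted and the count is taken to the power of 'degree'. The kernel is
# rescaled with an AvgDiagKernelNormalizer.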
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat, 3,1.4,10,3,0,False],[
traindat,testdat, 3,1.4,10,3,0,False]]
def kernel_match_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,
degree=3,scale=1.4,size_cache=10,order=3,gap=0,reverse=False):
from shogun.Kernel import MatchWordStringKernel, AvgDiagKernelNormalizer
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(DNA)
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(DNA)
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
kernel=MatchWordStringKernel(size_cache, degree)
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'MatchWordString'
kernel_match_word_string_modular(*parameter_list[0])
# This is an example initializing the oligo string kernel which takes distances
# between matching oligos (k-mers) into account via a gaussian. Variable 'k' defines the length
# of the oligo and variable 'width' the width of the gaussian. The oligo string kernel is
# implemented for the DNA-alphabet 'ACGT'.
#
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3,1.2,10],[traindat,testdat,4,1.3,10]]
def kernel_oligo_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,k=3,width=1.2,size_cache=10):
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import OligoStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=OligoStringKernel(size_cache, k, width)
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'OligoString'
kernel_oligo_string_modular(*parameter_list[0])
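# This is an example for the initialization of the PolyMatchString kernel on
# string (DNA) data. It counts the positions at which two equal-length strings
# match and takes this count to the power of 'degree'; if 'inhomogene' is True,
# +1 is added to the count before exponentiation.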
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3,False],[traindat,testdat,4,False]]
def kernel_poly_match_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=3,inhomogene=False):
from shogun.Kernel import PolyMatchStringKernel
from shogun.Features import StringCharFeatures, DNA
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=PolyMatchStringKernel(feats_train, feats_train, degree, inhomogene)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'PolyMatchString'
kernel_poly_match_string_modular(*parameter_list[0])
# This is an example for the initialization of the PolyMatchWordString kernel on string data.
# The PolyMatchWordString kernel sums over the matches of two word strings of the same length
# and takes the sum to the power of 'degree'. The strings consist of the characters 'ACGT'
# corresponding to the DNA-alphabet and are mapped to word (k-mer) features. Each column of
# the matrices of type char corresponds to one training/test example.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,2,True,3,0,False],[traindat,testdat,2,True,3,0,False]]
def kernel_poly_match_word_string_modular(fm_train_dna=traindat,fm_test_dna=testdat,
degree=2,inhomogene=True,order=3,gap=0,reverse=False):
from shogun.Kernel import PolyMatchWordStringKernel
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(DNA)
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(DNA)
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
kernel=PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'PolyMatchWordString'
kernel_poly_match_word_string_modular(*parameter_list[0])
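# A pure-Python sketch of the match-counting scheme described above, here on
# plain characters instead of word features, assuming
# k(x,y)=(#matching positions (+1 if inhomogene))^degree:
x='ACGTACGT'; y='ACGAACGT'
degree=2; inhomogene=True
matches=sum(1 for a,b in zip(x,y) if a==b)
print (matches+(1 if inhomogene else 0))**degree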
# This example initializes the polynomial kernel with real data.
# If variable 'inhomogene' is 'True' +1 is added to the scalar product
# before taking it to the power of 'degree'. If 'use_normalization' is
# set to 'True' the kernel matrix will be normalized by the square roots
# of the diagonal entries.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,4,False,True],[traindat,testdat,5,False,True]]
def kernel_poly_modular (fm_train_real=traindat,fm_test_real=testdat,degree=4,inhomogene=False,
use_normalization=True):
from shogun.Features import RealFeatures
from shogun.Kernel import PolyKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=PolyKernel(
feats_train, feats_train, degree, inhomogene, use_normalization)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'Poly'
kernel_poly_modular (*parameter_list[0])
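# A numpy sketch of a single normalized polynomial kernel entry, assuming the
# normalization described above, k'(x,y)=k(x,y)/sqrt(k(x,x)*k(y,y)), with
# k(x,y)=(x.y)^degree in the homogeneous case:
from numpy import dot, sqrt
x=traindat[:,0]; y=traindat[:,1]
degree=4
k=lambda a,b: dot(a,b)**degree
print k(x,y)/sqrt(k(x,x)*k(y,y))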
# The SalzbergWordString kernel implements the Salzberg kernel.
#
# It is described in
#
# Engineering Support Vector Machine Kernels That Recognize Translation Initiation Sites
# A. Zien, G. Raetsch, S. Mika, B. Schoelkopf, T. Lengauer, K.-R. Mueller
#
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list = [[traindat,testdat,label_traindat,3,0,False],[traindat,testdat,label_traindat,3,0,False]]
def kernel_salzberg_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,
order=3,gap=0,reverse=False):
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
from shogun.Kernel import SalzbergWordStringKernel
from shogun.Classifier import PluginEstimate
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
pie=PluginEstimate()
labels=Labels(label_train_dna)
pie.set_labels(labels)
pie.set_features(feats_train)
pie.train()
kernel=SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
pie.set_features(feats_test)
pie.classify().get_labels()
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'PluginEstimate w/ SalzbergWord'
kernel_salzberg_word_string_modular(*parameter_list[0])
# The standard Sigmoid kernel computed on dense real-valued features.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,10,1.2,1.3],[traindat,testdat,10,1.2,1.3]]
def kernel_sigmoid_modular(fm_train_real=traindat,fm_test_real=testdat,size_cache=10,gamma=1.2,coef0=1.3):
from shogun.Features import RealFeatures
from shogun.Kernel import SigmoidKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=SigmoidKernel(feats_train, feats_train, size_cache, gamma, coef0)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'Sigmoid'
kernel_sigmoid_modular(*parameter_list[0])
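# The sigmoid kernel follows the standard definition k(x,y)=tanh(gamma*x.y+coef0);
# a one-line numpy cross-check of a single entry:
from numpy import dot, tanh
x=traindat[:,0]; y=traindat[:,1]
print tanh(1.2*dot(x,y)+1.3)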
# The SimpleLocalityImprovedString kernel is a 'simplified' and better performing version of the LocalityImprovedString kernel.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,5,5,1],[traindat,testdat,5,3,2]]
def kernel_simple_locality_improved_string_modular(fm_train_dna=traindat,fm_test_dna=testdat,
length=5,inner_degree=5,outer_degree=1 ):
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import SimpleLocalityImprovedStringKernel, MSG_DEBUG
feats_train=StringCharFeatures(fm_train_dna, DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=SimpleLocalityImprovedStringKernel(
feats_train, feats_train, length, inner_degree, outer_degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'SimpleLocalityImprovedString'
kernel_simple_locality_improved_string_modular(*parameter_list[0])
# This example demonstrates how to use the Gaussian Kernel with sparse features.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.1],[traindat,testdat,1.2]]
def kernel_sparse_gaussian_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.1 ):
from shogun.Features import SparseRealFeatures
from shogun.Kernel import GaussianKernel
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'SparseGaussian'
kernel_sparse_gaussian_modular (*parameter_list[0])
# This example demonstrates how to use the Linear Kernel with sparse features.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.1],[traindat,testdat,1.2]]
def kernel_sparse_linear_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.1):
from shogun.Features import SparseRealFeatures
from shogun.Kernel import LinearKernel, AvgDiagKernelNormalizer
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
kernel=LinearKernel()
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'SparseLinear'
kernel_sparse_linear_modular(*parameter_list[0])
# This example shows how to use the polynomial kernel with sparse features.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,10,3,True],[traindat,testdat,10,4,True]]
def kernel_sparse_poly_modular (fm_train_real=traindat,fm_test_real=testdat,
size_cache=10,degree=3,inhomogene=True ):
from shogun.Features import SparseRealFeatures
from shogun.Kernel import PolyKernel
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
kernel=PolyKernel(feats_train, feats_train, size_cache, degree,
inhomogene)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'SparsePoly'
kernel_sparse_poly_modular(*parameter_list[0])
# The class TOPFeatures implements TOP kernel features obtained from
# two Hidden Markov models.
#
# It was used in
#
# K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new
# discriminative kernel from probabilistic models. Neural Computation,
# 14:2397-2414, 2002.
#
# which also has the details.
#
# Note that TOP-features are computed on the fly, so to be effective feature
# caching should be enabled.
#
# It inherits its functionality from CSimpleFeatures, which should be
# consulted for further reference.
#
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
fm_hmm_pos=[traindat[i] for i in where([label_traindat==1])[1] ]
fm_hmm_neg=[traindat[i] for i in where([label_traindat==-1])[1] ]
parameter_list = [[traindat,testdat,label_traindat,1e-1,1,0,False,[1, False, True]], \
[traindat,testdat,label_traindat,1e-1,1,0,False,[1, False, True] ]]
def kernel_top_modular(fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,pseudo=1e-1,
order=1,gap=0,reverse=False,kargs=[1, False, True]):
from shogun.Features import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
from shogun.Kernel import PolyKernel
from shogun.Distribution import HMM, BW_NORMAL
N=1 # toy HMM with 1 state
M=4 # 4 observations -> DNA
# train HMM for positive class
charfeat=StringCharFeatures(fm_hmm_pos, DNA)
hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
pos=HMM(hmm_pos_train, N, M, pseudo)
pos.baum_welch_viterbi_train(BW_NORMAL)
# train HMM for negative class
charfeat=StringCharFeatures(fm_hmm_neg, DNA)
hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
neg=HMM(hmm_neg_train, N, M, pseudo)
neg.baum_welch_viterbi_train(BW_NORMAL)
# Kernel training data
charfeat=StringCharFeatures(fm_train_dna, DNA)
wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
# Kernel testing data
charfeat=StringCharFeatures(fm_test_dna, DNA)
wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
# get kernel on training data
pos.set_observations(wordfeats_train)
neg.set_observations(wordfeats_train)
feats_train=TOPFeatures(10, pos, neg, False, False)
kernel=PolyKernel(feats_train, feats_train, *kargs)
km_train=kernel.get_kernel_matrix()
# get kernel on testing data
pos_clone=HMM(pos)
neg_clone=HMM(neg)
pos_clone.set_observations(wordfeats_test)
neg_clone.set_observations(wordfeats_test)
feats_test=TOPFeatures(10, pos_clone, neg_clone, False, False)
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print "TOP Kernel"
kernel_top_modular(*parameter_list[0])
# The WeightedCommWordString kernel may be used to compute the weighted
# spectrum kernel (i.e. a spectrum kernel for 1 to K-mers, where each k-mer
# length is weighted by some coefficient \beta_k) from strings that have
# been mapped into unsigned 16bit integers.
#
# These 16bit integers correspond to k-mers. To be applicable in this kernel
# they need to be sorted (e.g. via the SortWordString pre-processor).
#
# It basically uses the algorithm in the unix "comm" command (hence the name)
# to compute:
#
# k({\bf x},{\bf x'})= \sum_{k=1}^K\beta_k\Phi_k({\bf x})\cdot \Phi_k({\bf x'})
#
# where \Phi_k maps a sequence {\bf x} that consists of letters in \Sigma to
# a feature vector of size |\Sigma|^k. In this feature vector each entry
# denotes how often the k-mer appears in that {\bf x}.
#
# Note that this representation is especially tuned to small alphabets
# (like the 2-bit alphabet DNA), for which it enables spectrum kernels
# of order 8.
#
# For this kernel the linadd speedups are quite efficiently implemented using
# direct maps.
#
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def kernel_weighted_comm_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,order=3,gap=0,reverse=True ):
from shogun.Kernel import WeightedCommWordStringKernel
from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
from shogun.PreProc import SortWordString
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
use_sign=False
kernel=WeightedCommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'WeightedCommWordString'
kernel_weighted_comm_word_string_modular(*parameter_list[0])
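# A pure-Python sketch of the weighted spectrum kernel defined above, with the
# simplifying assumption beta_k=1 for all k; it counts k-mers explicitly rather
# than using sorted word features:
from collections import defaultdict
def spectrum(s, k):
    counts=defaultdict(int)
    for i in range(len(s)-k+1):
        counts[s[i:i+k]]+=1
    return counts
x='ACGTACGTA'; y='ACGTTTGTA'; K=3
val=0.0
for k in range(1, K+1):
    phix=spectrum(x, k); phiy=spectrum(y, k)
    val+=sum(phix[w]*phiy[w] for w in phix)
print val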
# The Weighted Degree Position String kernel (Weighted Degree kernel with shifts).
#
# The WD-shift kernel of order d compares two sequences X and
# Y of length L by summing all contributions of k-mer matches of
# lengths k in 1...d, weighted by coefficients beta_k
# allowing for a positional tolerance of up to shift s.
#
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,20],[traindat,testdat,22]]
def kernel_weighted_degree_position_string_modular(fm_train_dna=traindat,fm_test_dna=testdat,degree=20):
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import WeightedDegreePositionStringKernel, MSG_DEBUG
feats_train=StringCharFeatures(fm_train_dna, DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=WeightedDegreePositionStringKernel(feats_train, feats_train, degree)
from numpy import zeros,ones,float64,int32
#kernel.set_shifts(zeros(len(fm_train_dna[0]), dtype=int32))
#kernel.set_position_weights(ones(len(fm_train_dna[0]), dtype=float64))
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'WeightedDegreePositionString'
kernel_weighted_degree_position_string_modular(*parameter_list[0])
# This example shows how to create a Weighted Degree String Kernel from data
# and how to compute the kernel matrix from the resulting object.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3],[traindat,testdat,20]]
def kernel_weighted_degree_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=20):
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import WeightedDegreeStringKernel, MSG_DEBUG
feats_train=StringCharFeatures(fm_train_dna, DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
from numpy import arange,double
weights=arange(1,degree+1,dtype=double)[::-1]/ \
sum(arange(1,degree+1,dtype=double))
kernel.set_wd_weights(weights)
#from numpy import ones,float64,int32
#kernel.set_position_weights(ones(len(fm_train_dna[0]), dtype=float64))
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
#this is how to serialize the kernel
#import pickle
#pickle.dump(kernel, file('kernel_obj.dump','w'), protocol=2)
#k=pickle.load(file('kernel_obj.dump','r'))
return km_train, km_test, kernel
if __name__=='__main__':
print 'WeightedDegreeString'
kernel_weighted_degree_string_modular(*parameter_list[0])
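# A pure-Python sketch of the weighted degree kernel with the same triangular
# weights beta_k=(degree-k+1)/Z as computed above: for every position i and
# every order k<=degree it adds beta_k whenever the k-mers of x and y starting
# at position i are identical.
x='ACGTACGT'; y='ACGAACGT'; degree=3
Z=float(sum(range(1, degree+1)))
val=0.0
for k in range(1, degree+1):
    beta=(degree-k+1)/Z
    for i in range(len(x)-k+1):
        if x[i:i+k]==y[i:i+k]:
            val+=beta
print val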
# In this example we show how to perform Multiple Kernel Learning (MKL)
# with the modular interface. First, we create a number of base kernels.
# These kernels can capture different views of the same features, or actually
# consider entirely different features associated with the same example
# (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample).
# The base kernels are subsequently added to a CombinedKernel, which
# contains a weight for each kernel and encapsulates the base kernels
# from the training procedure. When the CombinedKernel is evaluated between
# two examples it computes the corresponding linear combination of the base kernels according to their weights.
# We then show how to create an MKLClassifier that trains an SVM and learns the optimal
# weighting of kernels (w.r.t. a given norm q) at the same time.
# Finally, the example shows how to classify with a trained MKLClassifier.
#
from shogun.Features import CombinedFeatures, RealFeatures, Labels
from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
from shogun.Classifier import MKLClassification
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat],[traindat,testdat,label_traindat]]
def mkl_binclass_modular (fm_train_real=traindat,fm_test_real=testdat,fm_label_twoclass = label_traindat):
##################################
# set up and train
# create some poly train/test matrix
tfeats = RealFeatures(fm_train_real)
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, tfeats)
K_train = tkernel.get_kernel_matrix()
pfeats = RealFeatures(fm_test_real)
tkernel.init(tfeats, pfeats)
K_test = tkernel.get_kernel_matrix()
# create combined train features
feats_train = CombinedFeatures()
feats_train.append_feature_obj(RealFeatures(fm_train_real))
# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(CustomKernel(K_train))
kernel.append_kernel(PolyKernel(10,2))
kernel.init(feats_train, feats_train)
# train mkl
labels = Labels(fm_label_twoclass)
mkl = MKLClassification()
# which norm to use for MKL
mkl.set_mkl_norm(1) #2,3
# set cost (neg, pos)
mkl.set_C(1, 1)
# set kernel and labels
mkl.set_kernel(kernel)
mkl.set_labels(labels)
# train
mkl.train()
#w=kernel.get_subkernel_weights()
#kernel.set_subkernel_weights(w)
##################################
# test
# create combined test features
feats_pred = CombinedFeatures()
feats_pred.append_feature_obj(RealFeatures(fm_test_real))
# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(CustomKernel(K_test))
kernel.append_kernel(PolyKernel(10, 2))
kernel.init(feats_train, feats_pred)
# and classify
mkl.set_kernel(kernel)
out=mkl.classify()
return out,kernel
if __name__=='__main__':
mkl_binclass_modular (*parameter_list[0])
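# The CombinedKernel described above evaluates a weighted sum of its subkernels;
# a small numpy sketch of that linear combination on two precomputed matrices
# (the weights here are hypothetical stand-ins for what MKL might learn):
from numpy import dot
K1=dot(traindat.T, traindat)          # stand-in for a linear subkernel
K2=(dot(traindat.T, traindat)+1)**2   # stand-in for a degree-2 poly subkernel
w=[0.7, 0.3]
K=w[0]*K1+w[1]*K2
print K.shape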
# In this example we show how to perform Multiple Kernel Learning (MKL)
# with the modular interface for multi-class classification.
# First, we create a number of base kernels and features.
# These kernels can capture different views of the same features, or actually
# consider entirely different features associated with the same example
# (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample).
# The base kernels are subsequently added to a CombinedKernel, which
# contains a weight for each kernel and encapsulates the base kernels
# from the training procedure. When the CombinedKernel is evaluated between
# two examples it computes the corresponding linear combination of the base kernels according to their weights.
# We then show how to create an MKLMultiClass classifier that trains an SVM and learns the optimal
# weighting of kernels (w.r.t. a given norm q) at the same time. The main difference to the binary
# classification version of MKL is that we can use more than two values as labels, when training
# the classifier.
# Finally, the example shows how to classify with a trained MKLMultiClass classifier.
#
from tools.load import LoadMatrix
lm = LoadMatrix()
fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')
parameter_list=[
[ fm_train_real, fm_test_real, label_train_multiclass, 1.2, 1.2, 1e-5, 1, 0.001, 1.5],
[ fm_train_real, fm_test_real, label_train_multiclass, 5, 1.2, 1e-2, 1, 0.001, 2]]
def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
width, C, epsilon, num_threads, mkl_epsilon, mkl_norm):
from shogun.Features import CombinedFeatures, RealFeatures, Labels
from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel,PolyKernel
from shogun.Classifier import MKLMultiClass
kernel = CombinedKernel()
feats_train = CombinedFeatures()
feats_test = CombinedFeatures()
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = GaussianKernel(10, width)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = LinearKernel()
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = PolyKernel(10,2)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
labels = Labels(label_train_multiclass)
mkl = MKLMultiClass(C, kernel, labels)
mkl.set_epsilon(epsilon)
mkl.parallel.set_num_threads(num_threads)
mkl.set_mkl_epsilon(mkl_epsilon)
mkl.set_mkl_norm(mkl_norm)
mkl.train()
kernel.init(feats_train, feats_test)
out = mkl.classify().get_labels()
return out
if __name__ == '__main__':
print 'mkl_multiclass'
mkl_multiclass_modular(*parameter_list[0])
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# LogPlusOne adds one to a dense real-valued vector and takes the logarithm of
# each component of it. It is most useful in situations where the inputs are
# counts: When one compares differences of small counts any difference may matter
# a lot, while small differences in large counts don't. This is what this log
# transformation controls for.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat+10,testdat+10,1.4,10],[traindat+10,testdat+10,1.5,10]]
def preproc_logplusone_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
from shogun.Kernel import Chi2Kernel
from shogun.Features import RealFeatures
from shogun.PreProc import LogPlusOne
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=LogPlusOne()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'LogPlusOne'
preproc_logplusone_modular(*parameter_list[0])
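# The transform applied by LogPlusOne is just log(1+x) elementwise. A quick
# numpy illustration of the effect described above: a unit difference between
# small counts changes the transformed value far more than between large counts.
from numpy import log
print log(1+1)-log(1+0), log(1+101)-log(1+100)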
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# NormOne, normalizes vectors to have norm 1.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.4,10],[traindat,testdat,1.5,10]]
def preproc_normone_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
from shogun.Kernel import Chi2Kernel
from shogun.Features import RealFeatures
from shogun.PreProc import NormOne
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=NormOne()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'NormOne'
preproc_normone_modular(*parameter_list[0])
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# PruneVarSubMean subtracts the mean from each feature and removes features that
# have zero variance.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.5,10],[traindat,testdat,1.5,10]]
def preproc_prunevarsubmean_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
from shogun.Kernel import Chi2Kernel
from shogun.Features import RealFeatures
from shogun.PreProc import PruneVarSubMean
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=PruneVarSubMean()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'PruneVarSubMean'
preproc_prunevarsubmean_modular(*parameter_list[0])
# In this example a kernel matrix is computed for a given string data set. The
# CommUlongString kernel is used to compute the spectrum kernel from strings that
# have been mapped into unsigned 64bit integers. These 64bit integers correspond
# to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted.
# This is done using the SortUlongString preprocessor, which sorts the individual
# strings in ascending order. The kernel function basically uses the algorithm in
# the unix "comm" command (hence the name). Note that this representation enables
# spectrum kernels of order 8 for 8bit alphabets (like binaries) and order 32 for
# 2-bit alphabets like DNA. For this kernel the linadd speedups are implemented
# (though there is room for improvement here when a whole set of sequences is
# ADDed) using sorted lists.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindna,testdna,4,0,False,False],[traindna,testdna,3,0,False,False]]
def preproc_sortulongstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False):
from shogun.Kernel import CommUlongStringKernel
from shogun.Features import StringCharFeatures, StringUlongFeatures, DNA
from shogun.PreProc import SortUlongString
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringUlongFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringUlongFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortUlongString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'CommUlongString'
preproc_sortulongstring_modular(*parameter_list[0])
# In this example a kernel matrix is computed for a given string data set. The
# CommWordString kernel is used to compute the spectrum kernel from strings that
# have been mapped into unsigned 16bit integers. These 16bit integers correspond
# to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted.
# This is done using the SortWordString preprocessor, which sorts the individual
# strings in ascending order. The kernel function basically uses the algorithm in
# the unix "comm" command (hence the name). Note that this representation is
# especially tuned to small alphabets (like the 2-bit alphabet DNA), for which it
# enables spectrum kernels of order up to 8. For this kernel the linadd speedups
# are quite efficiently implemented using direct maps.
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindna,testdna,3,0,False,False],[traindna,testdna,3,0,False,False]]
def preproc_sortwordstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False):
from shogun.Kernel import CommWordStringKernel
from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
from shogun.PreProc import SortWordString
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preproc(preproc)
feats_train.apply_preproc()
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preproc(preproc)
feats_test.apply_preproc()
kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print 'CommWordString'
preproc_sortwordstring_modular(*parameter_list[0])
# In this example a kernelized version of ridge regression (KRR) is trained on a
# real-valued data set. The KRR is trained with regularization parameter tau=1e-6
# and a gaussian kernel with width=0.8. The labels of both the train and the test
# data can be fetched via krr.classify().get_labels().
###########################################################################
# kernel ridge regression
###########################################################################
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,0.8,1e-6],[traindat,testdat,label_traindat,0.9,1e-7]]
def regression_krr_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat,width=0.8,tau=1e-6):
from shogun.Features import Labels, RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Regression import KRR
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=Labels(label_train)
krr=KRR(tau, kernel, labels)
krr.train(feats_train)
kernel.init(feats_train, feats_test)
out = krr.classify().get_labels()
return out,kernel,krr
# equivalent shorter version
def krr_short (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
print 'KRR_short'
from shogun.Features import Labels, RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Regression import KRR
width=0.8; tau=1e-6
krr=KRR(tau, GaussianKernel(0, width), Labels(label_train))
krr.train(RealFeatures(fm_train))
out = krr.classify(RealFeatures(fm_test)).get_labels()
return krr,out
if __name__=='__main__':
print 'KRR'
regression_krr_modular(*parameter_list[0])
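# For reference, a numpy sketch of the closed-form solution that kernel ridge
# regression computes, alpha=(K+tau*I)^{-1}y, using the exp(-||x-y||^2/width)
# gaussian kernel convention assumed earlier:
from numpy import exp, eye, dot
from numpy.linalg import solve
X=traindat; y=label_traindat; width=0.8; tau=1e-6
sq=(X**2).sum(0)
K=exp(-(sq[:,None]+sq[None,:]-2*dot(X.T, X))/width)
alpha=solve(K+tau*eye(K.shape[0]), y)
print alpha[:5]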
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# LIBSVM. The SVR is trained with regularization parameter C=1 and a gaussian
# kernel with width=2.1. The labels of both the train and the test data are
# fetched via svr.classify().get_labels().
#
# For more details on the LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/ .
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2], \
[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2]]
def regression_libsvr_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat,\
width=2.1,C=1,epsilon=1e-5,tube_epsilon=1e-2):
from shogun.Features import Labels, RealFeatures
from shogun.Kernel import GaussianKernel
from shogun.Regression import LibSVR
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=Labels(label_train)
svr=LibSVR(C, epsilon, kernel, labels)
svr.set_tube_epsilon(tube_epsilon)
svr.train()
kernel.init(feats_train, feats_test)
out1=svr.classify().get_labels()
out2=svr.classify(feats_test).get_labels()
return out1,out2,kernel
if __name__=='__main__':
print 'LibSVR'
regression_libsvr_modular(*parameter_list[0])
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# SVM^light. The SVR is trained with regularization parameter C=1 and a gaussian
# kernel with width=2.1. The labels of both the train and the test data are
# fetched via svr.classify().get_labels().
#
# For more details on SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
###########################################################################
# svm light based support vector regression
###########################################################################
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,1.2,1,1e-5,1e-2,3],[traindat,testdat,label_traindat,2.3,0.5,1e-5,1e-6,1]]
def regression_svrlight_modular(fm_train=traindat,fm_test=testdat,label_train=label_traindat, \
width=1.2,C=1,epsilon=1e-5,tube_epsilon=1e-2,num_threads=3):
from shogun.Features import Labels, RealFeatures
from shogun.Kernel import GaussianKernel
try:
from shogun.Regression import SVRLight
except ImportError:
print 'No support for SVRLight available.'
return
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=Labels(label_train)
svr=SVRLight(C, epsilon, kernel, labels)
svr.set_tube_epsilon(tube_epsilon)
svr.parallel.set_num_threads(num_threads)
svr.train()
kernel.init(feats_train, feats_test)
out = svr.classify().get_labels()
return out, kernel
if __name__=='__main__':
print 'SVRLight'
regression_svrlight_modular(*parameter_list[0])
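# This example trains a multi-class GMNPSVM on toy data and demonstrates saving
# and re-loading the trained model with shogun's serializable file backends
# (HDF5, ASCII, JSON and XML).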
parameter_list=[[5,1,10, 2.0, 10], [10,0.3,2, 1.0, 0.1]]
def check_status(status):
# silent...
assert(status)
#if status:
# print "OK reading/writing .h5\n"
#else:
# print "ERROR reading/writing .h5\n"
def serialization_complex_example(num=5, dist=1, dim=10, C=2.0, width=10):
import os
from numpy import concatenate, zeros, ones
from numpy.random import randn, seed
from shogun.Features import RealFeatures, Labels
from shogun.Classifier import GMNPSVM
from shogun.Kernel import GaussianKernel
from shogun.Library import SerializableHdf5File,SerializableAsciiFile, \
SerializableJsonFile,SerializableXmlFile,MSG_DEBUG
from shogun.PreProc import NormOne, LogPlusOne
seed(17)
data=concatenate((randn(dim, num), randn(dim, num) + dist,
randn(dim, num) + 2*dist,
randn(dim, num) + 3*dist), axis=1)
lab=concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))
feats=RealFeatures(data)
#feats.io.set_loglevel(MSG_DEBUG)
kernel=GaussianKernel(feats, feats, width)
labels=Labels(lab)
svm = GMNPSVM(C, kernel, labels)
feats.add_preproc(NormOne())
feats.add_preproc(LogPlusOne())
feats.set_preprocessed(1)
svm.train(feats)
#svm.print_serializable()
fstream = SerializableHdf5File("blaah.h5", "w")
status = svm.save_serializable(fstream)
check_status(status)
fstream = SerializableAsciiFile("blaah.asc", "w")
status = svm.save_serializable(fstream)
check_status(status)
fstream = SerializableJsonFile("blaah.json", "w")
status = svm.save_serializable(fstream)
check_status(status)
fstream = SerializableXmlFile("blaah.xml", "w")
status = svm.save_serializable(fstream)
check_status(status)
fstream = SerializableHdf5File("blaah.h5", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status)
new_svm.train()
fstream = SerializableAsciiFile("blaah.asc", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status)
new_svm.train()
fstream = SerializableJsonFile("blaah.json", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status)
new_svm.train()
fstream = SerializableXmlFile("blaah.xml", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status)
new_svm.train()
os.unlink("blaah.h5")
os.unlink("blaah.asc")
os.unlink("blaah.json")
os.unlink("blaah.xml")
return svm,new_svm
if __name__=='__main__':
print 'Serialization complex example'
serialization_complex_example(*parameter_list[0])
# This example shows how to serialize/deserialize an SVMLight object with Python's
# pickle module (only available if serialization support was enabled when compiling
# shogun). Note that this code is in alpha state.
parameter_list=[[10, 1, 2.1, 2.0]]
def serialization_svmlight_modular(num, dist, width, C):
from shogun.Library import MSG_DEBUG
from shogun.Features import RealFeatures, Labels, DNA, Alphabet
from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
from shogun.Classifier import SVMLight
from numpy import concatenate, ones
from numpy.random import randn, seed
import sys
import types
import random
import bz2
import cPickle as pickle
import inspect
def save(filename, myobj):
"""
save object to file using pickle
@param filename: name of destination file
@type filename: str
@param myobj: object to save (has to be pickleable)
@type myobj: obj
"""
try:
f = bz2.BZ2File(filename, 'wb')
except IOError, details:
sys.stderr.write('File ' + filename + ' cannot be written\n')
sys.stderr.write(str(details))
return
pickle.dump(myobj, f, protocol=2)
f.close()
def load(filename):
"""
Load from filename using pickle
@param filename: name of file to load from
@type filename: str
"""
try:
f = bz2.BZ2File(filename, 'rb')
except IOError, details:
sys.stderr.write('File ' + filename + ' cannot be read\n')
sys.stderr.write(str(details))
return
myobj = pickle.load(f)
f.close()
return myobj
##################################################
seed(17)
traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
trainlab=concatenate((-ones(num), ones(num)))
testlab=concatenate((-ones(num), ones(num)))
feats_train=RealFeatures(traindata_real)
feats_test=RealFeatures(testdata_real)
kernel=GaussianKernel(feats_train, feats_train, width)
#kernel.io.set_loglevel(MSG_DEBUG)
labels=Labels(trainlab)
svm=SVMLight(C, kernel, labels)
svm.train()
#svm.io.set_loglevel(MSG_DEBUG)
##################################################
#print "labels:"
#print pickle.dumps(labels)
#
#print "features"
#print pickle.dumps(feats_train)
#
#print "kernel"
#print pickle.dumps(kernel)
#
#print "svm"
#print pickle.dumps(svm)
#
#print "#################################"
fn = "serialized_svm.bz2"
#print "serializing SVM to file", fn
save(fn, svm)
#print "#################################"
#print "unserializing SVM"
svm2 = load(fn)
#print "#################################"
#print "comparing training"
svm2.train()
#print "objective before serialization:", svm.get_objective()
#print "objective after serialization:", svm2.get_objective()
return svm, svm.get_objective(), svm2, svm2.get_objective()
if __name__=='__main__':
print 'Serialization SVMLight'
serialization_svmlight_modular(*parameter_list[0])
# In this example we use the dynamic programming implementation with a
# gene-finding specific model. The model and the training parameters
# are stored in a file and are used to create a gene prediction on
# some example sequence.
parameter_list=[['../data/DynProg_example_py.pickle.gz']]
from shogun.Structure import *
import numpy
from numpy import array,Inf,float64,matrix,frompyfunc,zeros
import gzip
import scipy
from scipy.io import loadmat
import pickle
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
if scipy.__version__ >= '0.7.0':
renametable = {
'scipy.io.mio5': 'scipy.io.matlab.mio5',
'scipy.sparse.sparse' : 'scipy.sparse',
}
else:
renametable = {}
def mapname(name):
if name in renametable:
return renametable[name]
return name
def mapped_load_global(self):
module = mapname(self.readline()[:-1])
name = mapname(self.readline()[:-1])
klass = self.find_class(module, name)
self.append(klass)
def loads(data):
f = StringIO(data)
unpickler = pickle.Unpickler(f)
unpickler.dispatch[pickle.GLOBAL] = mapped_load_global
return unpickler.load()
def structure_dynprog_modular(fname):
data_dict = loads(gzip.GzipFile(fname).read())
#data_dict = loadmat('../data/DynProg_example_py.dat.mat', appendmat=False, struct_as_record=False)
#print data_dict
#print len(data_dict['penalty_array'][0][0][0][0].limits[0])
num_plifs,num_limits = len(data_dict['penalty_array']),len(data_dict['penalty_array'][0].limits)
pm = PlifMatrix()
pm.create_plifs(num_plifs,num_limits)
ids = numpy.array(range(num_plifs),dtype=numpy.int32)
min_values = numpy.array(range(num_plifs),dtype=numpy.float64)
max_values = numpy.array(range(num_plifs),dtype=numpy.float64)
all_use_cache = numpy.array(range(num_plifs),dtype=numpy.bool)
all_use_svm = numpy.array(range(num_plifs),dtype=numpy.int32)
all_limits = zeros((num_plifs,num_limits))
all_penalties = zeros((num_plifs,num_limits))
all_names = ['']*num_plifs
all_transforms = ['']*num_plifs
for plif_idx in range(num_plifs):
ids[plif_idx] = data_dict['penalty_array'][plif_idx].id-1
min_values[plif_idx] = data_dict['penalty_array'][plif_idx].min_value
max_values[plif_idx] = data_dict['penalty_array'][plif_idx].max_value
all_use_cache[plif_idx] = data_dict['penalty_array'][plif_idx].use_cache
all_use_svm[plif_idx] = data_dict['penalty_array'][plif_idx].use_svm
all_limits[plif_idx] = data_dict['penalty_array'][plif_idx].limits
all_penalties[plif_idx] = data_dict['penalty_array'][plif_idx].penalties
all_names[plif_idx] = str(data_dict['penalty_array'][plif_idx].name)
all_transforms[plif_idx] = str(data_dict['penalty_array'][plif_idx].transform)
if all_transforms[plif_idx] == '[]':
all_transforms[plif_idx] = 'linear'
pm.set_plif_ids(ids)
pm.set_plif_min_values(min_values)
pm.set_plif_max_values(max_values)
pm.set_plif_use_cache(all_use_cache)
pm.set_plif_use_svm(all_use_svm)
pm.set_plif_limits(all_limits)
pm.set_plif_penalties(all_penalties)
#pm.set_plif_names(all_names)
#pm.set_plif_transform_type(all_transforms)
    # the model's transition pointers select which PLIFs apply to which
    # state transition
    transition_ptrs = data_dict['model'].transition_pointers
    transition_ptrs = transition_ptrs[:, :, 0:2]
    transition_ptrs = transition_ptrs.astype(numpy.float64)
    pm.compute_plif_matrix(transition_ptrs)

    # init_dyn_prog
    num_svms = 8
    dyn = DynProg(num_svms)
    orf_info = data_dict['model'].orf_info
    orf_info = orf_info.astype(numpy.int32)
    num_states = orf_info.shape[0]
    dyn.set_num_states(num_states)
    block = data_dict['block']
    seq = str(block.seq)
    gene_string = array([elem for elem in seq])

    # precompute_content_svms
    pos = block.all_pos - 1     # convert positions to 0-based indexing
    pos = pos.astype(numpy.int32)
    dyn.set_pos(pos)
    dyn.set_gene_string(gene_string)
    dyn.create_word_string()
    dyn.precompute_stop_codons()
    dyn.init_content_svm_value_array(num_svms)
    dict_weights = data_dict['content_weights']
    dict_weights = dict_weights.reshape(8, 1).astype(numpy.float64)
    # the loaded weights are then replaced by an all-zero matrix of the
    # expected shape (num_svms x 5440)
    dict_weights = zeros((8, 5440))
    dyn.set_dict_weights(dict_weights.T)
    dyn.precompute_content_values()
    dyn.init_mod_words_array(data_dict['model'].mod_words.astype(numpy.int32))
    pm.compute_signal_plifs(data_dict['state_signals'].astype(numpy.int32))
    dyn.set_orf_info(orf_info)

    # transition model: start/stop probabilities and the transition matrix
    p = data_dict['model'].p
    q = data_dict['model'].q
    dyn.set_p_vector(p)
    dyn.set_q_vector(q)
    a_trans = data_dict['a_trans']
    a_trans = a_trans.astype(float64)
    dyn.set_a_trans_matrix(a_trans)
    dyn.check_svm_arrays()

    features = data_dict['block'].features
    dyn.set_observation_matrix(features)
    dyn.set_content_type_array(data_dict['seg_path'].astype(numpy.float64))
    dyn.best_path_set_segment_loss(data_dict['loss'].astype(numpy.float64))
    use_orf = True
    dyn.set_plif_matrices(pm)
    dyn.compute_nbest_paths(features.shape[2], use_orf, 1, True, False)

    # fetch results: the best state path, its score and the positions
    states = dyn.get_states()
    scores = dyn.get_scores()
    positions = dyn.get_positions()
    return states, scores, positions
if __name__ == '__main__':
    print 'Structure'
    structure_dynprog_modular(*parameter_list[0])
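# This example repeatedly builds a CommWordStringKernel on a fixed set of
# DNA toy strings and recomputes its kernel matrix, so that a steadily
# growing memory footprint across iterations would reveal a leak in the
# kernel or preprocessing wrappers.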
parameter_list=[[10,7,0,0]]

def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse):
    import gc
    from shogun.Features import Alphabet, StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString, MSG_DEBUG
    from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer
    from numpy import mat

    # each list holds 60 ACGT-, 21 TTGT- and a further 60 ACGT-type strings
    POS = 60 * [num * 'ACGT'] + 21 * [num * 'TTGT'] + 60 * [num * 'ACGT']
    NEG = 60 * [num * 'ACGT'] + 21 * [num * 'TTGT'] + 60 * [num * 'ACGT']

    for i in xrange(10):
        alpha = Alphabet(DNA)
        traindat = StringCharFeatures(alpha)
        traindat.set_features(POS + NEG)
        trainudat = StringWordFeatures(traindat.get_alphabet())
        trainudat.obtain_from_char(traindat, order - 1, order, gap, reverse)
        #trainudat.io.set_loglevel(MSG_DEBUG)
        pre = SortWordString()
        #pre.io.set_loglevel(MSG_DEBUG)
        pre.init(trainudat)
        trainudat.add_preproc(pre)
        trainudat.apply_preproc()
        spec = CommWordStringKernel(10, False)
        spec.set_normalizer(IdentityKernelNormalizer())
        spec.init(trainudat, trainudat)
        K = spec.get_kernel_matrix()

    # drop all remaining references before returning
    del POS
    del NEG
    del order
    del gap
    del reverse
    return K

if __name__=='__main__':
    print 'Leak Check Comm Word Kernel'
    tests_check_commwordkernel_memleak_modular(*parameter_list[0])
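# Conceptually, CommWordStringKernel compares two strings through the counts
# of their length-`order` words. The rough standalone sketch below shows that
# kind of similarity in plain Python (it ignores shogun's gap/reverse options
# and normalization, and is not shogun's implementation):
from collections import Counter

def comm_word_similarity(s, t, order):
    # count all overlapping words of length `order` in each string and take
    # the inner product of the two count vectors
    cs = Counter(s[i:i + order] for i in range(len(s) - order + 1))
    ct = Counter(t[i:i + order] for i in range(len(t) - order + 1))
    return sum(cs[w] * ct[w] for w in cs)

# e.g. comm_word_similarity('ACGTACGT', 'ACGTTTGT', 4) counts the shared
# 4-mers of the two strings, weighted by their multiplicities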