SHOGUN 4.2.0

This page lists ready-to-run Shogun examples for the Python Modular interface.
To run an example, issue

python name_of_example.py
# In this example the Averaged Perceptron is used to classify toy data.
#!/usr/bin/env python
from numpy import *
parameter_list = [[100, 2, 5,1.,1000,1,1], [100, 2, 5,1.,1000,1,2]]
def classifier_averaged_perceptron_modular (n=100, dim=2, distance=5,learn_rate=1.,max_iter=1000,num_threads=1,seed=1):
from modshogun import RealFeatures, BinaryLabels
from modshogun import AveragedPerceptron
random.seed(seed)
# produce some (probably) linearly separable training data by hand
# Two Gaussians at a far enough distance
X=array(random.randn(dim,n))+distance
Y=array(random.randn(dim,n))-distance
X_test=array(random.randn(dim,n))+distance
Y_test=array(random.randn(dim,n))-distance
label_train_twoclass=hstack((ones(n), -ones(n)))
#plot(X[0,:], X[1,:], 'x', Y[0,:], Y[1,:], 'o')
fm_train_real=hstack((X,Y))
fm_test_real=hstack((X_test,Y_test))
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
labels=BinaryLabels(label_train_twoclass)
perceptron=AveragedPerceptron(feats_train, labels)
perceptron.set_learn_rate(learn_rate)
perceptron.set_max_iter(max_iter)
# only guaranteed to converge for separable data
perceptron.train()
perceptron.set_features(feats_test)
out_labels = perceptron.apply().get_labels()
return perceptron, out_labels
if __name__=='__main__':
print('AveragedPerceptron')
classifier_averaged_perceptron_modular(*parameter_list[0])
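# Supplementary sketch (plain numpy, not part of the Shogun API): the
# "averaged" perceptron keeps a running average of all intermediate weight
# vectors, which often generalizes better than the final weights alone.
# Names below are illustrative.
import numpy as np

def averaged_perceptron_sketch(X, y, learn_rate=1.0, max_iter=1000):
    # X: (dim, n) matrix with one example per column; y: (n,) labels in {-1, +1}
    dim, n = X.shape
    w, b = np.zeros(dim), 0.0
    w_sum, b_sum, count = np.zeros(dim), 0.0, 0
    for _ in range(max_iter):
        errors = 0
        for i in range(n):
            if y[i] * (w.dot(X[:, i]) + b) <= 0:  # misclassified: update
                w += learn_rate * y[i] * X[:, i]
                b += learn_rate * y[i]
                errors += 1
            # accumulate every intermediate hypothesis for the average
            w_sum, b_sum, count = w_sum + w, b_sum + b, count + 1
        if errors == 0:
            break
    return w_sum / count, b_sum / count  # averaged parameters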
# This example shows how to use a custom defined kernel function for training a
# two class Support Vector Machine (SVM) classifier on randomly generated
# examples. The SVM regularization constant is set to C=1.
#!/usr/bin/env python
parameter_list = [[1,7],[2,8]]
def classifier_custom_kernel_modular (C=1,dim=7):
from modshogun import RealFeatures, BinaryLabels, CustomKernel, LibSVM
from numpy import diag,ones,sign
from numpy.random import rand,seed
seed((C,dim))
lab=sign(2*rand(dim) - 1)
data=rand(dim, dim)
symdata=data.dot(data.T) + diag(ones(dim))
kernel=CustomKernel()
kernel.set_full_kernel_matrix_from_full(symdata)
labels=BinaryLabels(lab)
svm=LibSVM(C, kernel, labels)
svm.train()
predictions = svm.apply()
out = predictions.get_labels()
return svm,out
if __name__=='__main__':
print('custom_kernel')
classifier_custom_kernel_modular(*parameter_list[0])
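# Supplementary check (plain numpy): a CustomKernel matrix is only a valid
# Gram matrix if it is symmetric positive semi-definite, which is what the
# symdata construction above aims for. A quick sanity check one can run on
# any candidate matrix:
import numpy as np

def is_valid_gram_matrix(K, tol=1e-10):
    # symmetric, and all eigenvalues non-negative up to a tolerance
    return np.allclose(K, K.T) and np.linalg.eigvalsh(K).min() >= -tol

R = np.random.rand(7, 7)
K = R.dot(R.T) + np.eye(7)      # same recipe as symdata above
print(is_valid_gram_matrix(K))  # True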
# In this example we demonstrate how to use SVMs in a domain adaptation
# scenario. Here, we assume that we have two problem domains, one with
# an abundance of training data (source domain) and one with only a few
# training examples (target domain). These domains are assumed to be
# different but related enough to transfer information between them.
# Thus, we first train an SVM on the source domain and then subsequently
# pass this previously trained SVM object to the DASVM, that we train
# on the target domain. The DASVM internally computes a custom linear term
# (for the underlying quadratic program of the dual formulation of the SVM)
# based on the support vectors of the source SVM and the training examples
# of the target SVM. Finally, it can be used for prediction just as any other
# SVM object.
#
#!/usr/bin/env python
import numpy
from modshogun import StringCharFeatures, BinaryLabels, DNA
from modshogun import WeightedDegreeStringKernel
from modshogun import MSG_DEBUG
try:
from modshogun import DomainAdaptationSVM
except ImportError:
print("DomainAdaptationSVM not available")
exit(0)
try:
from modshogun import SVMLight
except ImportError:
print("SVMLight not available")
exit(0)
traindna = ['CGCACGTACGTAGCTCGAT',
'CGACGTAGTCGTAGTCGTA',
'CGACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACCACAGTTATATAGTA',
'CGACGTAGTCGTAGTCGTA',
'CGACGTAGTTTTTTTCGTA',
'CGACGTAGTCGTAGCCCCA',
'CAAAAAAAAAAAAAAAATA',
'CGACGGGGGGGGGGGCGTA']
label_traindna = numpy.array(5*[-1.0] + 5*[1.0])
testdna = ['AGCACGTACGTAGCTCGAT',
'AGACGTAGTCGTAGTCGTA',
'CAACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGAACACAGTTATATAGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACGTGGGGTTTTTCGTA',
'CGACGTAGTCCCAGCCCCA',
'CAAAAAAAAAAAACCAATA',
'CGACGGCCGGGGGGGCGTA']
label_testdna = numpy.array(5*[-1.0] + 5*[1.0])
traindna2 = ['AGACAGTCAGTCGATAGCT',
'AGCAGTCGTAGTCGTAGTC',
'AGCAGGGGGGGGGGTAGTC',
'AGCAATCGTAGTCGTAGTC',
'AGCAACACGTTCTCTCGTC',
'AGCAGTCGTAGTCGTAGTC',
'AGCAGTCGTTTTTTTAGTC',
'AGCAGTCGTAGTCGAAAAC',
'ACCCCCCCCCCCCCCCCTC',
'AGCAGGGGGGGGGGGAGTC']
label_traindna2 = numpy.array(5*[-1.0] + 5*[1.0])
testdna2 = ['CGACAGTCAGTCGATAGCT',
'CGCAGTCGTAGTCGTAGTC',
'ACCAGGGGGGGGGGTAGTC',
'AGCAATCGTAGTCGTAGTC',
'AGCCACACGTTCTCTCGTC',
'AGCAATCGTAGTCGTAGTC',
'AGCAGTGGGGTTTTTAGTC',
'AGCAGTCGTAAACGAAAAC',
'ACCCCCCCCCCCCAACCTC',
'AGCAGGAAGGGGGGGAGTC']
label_testdna2 = numpy.array(5*[-1.0] + 5*[1.0])
parameter_list = [[traindna,testdna,label_traindna,label_testdna,traindna2,label_traindna2, \
testdna2,label_testdna2,1,3],[traindna,testdna,label_traindna,label_testdna,traindna2,label_traindna2, \
testdna2,label_testdna2,2,5]]
def classifier_domainadaptationsvm_modular (fm_train_dna=traindna,fm_test_dna=testdna, \
label_train_dna=label_traindna, \
label_test_dna=label_testdna,fm_train_dna2=traindna2,fm_test_dna2=testdna2, \
label_train_dna2=label_traindna2,label_test_dna2=label_testdna2,C=1,degree=3):
feats_train = StringCharFeatures(fm_train_dna, DNA)
feats_test = StringCharFeatures(fm_test_dna, DNA)
kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels = BinaryLabels(label_train_dna)
svm = SVMLight(C, kernel, labels)
svm.train()
#svm.io.set_loglevel(MSG_DEBUG)
#####################################
#print("obtaining DA SVM from previously trained SVM")
feats_train2 = StringCharFeatures(fm_train_dna2, DNA)
feats_test2 = StringCharFeatures(fm_test_dna2, DNA)
kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
labels2 = BinaryLabels(label_train_dna2)
# we regularize against the previously obtained solution
dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
dasvm.train()
out = dasvm.apply_binary(feats_test2)
return out #,dasvm TODO
if __name__=='__main__':
print('SVMLight')
classifier_domainadaptationsvm_modular(*parameter_list[0])
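# Supplementary sketch (plain numpy, not Shogun internals): the idea behind
# the DASVM linear term described above. In the dual QP of a standard SVM the
# linear term is the constant 1 for every example; in the domain-adaptation
# setting it is shifted by the weighted decision values of the source SVM on
# the target training examples, so that examples the source model already
# classifies confidently exert less pull. The exact term Shogun computes may
# differ; B is an illustrative trade-off weight.
import numpy as np

def da_linear_term_sketch(f_src, y_target, B=1.0):
    # f_src: source-SVM decision values on the target training examples
    # y_target: target labels in {-1, +1}
    return 1.0 - B * y_target * f_src

f_src = np.array([0.8, -0.3, 1.2, -0.9])
y_t = np.array([1.0, -1.0, 1.0, -1.0])
print(da_linear_term_sketch(f_src, y_t))  # smaller where the source is confidently correct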
#!/usr/bin/env python
from numpy import array,hstack
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def classifier_featureblock_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
from modshogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup
try:
from modshogun import FeatureBlockLogisticRegression
except ImportError:
print("FeatureBlockLogisticRegression not available")
exit(0)
features = RealFeatures(hstack((fm_train,fm_train)))
labels = BinaryLabels(hstack((label_train,label_train)))
n_features = features.get_num_features()
block_one = IndexBlock(0,n_features//2)
block_two = IndexBlock(n_features//2,n_features)
block_group = IndexBlockGroup()
block_group.add_block(block_one)
block_group.add_block(block_two)
mtlr = FeatureBlockLogisticRegression(0.1,features,labels,block_group)
mtlr.set_regularization(1) # use regularization ratio
mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
mtlr.train()
out = mtlr.apply().get_labels()
return out
if __name__=='__main__':
print('FeatureBlockLogisticRegression')
classifier_featureblock_logistic_regression(*parameter_list[0])
# In this example a multi-class support vector machine is trained on a toy data
# set and the trained classifier is then used to predict labels of test
# examples. The training algorithm is based on the BSVM formulation (L2-soft margin
# and the bias added to the objective function) which is solved by the Improved
# Mitchell-Demyanov-Malozemov algorithm. The training algorithm uses the Gaussian
# kernel of width 2.1 and the regularization constant C=1. The solver stops if the
# relative duality gap falls below 1e-5.
#
# For more details on the SVM solver used, see
# V.Franc: Optimization Algorithms for Kernel Methods. Research report.
# CTU-CMP-2005-22. CTU FEL Prague. 2005.
# ftp://cmp.felk.cvut.cz/pub/cmp/articles/franc/Franc-PhD.pdf .
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]
def classifier_gmnpsvm_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, MulticlassLabels
from modshogun import GaussianKernel, GMNPSVM, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=MulticlassLabels(CSVFile(label_fname))
kernel=GaussianKernel(feats_train, feats_train, width)
svm=GMNPSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train(feats_train)
out=svm.apply(feats_test).get_labels()
return out,kernel
if __name__=='__main__':
print('GMNPSVM')
classifier_gmnpsvm_modular(*parameter_list[0])
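# Supplementary sketch (plain numpy): what the width parameter means,
# assuming Shogun's GaussianKernel convention k(x,y) = exp(-||x-y||^2 / width).
import numpy as np

def gaussian_kernel_matrix(A, B, width=2.1):
    # A: (dim, n) and B: (dim, m), one example per column, as in RealFeatures
    sq = (A**2).sum(0)[:, None] + (B**2).sum(0)[None, :] - 2.0 * A.T.dot(B)
    return np.exp(-sq / width)

A = np.random.randn(2, 5)
K = gaussian_kernel_matrix(A, A)
print(K.shape, np.allclose(np.diag(K), 1.0))  # (5, 5) True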
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is then used to predict labels of test
# examples. As training algorithm Gradient Projection Decomposition Technique
# (GPDT) is used with SVM regularization parameter C=1 and a Gaussian
# kernel of width 2.1. The solver returns an epsilon-precise (epsilon=1e-5) solution.
#
# For more details on GPDT solver see http://dm.unife.it/gpdt .
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]
def classifier_gpbtsvm_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, BinaryLabels
from modshogun import GaussianKernel
from modshogun import CSVFile
try:
from modshogun import GPBTSVM
except ImportError:
print("GPBTSVM not available")
exit(0)
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
kernel=GaussianKernel(feats_train, feats_train, width)
svm=GPBTSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('GPBTSVM')
classifier_gpbtsvm_modular(*parameter_list[0])
# In this example a multi-class support vector machine classifier is trained on a
# toy data set and the trained classifier is then used to predict labels of test
# examples. As training algorithm the LaRank algorithm is used with SVM
# regularization parameter C=1 and a Gaussian kernel of width 2.1 and a precision
# set to epsilon=1e-5.
#
# For more details on LaRank see
# Bordes, A. and Bottou, L. and Gallinari, P. and Weston, J.
# Solving MultiClass Support Vector Machines with LaRank. ICML 2007.
#
#!/usr/bin/env python
from numpy import *
parameter_list = [[10,3,15,0.9,1,2000,1],[20,4,15,0.9,1,5000,2]]
def classifier_larank_modular (num_vec,num_class,distance,C=0.9,num_threads=1,num_iter=5,seed=1):
from modshogun import RealFeatures, MulticlassLabels
from modshogun import GaussianKernel
from modshogun import LaRank
from modshogun import Math_init_random
# reproducible results
Math_init_random(seed)
random.seed(seed)
# generate some training data where each class pair is linearly separable
label_train=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
label_test=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
fm_train=array(random.randn(num_class,num_vec))
fm_test=array(random.randn(num_class,num_vec))
for i in range(len(label_train)):
fm_train[int(label_train[i]),i]+=distance
fm_test[int(label_test[i]),i]+=distance
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
epsilon=1e-5
labels=MulticlassLabels(label_train)
svm=LaRank(C, kernel, labels)
#svm.set_tau(1e-3)
svm.set_batch_mode(False)
#svm.io.enable_progress()
svm.set_epsilon(epsilon)
svm.train()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('LaRank')
[predictions, svm, labels] = classifier_larank_modular(*parameter_list[0])
# In this example a two-class linear classifier based on the Linear Discriminant
# Analysis (LDA) is trained on a toy data set and then the trained classifier is
# used to predict test examples. The regularization parameter, which corresponds
# to the weight of an identity matrix added to the covariance matrix, is set to
# gamma=3.
#
# For more details on the LDA see e.g.
# http://en.wikipedia.org/wiki/Linear_discriminant_analysis
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,3,1],[traindat,testdat,label_traindat,4,1]]
def classifier_lda_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,gamma=3,num_threads=1):
from modshogun import RealFeatures, BinaryLabels, LDA, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
lda=LDA(gamma, feats_train, labels)
lda.train()
bias=lda.get_bias()
w=lda.get_w()
predictions = lda.apply(feats_test).get_labels()
return lda,predictions
if __name__=='__main__':
print('LDA')
classifier_lda_modular(*parameter_list[0])
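# Supplementary sketch (plain numpy, illustrative rather than Shogun's exact
# implementation): two-class LDA where gamma scales an identity matrix added
# to the pooled within-class covariance before inversion.
import numpy as np

def lda_sketch(X, y, gamma=3.0):
    # X: (dim, n) with one example per column; y: (n,) labels in {-1, +1}
    Xp, Xn = X[:, y > 0], X[:, y < 0]
    mu_p, mu_n = Xp.mean(1), Xn.mean(1)
    # pooled within-class covariance, regularized by gamma * I
    scatter = np.cov(Xp) * (Xp.shape[1] - 1) + np.cov(Xn) * (Xn.shape[1] - 1)
    cov = scatter / (X.shape[1] - 2) + gamma * np.eye(X.shape[0])
    w = np.linalg.solve(cov, mu_p - mu_n)
    b = -0.5 * w.dot(mu_p + mu_n)
    return w, b  # predict with sign(w.x + b)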
# In this example a one-class support vector machine classifier is trained on a
# toy data set. The training algorithm finds a hyperplane in the RKHS which
# separates the training data from the origin. The one-class classifier is
# typically used to estimate the support of a high-dimensional distribution.
# For more details see e.g.
# B. Schoelkopf et al. Estimating the support of a high-dimensional
# distribution. Neural Computation, 13, 2001, 1443-1471.
#
# In the example, the one-class SVM is trained by the LIBSVM solver with the
# regularization parameter C=1 and the Gaussian kernel of width 2.1 and the
# precision parameter epsilon=1e-5.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat,2.2,1,1e-7],[traindat,testdat,2.1,1,1e-5]]
def classifier_libsvmoneclass_modular (train_fname=traindat,test_fname=testdat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, GaussianKernel, LibSVMOneClass, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=GaussianKernel(feats_train, feats_train, width)
svm=LibSVMOneClass(C, kernel)
svm.set_epsilon(epsilon)
svm.train()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('LibSVMOneClass')
classifier_libsvmoneclass_modular(*parameter_list[0])
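# Supplementary sketch (plain numpy): the decision rule of a trained one-class
# SVM is sign(sum_i alpha_i k(x_i, x) - rho); points with a positive value lie
# inside the estimated support. Names here are illustrative.
import numpy as np

def oneclass_decision(K_test, alphas, rho):
    # K_test: (n_sv, m) kernel values between support vectors and test points
    # alphas: (n_sv,) support-vector coefficients; rho: learned offset
    return K_test.T.dot(alphas) - rho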
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Minimal Primal Dual SVM is used with SVM
# regularization parameter C=1 and a Gaussian kernel of width 1.2 and the
# precision parameter 1e-5.
#
# For more details on the MPD solver see
# Kienzle, W. and B. Schölkopf: Training Support Vector Machines with Multiple
# Equality Constraints. Machine Learning: ECML 2005, 182-193. (Eds.) Carbonell,
# J. G., J. Siekmann, Springer, Berlin, Germany (11 2005)
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,1,1e-5],[traindat,testdat,label_traindat,0.9,1e-5]]
def classifier_mpdsvm_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=1,epsilon=1e-5):
from modshogun import RealFeatures, BinaryLabels
from modshogun import GaussianKernel
from modshogun import MPDSVM, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
width=2.1
kernel=GaussianKernel(feats_train, feats_train, width)
svm=MPDSVM(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.train()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('MPDSVM')
classifier_mpdsvm_modular(*parameter_list[0])
#!/usr/bin/env python
import re
import time
from tools.multiclass_shared import prepare_data
# run with toy data
[traindat, label_traindat, testdat, label_testdat] = prepare_data()
# run with opt-digits if available
#[traindat, label_traindat, testdat, label_testdat] = prepare_data(False)
parameter_list = [[traindat,testdat,label_traindat,label_testdat,2.1,1,1e-5]]
def classifier_multiclass_ecoc (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,lawidth=2.1,C=1,epsilon=1e-5):
import modshogun
from modshogun import ECOCStrategy, LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
from modshogun import MulticlassAccuracy
from modshogun import RealFeatures, MulticlassLabels
def nonabstract_class(name):
try:
getattr(modshogun, name)()
except TypeError:
return False
return True
encoders = [x for x in dir(modshogun)
if re.match(r'ECOC.+Encoder', x) and nonabstract_class(x)]
decoders = [x for x in dir(modshogun)
if re.match(r'ECOC.+Decoder', x) and nonabstract_class(x)]
fea_train = RealFeatures(fm_train_real)
fea_test = RealFeatures(fm_test_real)
gnd_train = MulticlassLabels(label_train_multiclass)
if label_test_multiclass is None:
gnd_test = None
else:
gnd_test = MulticlassLabels(label_test_multiclass)
base_classifier = LibLinear(L2R_L2LOSS_SVC)
base_classifier.set_bias_enabled(True)
#print('Testing with %d encoders and %d decoders' % (len(encoders), len(decoders)))
#print('-' * 70)
#format_str = '%%15s + %%-10s %%-10%s %%-10%s %%-10%s'
#print((format_str % ('s', 's', 's')) % ('encoder', 'decoder', 'codelen', 'time', 'accuracy'))
def run_ecoc(ier, idr):
encoder = getattr(modshogun, encoders[ier])()
decoder = getattr(modshogun, decoders[idr])()
# whether encoder is data dependent
if hasattr(encoder, 'set_labels'):
encoder.set_labels(gnd_train)
encoder.set_features(fea_train)
strategy = ECOCStrategy(encoder, decoder)
classifier = LinearMulticlassMachine(strategy, fea_train, base_classifier, gnd_train)
classifier.train()
label_pred = classifier.apply(fea_test)
if gnd_test is not None:
evaluator = MulticlassAccuracy()
acc = evaluator.evaluate(label_pred, gnd_test)
else:
acc = None
return (classifier.get_num_machines(), acc)
for ier in range(len(encoders)):
for idr in range(len(decoders)):
t_begin = time.clock()
(codelen, acc) = run_ecoc(ier, idr)
if acc is None:
acc_fmt = 's'
acc = 'N/A'
else:
acc_fmt = '.4f'
t_elapse = time.clock() - t_begin
#print((format_str % ('d', '.3f', acc_fmt)) %
# (encoders[ier][4:-7], decoders[idr][4:-7], codelen, t_elapse, acc))
if __name__=='__main__':
print('MulticlassECOC')
classifier_multiclass_ecoc(*parameter_list[0])
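# Supplementary sketch (plain numpy, independent of Shogun's encoder/decoder
# classes): ECOC reduces a multiclass problem to several binary problems via
# a code matrix, and decodes by picking the class whose codeword best matches
# the binary outputs. Shown with a one-vs-rest code and sign agreement.
import numpy as np

def ovr_code_matrix(num_classes):
    # rows = classes, columns = binary problems; +1 for the class, -1 otherwise
    return 2.0 * np.eye(num_classes) - 1.0

def ecoc_decode(binary_outputs, code_matrix):
    agreement = code_matrix.dot(np.sign(binary_outputs))
    return int(np.argmax(agreement))

codes = ovr_code_matrix(3)
print(ecoc_decode(np.array([-0.4, 1.7, -0.2]), codes))  # 1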
#!/usr/bin/env python
from tools.multiclass_shared import prepare_data
[traindat, label_traindat, testdat, label_testdat] = prepare_data(False)
parameter_list = [[traindat,testdat,label_traindat,label_testdat,2.1,1,1e-5],[traindat,testdat,label_traindat,label_testdat,2.2,1,1e-5]]
def classifier_multiclassliblinear_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, MulticlassLabels
from modshogun import MulticlassLibLinear
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
labels=MulticlassLabels(label_train_multiclass)
classifier = MulticlassLibLinear(C,feats_train,labels)
classifier.train()
label_pred = classifier.apply(feats_test)
out = label_pred.get_labels()
if label_test_multiclass is not None:
from modshogun import MulticlassAccuracy
labels_test = MulticlassLabels(label_test_multiclass)
evaluator = MulticlassAccuracy()
acc = evaluator.evaluate(label_pred, labels_test)
print('Accuracy = %.4f' % acc)
return out
if __name__=='__main__':
print('MulticlassLibLinear')
classifier_multiclassliblinear_modular(*parameter_list[0])
#!/usr/bin/env python
from tools.multiclass_shared import prepare_data
[traindat, label_traindat, testdat, label_testdat] = prepare_data()
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]
def classifier_multiclassmachine_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, MulticlassLabels
from modshogun import GaussianKernel
from modshogun import LibSVM, KernelMulticlassMachine, MulticlassOneVsRestStrategy
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=MulticlassLabels(label_train_multiclass)
classifier = LibSVM()
classifier.set_epsilon(epsilon)
#print labels.get_labels()
mc_classifier = KernelMulticlassMachine(MulticlassOneVsRestStrategy(),kernel,classifier,labels)
mc_classifier.train()
kernel.init(feats_train, feats_test)
out = mc_classifier.apply().get_labels()
return out
if __name__=='__main__':
print('MulticlassMachine')
classifier_multiclassmachine_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import *
parameter_list = [[10,3,15,2.1,1,1e-5,1],[20,4,15,2.2,2,1e-5,2]]
def classifier_multiclassocas_modular (num_vec=10,num_class=3,distance=15,width=2.1,C=1,epsilon=1e-5,seed=1):
from modshogun import RealFeatures, MulticlassLabels
from modshogun import Math_init_random
try:
from modshogun import MulticlassOCAS
except ImportError:
print("MulticlassOCAS not available")
return
# reproducible results
random.seed(seed)
Math_init_random(seed)
# generate some training data where each class pair is linearly separable
label_train=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
label_test=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
fm_train=array(random.randn(num_class,num_vec))
fm_test=array(random.randn(num_class,num_vec))
for i in range(len(label_train)):
fm_train[int(label_train[i]),i]+=distance
fm_test[int(label_test[i]),i]+=distance
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
labels=MulticlassLabels(label_train)
classifier = MulticlassOCAS(C,feats_train,labels)
classifier.train()
out = classifier.apply(feats_test).get_labels()
#print label_test
#print out
return out,classifier
if __name__=='__main__':
print('MulticlassOCAS')
classifier_multiclassocas_modular(*parameter_list[0])
#!/usr/bin/env python
from tools.multiclass_shared import prepare_data
[traindat, label_traindat, testdat, label_testdat] = prepare_data(False)
parameter_list = [[traindat,testdat,label_traindat,label_testdat,2.1,1,1e-5],[traindat,testdat,label_traindat,label_testdat,2.2,1,1e-5]]
def classifier_multilabeloutputliblinear_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,width=2.1,C=1,epsilon=1e-5):
from modshogun import RealFeatures, MulticlassLabels, MultilabelLabels
from modshogun import MulticlassLibLinear
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
labels=MulticlassLabels(label_train_multiclass)
classifier = MulticlassLibLinear(C,feats_train,labels)
classifier.train()
label_pred = classifier.apply_multilabel_output(feats_test,2)
out = label_pred.get_labels()
#print out
return out
if __name__=='__main__':
print('MultilabelOutputLibLinear')
classifier_multilabeloutputliblinear_modular(*parameter_list[0])
# This example shows usage of the Perceptron algorithm for training a two-class
# linear classifier, i.e. y = sign( <x,w>+b). The Perceptron algorithm works by
# iteratively passing through the training examples and applying the update rule on
# those examples which are misclassified by the current classifier. The Perceptron
# update rule reads
#
# w(t+1) = w(t) + alpha * y_t * x_t
# b(t+1) = b(t) + alpha * y_t
#
# where (x_t,y_t) is the feature vector and label (must be +1/-1) of the misclassified example,
# (w(t),b(t)) are the current parameters of the linear classifier
# (w(t+1),b(t+1)) are the new parameters of the linear classifier
# alpha is the learning rate; in this example alpha=1
#
# The Perceptron algorithm iterates until all training examples are correctly
# classified or the prescribed maximal number of iterations, in this example
# max_iter=1000, is reached.
#!/usr/bin/env python
from numpy import *
parameter_list = [[100, 2, 5,1.,1000,1,1], [100, 2, 5,1.,1000,1,2]]
def classifier_perceptron_modular (n=100, dim=2, distance=5,learn_rate=1.,max_iter=1000,num_threads=1,seed=1):
from modshogun import RealFeatures, BinaryLabels
from modshogun import Perceptron
random.seed(seed)
# produce some (probably) linearly separable training data by hand
# Two Gaussians at a far enough distance
X=array(random.randn(dim,n))+distance
Y=array(random.randn(dim,n))-distance
X_test=array(random.randn(dim,n))+distance
Y_test=array(random.randn(dim,n))-distance
label_train_twoclass=hstack((ones(n), -ones(n)))
#plot(X[0,:], X[1,:], 'x', Y[0,:], Y[1,:], 'o')
fm_train_real=hstack((X,Y))
fm_test_real=hstack((X_test,Y_test))
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
labels=BinaryLabels(label_train_twoclass)
perceptron=Perceptron(feats_train, labels)
perceptron.set_learn_rate(learn_rate)
perceptron.set_max_iter(max_iter)
# only guaranteed to converge for separable data
perceptron.train()
perceptron.set_features(feats_test)
out_labels = perceptron.apply().get_labels()
return perceptron, out_labels
if __name__=='__main__':
print('Perceptron')
classifier_perceptron_modular(*parameter_list[0])
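# Supplementary sketch (plain numpy): the update rule from the comment above,
# written out as a training loop. Names are illustrative.
import numpy as np

def perceptron_sketch(X, y, alpha=1.0, max_iter=1000):
    # X: (dim, n) with one example per column; y: (n,) labels in {-1, +1}
    w, b = np.zeros(X.shape[0]), 0.0
    for _ in range(max_iter):
        mistakes = 0
        for i in range(X.shape[1]):
            if y[i] * (w.dot(X[:, i]) + b) <= 0:  # misclassified
                w += alpha * y[i] * X[:, i]       # w(t+1) = w(t) + alpha * y_t * x_t
                b += alpha * y[i]                 # b(t+1) = b(t) + alpha * y_t
                mistakes += 1
        if mistakes == 0:  # all training examples correctly classified
            break
    return w, b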
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list = [[traindat,testdat,label_traindat,1,5,0.9]]
def classifier_ssk_modular (fm_train_dna=traindat,fm_test_dna=testdat,
label_train_dna=label_traindat,C=1,maxlen=1,decay=1):
from modshogun import StringCharFeatures, BinaryLabels
from modshogun import LibSVM, SubsequenceStringKernel, DNA
from modshogun import ErrorRateMeasure
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
labels=BinaryLabels(label_train_dna)
kernel=SubsequenceStringKernel(feats_train, feats_train, maxlen, decay)
svm=LibSVM(C, kernel, labels)
svm.train()
out=svm.apply(feats_train)
evaluator = ErrorRateMeasure()
trainerr = evaluator.evaluate(out,labels)
# print(trainerr)
kernel.init(feats_train, feats_test)
predicted_labels=svm.apply(feats_test).get_labels()
# print predicted_labels
return predicted_labels
if __name__=='__main__':
print('StringSubsequenceKernel classification DNA')
classifier_ssk_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=1 and the Weighted Degree kernel of the degree 20 and
# a precision parameter epsilon=1e-5. The LINADD trick is used to speed up
# training.
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel and the LINADD trick see
# Sonnenburg, S. and Rätsch, G. and Rieck, K. Large Scale Learning with String
# Kernels. In Bottou, Leon and Chapelle, Olivier and DeCoste, Dennis and Weston,
# Jason, editor, In Large Scale Kernel Machines, pages 73-103, MIT Press,
# Cambridge, MA. 2007.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
train_dna=lm.load_dna('../data/fm_train_dna.dat')
test_dna=lm.load_dna('../data/fm_test_dna.dat')
label=lm.load_labels('../data/label_train_dna.dat')
parameter_list=[[train_dna, test_dna, label, 20, 0.9, 1e-7, 1],
[train_dna, test_dna, label, 20, 2.3, 1e-7, 4]]
def classifier_svmlight_batch_linadd_modular (fm_train_dna, fm_test_dna,
label_train_dna, degree, C, epsilon, num_threads):
from modshogun import StringCharFeatures, BinaryLabels, DNA
from modshogun import WeightedDegreeStringKernel, MSG_DEBUG
try:
from modshogun import SVMLight
except ImportError:
print('No support for SVMLight available.')
return
feats_train=StringCharFeatures(DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels=BinaryLabels(label_train_dna)
svm=SVMLight(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()
kernel.init(feats_train, feats_test)
#print('SVMLight Objective: %f num_sv: %d' % \
#	(svm.get_objective(), svm.get_num_support_vectors()))
svm.set_batch_computation_enabled(False)
svm.set_linadd_enabled(False)
svm.apply().get_labels()
svm.set_batch_computation_enabled(True)
labels = svm.apply().get_labels()
return labels, svm
if __name__=='__main__':
print('SVMlight batch')
classifier_svmlight_batch_linadd_modular(*parameter_list[0])
# This example demonstrates how to train an SVMLight classifier
# using a custom linear term. This is used in the class DASVM that
# pre-computes this linear term using a previously trained SVM.
#
#!/usr/bin/env python
import numpy
traindna=['CGCACGTACGTAGCTCGAT',
'CGACGTAGTCGTAGTCGTA',
'CGACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACCACAGTTATATAGTA',
'CGACGTAGTCGTAGTCGTA',
'CGACGTAGTTTTTTTCGTA',
'CGACGTAGTCGTAGCCCCA',
'CAAAAAAAAAAAAAAAATA',
'CGACGGGGGGGGGGGCGTA']
label_traindna=numpy.array(5*[-1.0] + 5*[1.0])
testdna=['AGCACGTACGTAGCTCGAT',
'AGACGTAGTCGTAGTCGTA',
'CAACGGGGGGGGGGTCGTA',
'CGACCTAGTCGTAGTCGTA',
'CGAACACAGTTATATAGTA',
'CGACCTAGTCGTAGTCGTA',
'CGACGTGGGGTTTTTCGTA',
'CGACGTAGTCCCAGCCCCA',
'CAAAAAAAAAAAACCAATA',
'CGACGGCCGGGGGGGCGTA']
label_test_dna=numpy.array(5*[-1.0] + 5*[1.0])
parameter_list = [[traindna,testdna,label_traindna,3,10,1e-5,1],[traindna,testdna,label_traindna,3,10,1e-5,1]]
def classifier_svmlight_linear_term_modular (fm_train_dna=traindna,fm_test_dna=testdna, \
label_train_dna=label_traindna,degree=3, \
C=10,epsilon=1e-5,num_threads=1):
from modshogun import StringCharFeatures, BinaryLabels, DNA
from modshogun import WeightedDegreeStringKernel
try:
from modshogun import SVMLight
except ImportError:
print("SVMLight is not available")
exit(0)
feats_train=StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels=BinaryLabels(label_train_dna)
svm=SVMLight(C, kernel, labels)
svm.set_qpsize(3)
svm.set_linear_term(-numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double))
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()
kernel.init(feats_train, feats_test)
out = svm.apply().get_labels()
return out,kernel
if __name__=='__main__':
print('SVMLight')
classifier_svmlight_linear_term_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=1.2 and the Weighted Degree kernel of degree 20 and
# the precision parameter epsilon=1e-5.
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel see
# G. Raetsch, S. Sonnenburg, and B. Schoelkopf. RASE: recognition of alternatively
# spliced exons in C. elegans. Bioinformatics, 21:369-377, June 2005.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list = [[traindat,testdat,label_traindat,1.1,1e-5,1],[traindat,testdat,label_traindat,1.2,1e-5,1]]
def classifier_svmlight_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,C=1.2,epsilon=1e-5,num_threads=1):
from modshogun import StringCharFeatures, BinaryLabels, DNA
from modshogun import WeightedDegreeStringKernel
try:
from modshogun import SVMLight
except ImportError:
print('No support for SVMLight available.')
return
feats_train=StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)
degree=20
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels=BinaryLabels(label_train_dna)
svm=SVMLight(C, kernel, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()
kernel.init(feats_train, feats_test)
svm.apply().get_labels()
return kernel
if __name__=='__main__':
print('SVMLight')
classifier_svmlight_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier (SVM) is
# trained on a toy data set and the trained classifier is used to predict labels
# of test examples. As training algorithm the SVMLIN solver is used with the SVM
# regularization parameter C=0.9 and the bias in the classification rule switched
# on and the precision parameter epsilon=1e-5. The example also shows how to
# retrieve the parameters (vector w and bias b) of the trained linear classifier.
#
# For more details on the SVMLIN solver see
# V. Sindhwani, S.S. Keerthi. Newton Methods for Fast Solution of Semi-supervised
# Linear SVMs. Large Scale Kernel Machines MIT Press (Book Chapter), 2007
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,0.9,1e-5,1],[traindat,testdat,label_traindat,0.8,1e-5,1]]
def classifier_svmlin_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=0.9,epsilon=1e-5,num_threads=1):
from modshogun import RealFeatures, SparseRealFeatures, BinaryLabels
from modshogun import SVMLin, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
svm=SVMLin(C, feats_train, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.set_bias_enabled(True)
svm.train()
bias=svm.get_bias()
w=svm.get_w()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('SVMLin')
classifier_svmlin_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the OCAS solver is used with the SVM
# regularization parameter C=0.9 and the bias term in the classification rule
# switched off and the precision parameter epsilon=1e-5 (duality gap).
#
# For more details on the OCAS solver see
# V. Franc, S. Sonnenburg. Optimized Cutting Plane Algorithm for Large-Scale Risk
# Minimization. The Journal of Machine Learning Research, vol. 10,
# pp. 2157--2192. October 2009.
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,0.9,1e-5,1],[traindat,testdat,label_traindat,0.8,1e-5,1]]
def classifier_svmocas_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=0.9,epsilon=1e-5,num_threads=1):
from modshogun import RealFeatures, BinaryLabels
from modshogun import CSVFile
try:
from modshogun import SVMOcas
except ImportError:
print("SVMOcas not available")
return
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
svm=SVMOcas(C, feats_train, labels)
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.set_bias_enabled(False)
svm.train()
bias=svm.get_bias()
w=svm.get_w()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('SVMOcas')
classifier_svmocas_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Stochastic Gradient Descent (SGD) solver is
# used with the SVM regularization parameter C=0.9. The number of iterations, i.e.
# passes through all training examples, is set to num_iter=5.
#
# For more details on the SGD solver see
# L. Bottou, O. Bousquet. The tradeoff of large scale learning. In NIPS 20. MIT
# Press. 2008.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,testdat,label_traindat,0.9,1,6],[traindat,testdat,label_traindat,0.8,1,5]]
def classifier_svmsgd_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=0.9,num_threads=1,num_iter=5):
from modshogun import RealFeatures, SparseRealFeatures, BinaryLabels
from modshogun import SVMSGD, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=BinaryLabels(CSVFile(label_fname))
svm=SVMSGD(C, feats_train, labels)
svm.set_epochs(num_iter)
#svm.io.set_loglevel(0)
svm.train()
bias=svm.get_bias()
w=svm.get_w()
predictions = svm.apply(feats_test)
return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('SVMSGD')
classifier_svmsgd_modular(*parameter_list[0])
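# Supplementary sketch (plain numpy): one SGD "epoch" is a pass over the
# training examples, taking a subgradient step on the regularized hinge loss.
# The step-size schedule below is illustrative, not necessarily the one
# SVMSGD uses internally.
import numpy as np

def svm_sgd_sketch(X, y, C=0.9, num_iter=5):
    # X: (dim, n) with one example per column; y: (n,) labels in {-1, +1}
    dim, n = X.shape
    lam = 1.0 / (C * n)  # regularization strength, in 1/C form
    w, b, t = np.zeros(dim), 0.0, 1
    for _ in range(num_iter):      # num_iter passes over the data
        for i in range(n):
            eta = 1.0 / (lam * t)  # simple decaying step size
            t += 1
            margin = y[i] * (w.dot(X[:, i]) + b)
            w *= 1.0 - eta * lam   # shrinkage from the L2 regularizer
            if margin < 1:         # subgradient step on the hinge loss
                w += eta * y[i] * X[:, i]
                b += eta * y[i]
    return w, b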
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,10],[data,20]]
def converter_diffusionmaps_modular (data_fname,t):
try:
from modshogun import RealFeatures, DiffusionMaps, GaussianKernel, CSVFile
features = RealFeatures(CSVFile(data_fname))
converter = DiffusionMaps()
converter.set_target_dim(1)
converter.set_kernel(GaussianKernel(10,10.0))
converter.set_t(t)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('DiffusionMaps')
converter_diffusionmaps_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data]]
def converter_factoranalysis_modular(data_fname):
try:
import numpy
from modshogun import RealFeatures, FactorAnalysis, EuclideanDistance, CSVFile
features = RealFeatures(CSVFile(data_fname))
converter = FactorAnalysis()
converter.set_target_dim(2)
embedding = converter.apply(features)
X = embedding.get_feature_matrix()
covdet = numpy.linalg.det(numpy.dot(X,X.T))
return covdet > 0
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('Factor Analysis')
converter_factoranalysis_modular(*parameter_list[0])
#!/usr/bin/env python
strings=['example document 1','example document 2','example document 3','example document 4']
parameter_list=[[strings]]
def converter_hasheddoc_modular(strings):
from modshogun import SparseRealFeatures, RAWBYTE, StringCharFeatures, Features, HashedDocDotFeatures
from modshogun import NGramTokenizer
from modshogun import HashedDocConverter
from numpy import array
#create string features
f=StringCharFeatures(strings, RAWBYTE)
#set the number of bits of the target dimension
#means a dim of size 2^5=32
num_bits=5
#create the ngram tokenizer of size 8 to parse the strings
tokenizer=NGramTokenizer(8)
#normalize results
normalize=True
#create converter
converter=HashedDocConverter(tokenizer, num_bits, normalize)
converted_feats=converter.apply(f)
#should expect 32
#print('Converted features\' space dimensionality is', converted_feats.get_dim_feature_space())
#print('Self dot product of string 0 with converted feats:', converted_feats.dot(0, converted_feats, 0))
hashed_feats=HashedDocDotFeatures(num_bits, f, tokenizer, normalize)
#print('Hashed features\' space dimensionality is', hashed_feats.get_dim_feature_space())
#print('Self dot product of string 0 with hashed feats:', hashed_feats.dot(0, hashed_feats, 0))
return converted_feats
if __name__=='__main__':
print('HashedDocConverter')
converter_hasheddoc_modular(*parameter_list[0])
# In this example toy data is being preprocessed using the Hessian Locally Linear Embedding algorithm
# as described in
#
# Donoho, D., & Grimes, C. (2003).
# Hessian eigenmaps: new tools for nonlinear dimensionality reduction.
# Proceedings of National Academy of Science (Vol. 100, pp. 5591-5596).
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_hessianlocallylinearembedding_modular (data_fname,k):
try:
from modshogun import RealFeatures, CSVFile
try:
from modshogun import HessianLocallyLinearEmbedding
except ImportError:
print("HessianLocallyLinearEmbedding not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
converter = HessianLocallyLinearEmbedding()
converter.set_target_dim(1)
converter.set_k(k)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('HessianLocallyLinearEmbedding')
converter_hessianlocallylinearembedding_modular(*parameter_list[0])
# In this example toy data is being processed using the Isomap algorithm
# as described in
#
# Silva, V. D., & Tenenbaum, J. B. (2003).
# Global versus local methods in nonlinear dimensionality reduction.
# Advances in Neural Information Processing Systems 15, 15(Figure 2), 721-728. MIT Press.
# Retrieved from http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.3407&rep=rep1&type=pdf
#
# Before applying to the data the landmark approximation is enabled with
# specified number of landmarks. The landmark approximation is described in
#
# Sparse multidimensional scaling using landmark points
# V De Silva, J B Tenenbaum (2004) Technology, p. 1-4
#
# After enabling the landmark approximation, the k parameter -- the number
# of neighbors in the k nearest neighbor graph -- is initialized.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data]]
def converter_isomap_modular (data_fname):
from modshogun import RealFeatures, CSVFile
from modshogun import Isomap
features = RealFeatures(CSVFile(data_fname))
converter = Isomap()
converter.set_k(20)
converter.set_target_dim(1)
converter.apply(features)
return features
if __name__=='__main__':
print('Isomap')
#converter_isomap_modular(*parameter_list[0])
# In this example toy data is being processed using the kernel extension
# of the Locally Linear Embedding (LLE) algorithm as described in
#
# Kayo, O. (2006). Locally linear embedding algorithm. Extensions and applications. October.
# Retrieved from: http://herkules.oulu.fi/isbn9514280415/isbn9514280415.pd
#
# A linear kernel is used as the kernel of the extension.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_kernellocallylinearembedding_modular (data_fname,k):
try:
from modshogun import RealFeatures, LinearKernel, CSVFile
try:
from modshogun import KernelLocallyLinearEmbedding
except ImportError:
print("KernelLocallyLinearEmbedding not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
kernel = LinearKernel()
converter = KernelLocallyLinearEmbedding(kernel)
converter.set_target_dim(1)
converter.set_k(k)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('KernelLocallyLinearEmbedding')
converter_kernellocallylinearembedding_modular(*parameter_list[0])
# In this example toy data is being processed using Laplacian Eigenmaps
# algorithm as described in
#
# Belkin, M., & Niyogi, P. (2002).
# Laplacian Eigenmaps and Spectral Techniques for Embedding and Clustering.
# Science, 14, 585-591. MIT Press.
# Retrieved from http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.19.9400&rep=rep1&type=pdf
#
# The number of neighbors for the kNN graph and the heat-distribution
# coefficient are set before processing the data.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_laplacianeigenmaps_modular (data_fname,k):
try:
from modshogun import RealFeatures, CSVFile
try:
from modshogun import LaplacianEigenmaps
except ImportError:
print("LaplacianEigenmaps not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
converter = LaplacianEigenmaps()
converter.set_target_dim(1)
converter.set_k(k)
converter.set_tau(20.0)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('LaplacianEigenmaps')
converter_laplacianeigenmaps_modular(*parameter_list[0])
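# Supplementary sketch (plain numpy): the role of tau. Edges of the kNN graph
# are weighted with the heat kernel W_ij = exp(-||x_i - x_j||^2 / tau); the
# dense version is shown here, ignoring the kNN sparsification.
import numpy as np

def heat_weights(X, tau=20.0):
    # X: (dim, n) with one example per column
    sq = ((X[:, :, None] - X[:, None, :])**2).sum(0)
    return np.exp(-sq / tau)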
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_linearlocaltangentspacealignment_modular (data_fname,k):
try:
from modshogun import RealFeatures, CSVFile
try:
from modshogun import LinearLocalTangentSpaceAlignment
except ImportError:
print("LinearLocalTangentSpaceAlignment not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
converter = LinearLocalTangentSpaceAlignment()
converter.set_target_dim(1)
converter.set_k(k)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('LinearLocalTangentSpaceAlignment')
converter_linearlocaltangentspacealignment_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_localitypreservingprojections_modular (data_fname,k):
from modshogun import RealFeatures, CSVFile
from modshogun import LocalityPreservingProjections
features = RealFeatures(CSVFile(data_fname))
converter = LocalityPreservingProjections()
converter.set_target_dim(1)
converter.set_k(k)
converter.set_tau(2.0)
converter.apply(features)
return features
if __name__=='__main__':
print('LocalityPreservingProjections')
#converter_localitypreservingprojections_modular(*parameter_list[0])
# In this example toy data is being preprocessed using the Locally Linear Embedding (LLE)
# algorithm as described in
#
# Saul, L. K., Ave, P., Park, F., & Roweis, S. T. (2001).
# An Introduction to Locally Linear Embedding. Available from, 290(5500), 2323-2326.
# Retrieved from: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7319&rep=rep1&type=pdf
#
# The number of neighbors used during the linear reconstruction step of the algorithm is set
# before processing of the data.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_locallylinearembedding_modular (data_fname,k):
try:
from modshogun import RealFeatures, CSVFile
try:
from modshogun import LocallyLinearEmbedding
except ImportError:
print("LocallyLinearEmbedding not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
converter = LocallyLinearEmbedding()
converter.set_target_dim(1)
converter.set_k(k)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('LocallyLinearEmbedding')
converter_locallylinearembedding_modular(*parameter_list[0])
# In this example toy data is being processed using the Local Tangent Space
# Alignment (LTSA) algorithm as described in
#
# Zhang, Z., & Zha, H. (2002). Principal Manifolds
# and Nonlinear Dimension Reduction via Local Tangent Space Alignment.
# Journal of Shanghai University English Edition, 8(4), 406-424. SIAM.
# Retrieved from http://arxiv.org/abs/cs/0212008
#
# Before processing, the number of neighbors used for computing the local
# tangent space is set.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data,20],[data,30]]
def converter_localtangentspacealignment_modular (data_fname,k):
try:
from modshogun import RealFeatures, CSVFile
try:
from modshogun import LocalTangentSpaceAlignment
except ImportError:
print("LocalTangentSpaceAlignment not available")
exit(0)
features = RealFeatures(CSVFile(data_fname))
converter = LocalTangentSpaceAlignment()
converter.set_target_dim(1)
converter.set_k(k)
converter.apply(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('LocalTangentSpaceAlignment')
converter_localtangentspacealignment_modular(*parameter_list[0])
# In this example toy data is being processed using the multidimensional
# scaling as described on p.261 (Section 12.1) of
#
# Borg, I., & Groenen, P. J. F. (2005).
# Modern multidimensional scaling: Theory and applications. Springer.
#
# Before processing, the landmark approximation is disabled.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data]]
def converter_multidimensionalscaling_modular (data_fname):
try:
import numpy
from modshogun import RealFeatures, MultidimensionalScaling, EuclideanDistance, CSVFile
features = RealFeatures(CSVFile(data_fname))
distance_before = EuclideanDistance()
distance_before.init(features,features)
converter = MultidimensionalScaling()
converter.set_target_dim(2)
converter.set_landmark(False)
embedding = converter.apply(features)
distance_after = EuclideanDistance()
distance_after.init(embedding,embedding)
distance_matrix_after = distance_after.get_distance_matrix()
distance_matrix_before = distance_before.get_distance_matrix()
return numpy.linalg.norm(distance_matrix_after-distance_matrix_before)/numpy.linalg.norm(distance_matrix_before) < 1e-6
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('MultidimensionalScaling')
converter_multidimensionalscaling_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data, 20]]
def converter_stochasticproximityembedding_modular (data_fname, k):
try:
from modshogun import RealFeatures,StochasticProximityEmbedding, SPE_GLOBAL, SPE_LOCAL, CSVFile
features = RealFeatures(CSVFile(data_fname))
converter = StochasticProximityEmbedding()
converter.set_target_dim(1)
converter.set_nupdates(40)
# Embed with local strategy
converter.set_k(k)
converter.set_strategy(SPE_LOCAL)
converter.embed(features)
# Embed with global strategy
converter.set_strategy(SPE_GLOBAL)
converter.embed(features)
return features
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('StochasticProximityEmbedding')
converter_stochasticproximityembedding_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'
parameter_list = [[data]]
def converter_tdistributedstochasticneighborembedding_modular(data_fname, seed=1):
try:
from modshogun import RealFeatures, TDistributedStochasticNeighborEmbedding
from modshogun import Math_init_random, CSVFile
# reproducible results
Math_init_random(seed)
features = RealFeatures(CSVFile(data_fname))
converter = TDistributedStochasticNeighborEmbedding()
converter.set_target_dim(2)
embedding = converter.apply(features)
return embedding
except ImportError:
print('No Eigen3 available')
if __name__=='__main__':
print('TDistributedStochasticNeighborEmbedding')
converter_tdistributedstochasticneighborembedding_modular(*parameter_list[0])
# The pattern applied below, loading input data from a file and feeding it
# to a distance object, is the basic workflow for using any of the distance
# functions provided by shogun in your own applications.
#
# First, determine the type of your data, since that determines which
# distance functions you can use.
#
# This example loads two matrices of real values from different files and
# wraps them in 'RealFeatures'. Each column of a matrix corresponds to one
# data point.
#
# The distance object is first initialized with the training data on both
# sides, so 'get_distance_matrix' computes the pairwise distance matrix
# within that set.
#
# A subsequent call to 'init' binds the training and test sets, after which
# 'get_distance_matrix' computes the pairwise distance matrix between the
# two sets. Note that the previously computed distance matrix can no longer
# be retrieved after re-initialization.
#
# For more details see doc/classshogun_1_1CBrayCurtisDistance.html.
#
# Using the Bray-Curtis distance is, of course, not limited to this showcase
# example.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_braycurtis_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, BrayCurtisDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=BrayCurtisDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('BrayCurtisDistance')
distance_braycurtis_modular(*parameter_list[0])
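# Supplementary sketch (plain numpy): the Bray-Curtis dissimilarity between
# two vectors is sum(|x_i - y_i|) / sum(|x_i + y_i|), computed here pairwise
# to mirror the distance-matrix usage above.
import numpy as np

def braycurtis(x, y):
    denom = np.abs(x + y).sum()
    return np.abs(x - y).sum() / denom if denom != 0 else 0.0

def pairwise_matrix(A, B, dist=braycurtis):
    # one data point per column, as in RealFeatures above
    return np.array([[dist(a, b) for b in B.T] for a in A.T])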
# The pattern applied below, loading input data from a file and feeding it
# to a distance object, is the basic workflow for using any of the distance
# functions provided by shogun in your own applications.
#
# First, determine the type of your data, since that determines which
# distance functions you can use.
#
# This example loads two matrices of real values from different files and
# wraps them in 'RealFeatures'. Each column of a matrix corresponds to one
# data point.
#
# The distance object is first initialized with the training data on both
# sides, so 'get_distance_matrix' computes the pairwise distance
# (dissimilarity ratio) matrix within that set.
#
# A subsequent call to 'init' binds the training and test sets, after which
# 'get_distance_matrix' computes the pairwise distance matrix between the
# two sets. Note that the previously computed distance matrix can no longer
# be retrieved after re-initialization.
#
# For more details see doc/classshogun_1_1CCanberraMetric.html.
#
# Using the Canberra distance is, of course, not limited to this showcase
# example.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_canberra_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, CanberraMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=CanberraMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('CanberraMetric')
distance_canberra_modular(*parameter_list[0])
# This example shows how to compute the Canberra Word Distance.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindna,testdna,3,0,False],[traindna,testdna,3,0,False]]
def distance_canberraword_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False):
from modshogun import StringCharFeatures, StringWordFeatures, DNA
from modshogun import SortWordString
from modshogun import CanberraWordDistance
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
distance=CanberraWordDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('CanberraWordDistance')
distance_canberraword_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (maximum of absolute feature dimension differences) matrix is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (maximum of absolute feature dimension differences) matrix between these
# two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CChebyshewMetric.html.
#
# Obviously, using the Chebyshev distance (spelled 'Chebyshew' in the shogun
# class name) is not limited to this showcase example.
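#
# For reference, the Chebyshev distance between two vectors x and y is
# d(x,y) = max_i |x_i - y_i|.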
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_chebyshew_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, ChebyshewMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=ChebyshewMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('ChebyshewMetric')
distance_chebyshew_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CChiSquareDistance.html.
#
# Obviously, using the ChiSquare distance is not limited to this showcase
# example.
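#
# For reference, the chi-square distance between two vectors x and y is
# commonly defined as d(x,y) = sum_i (x_i - y_i)^2 / (x_i + y_i).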
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_chisquare_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, ChiSquareDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=ChiSquareDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('ChiSquareDistance')
distance_chisquare_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CCosineDistance.html.
#
# Obviously, using the Cosine distance is not limited to this showcase
# example.
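#
# For reference, the cosine distance between two vectors x and y is
# d(x,y) = 1 - <x,y> / (||x|| * ||y||).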
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_cosine_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, CosineDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=CosineDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('CosineDistance')
distance_cosine_modular(*parameter_list[0])
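# This example implements a custom Euclidean distance in Python by subclassing
# DirectorDistance (requires shogun built with swig directors) and compares it
# against the built-in EuclideanDistance on random data.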
#!/usr/bin/env python
import numpy
from modshogun import RealFeatures, MSG_DEBUG
numpy.random.seed(17)
traindat = numpy.random.random_sample((10,10))
testdat = numpy.random.random_sample((10,10))
parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.4]]
def distance_director_euclidean_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.2):
try:
from modshogun import DirectorDistance
except ImportError:
print("recompile shogun with --enable-swig-directors")
return
class DirectorEuclideanDistance(DirectorDistance):
def __init__(self):
DirectorDistance.__init__(self, True)
def distance_function(self, idx_a, idx_b):
seq1 = self.get_lhs().get_feature_vector(idx_a)
seq2 = self.get_rhs().get_feature_vector(idx_b)
return numpy.linalg.norm(seq1-seq2)
from modshogun import EuclideanDistance
from modshogun import Time
feats_train=RealFeatures(fm_train_real)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_train.parallel.set_num_threads(1)
feats_test=RealFeatures(fm_test_real)
distance=EuclideanDistance()
distance.init(feats_train, feats_test)
ddistance=DirectorEuclideanDistance()
ddistance.init(feats_train, feats_test)
#print "dm_train"
t=Time()
dm_train=distance.get_distance_matrix()
#t1=t.cur_time_diff(True)
#print "ddm_train"
t=Time()
ddm_train=ddistance.get_distance_matrix()
#t2=t.cur_time_diff(True)
#print "dm_train", dm_train
#print "ddm_train", ddm_train
return dm_train, ddm_train
if __name__=='__main__':
print('DirectorEuclideanDistance')
distance_director_euclidean_modular(*parameter_list[0])
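# This example computes the standard Euclidean distance,
# d(x,y) = sqrt(sum_i (x_i - y_i)^2), for real-valued features loaded from a file.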
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,traindat],[traindat,testdat]]
def distance_euclidean_modular(train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('EuclideanDistance')
distance_euclidean_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a
# pairwise distance (shortest path on a sphere) matrix is computed
# by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (shortest path on a sphere) matrix between these two data sets is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CGeodesicMetric.html.
#
# Obviously, using the Geodesic distance is not limited to this showcase
# example.
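#
# For reference, the geodesic distance here is, to our understanding, the angle
# d(x,y) = arccos(<x,y> / (||x|| * ||y||)) between the two vectors.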
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_geodesic_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, GeodesicMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=GeodesicMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('GeodesicMetric')
distance_geodesic_modular(*parameter_list[0])
# This example shows how to compute the Hamming Word Distance for string features.
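# The Hamming distance between two word vectors counts the number of entries
# in which they differ (with use_sign, only presence/absence of each word is
# compared).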
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
testdat = lm.load_labels('../data/fm_test_real.dat')
parameter_list = [[traindna,testdna,testdat,4,0,False,False],
[traindna,testdna,testdat,3,0,False,False]]
def distance_hammingword_modular (fm_train_dna=traindna,fm_test_dna=testdna,
fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False):
from modshogun import StringCharFeatures, StringWordFeatures, DNA
from modshogun import SortWordString
from modshogun import HammingWordDistance
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
distance=HammingWordDistance(feats_train, feats_train, use_sign)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('HammingWordDistance')
distance_hammingword_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (divergence measure based on the Kullback-Leibler divergence) matrix
# is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (divergence measure based on the Kullback-Leibler divergence) matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CJensenMetric.html.
#
# Obviously, using the Jensen-Shannon distance/divergence is not limited to
# this showcase example.
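#
# For reference, the divergence used here is, roughly, the Jensen difference
# d(x,y) = sum_i [ x_i*log(2*x_i/(x_i+y_i)) + y_i*log(2*y_i/(x_i+y_i)) ].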
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_jensen_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, JensenMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=JensenMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('JensenMetric')
distance_jensen_modular(*parameter_list[0])
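# This example shows how to compute the Mahalanobis distance,
# d(x,y) = sqrt((x-y)' * C^-1 * (x-y)), where C is the covariance matrix
# estimated from the training features.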
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat, testdat]]
def distance_mahalanobis_modular (train_fname = traindat, test_fname = testdat):
from modshogun import RealFeatures, CSVFile
from modshogun import MahalanobisDistance
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance = MahalanobisDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('MahalanobisDistance')
distance_mahalanobis_modular(*parameter_list[0])
# This example shows how to compute the Manhattan Distance.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_manhatten_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, ManhattanMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=ManhattanMetric(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('ManhattanMetric')
distance_manhatten_modular(*parameter_list[0])
# This example shows how to compute the Manhattan Distance for string features.
#!/usr/bin/env python
traindna = '../data/fm_train_dna.dat'
testdna = '../data/fm_test_dna.dat'
parameter_list = [[traindna,testdna,3,0,False],[traindna,testdna,4,0,False]]
def distance_manhattenword_modular (train_fname=traindna,test_fname=testdna,order=3,gap=0,reverse=False):
from modshogun import StringCharFeatures, StringWordFeatures, DNA
from modshogun import SortWordString, ManhattanWordDistance, CSVFile
charfeat=StringCharFeatures(CSVFile(train_fname), DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(CSVFile(test_fname), DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
distance=ManhattanWordDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return dm_train,dm_test
if __name__=='__main__':
print('ManhattanWordDistance')
distance_manhattenword_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) and norm 'k' controls the processing of the given data points,
# where a pairwise distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CMinkowskiMetric.html.
#
# Obviously, using the Minkowski metric is not limited to this showcase
# example.
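#
# For reference, the Minkowski distance of order k between two vectors x and y
# is d(x,y) = (sum_i |x_i - y_i|^k)^(1/k); k=2 recovers the Euclidean distance.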
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat,3],[traindat,testdat,4]]
def distance_minkowski_modular (train_fname=traindat,test_fname=testdat,k=3):
from modshogun import RealFeatures, MinkowskiMetric, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=MinkowskiMetric(feats_train, feats_train, k)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('MinkowskiMetric')
distance_minkowski_modular(*parameter_list[0])
# In this example a squared Euclidean distance is computed for toy data.
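# Calling set_disable_sqrt(True) on EuclideanDistance yields the squared
# Euclidean distance d(x,y) = sum_i (x_i - y_i)^2.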
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_normsquared_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
distance.set_disable_sqrt(True)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('EuclideanDistance - NormSquared')
distance_normsquared_modular(*parameter_list[0])
# In this example a sparse Euclidean distance is computed for sparse toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_sparseeuclidean_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, SparseRealFeatures, SparseEuclideanDistance, CSVFile
realfeat=RealFeatures(CSVFile(train_fname))
feats_train=SparseRealFeatures()
feats_train.obtain_from_simple(realfeat)
realfeat=RealFeatures(CSVFile(test_fname))
feats_test=SparseRealFeatures()
feats_test.obtain_from_simple(realfeat)
distance=SparseEuclideanDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('SparseEuclideanDistance')
distance_sparseeuclidean_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data read
# from a file, is a key building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance initialized by two data sets (the same data set as shown in the
# first call) controls the processing of the given data points, where a pairwise
# distance (extended Jaccard coefficient) matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# The method call 'init'* binds the given data sets, where a pairwise distance
# (extended Jaccard coefficient) matrix between these two data sets is computed
# by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CTanimotoDistance.html.
#
# Obviously, using the Tanimoto distance/coefficient is not limited to
# this showcase example.
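#
# For reference, the Tanimoto (extended Jaccard) distance between two vectors
# x and y is commonly defined as
# d(x,y) = 1 - <x,y> / (||x||^2 + ||y||^2 - <x,y>).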
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat],[traindat,testdat]]
def distance_tanimoto_modular (train_fname=traindat,test_fname=testdat):
from modshogun import RealFeatures, TanimotoDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=TanimotoDistance(feats_train, feats_train)
dm_train=distance.get_distance_matrix()
distance.init(feats_train, feats_test)
dm_test=distance.get_distance_matrix()
return distance,dm_train,dm_test
if __name__=='__main__':
print('TanimotoDistance')
distance_tanimoto_modular(*parameter_list[0])
# In this example the Histogram distribution object computes a histogram over
# all 16-bit unsigned integers (words) in the features.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
parameter_list = [[traindna,3,0,False],[traindna,4,0,False]]
def distribution_histogram_modular (fm_dna=traindna,order=3,gap=0,reverse=False):
from modshogun import StringWordFeatures, StringCharFeatures, DNA
from modshogun import Histogram
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_dna)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
histo=Histogram(feats)
histo.train()
histo.get_histogram()
num_examples=feats.get_num_vectors()
num_param=histo.get_num_model_parameters()
#for i in xrange(num_examples):
# for j in xrange(num_param):
# histo.get_log_derivative(j, i)
out_likelihood = histo.get_log_likelihood()
out_sample = histo.get_log_likelihood_sample()
return histo,out_sample,out_likelihood
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
print('Histogram')
distribution_histogram_modular(*parameter_list[0])
# In this example a hidden Markov model with 3 states and 6 transitions is
# trained on a string data set. After calling the constructor of the HMM class,
# specifying the number of states and transitions, the model is trained. Via the
# Baum-Welch algorithm the optimal transition and emission probabilities are
# estimated. The best path, i.e. the path with the highest probability given the
# model, can then be calculated using get_best_path_state.
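# (Baum-Welch is an expectation-maximization procedure for HMM parameter
# estimation; the most probable state sequence itself is obtained with the
# Viterbi algorithm.)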
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data=lm.load_cubes('../data/fm_train_cube.dat')
parameter_list=[[data, 1, 64, 1e-5, 2, 0, False, 5], [data, 3, 6, 1e-1, 1, 0, False, 2]]
def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
from modshogun import StringWordFeatures, StringCharFeatures, CUBE
from modshogun import HMM, BW_NORMAL
charfeat=StringCharFeatures(CUBE)
charfeat.set_features(fm_cube)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
hmm=HMM(feats, N, M, pseudo)
hmm.train()
hmm.baum_welch_viterbi_train(BW_NORMAL)
num_examples=feats.get_num_vectors()
num_param=hmm.get_num_model_parameters()
for i in range(num_examples):
for j in range(num_param):
hmm.get_log_derivative(j, i)
best_path=0
best_path_state=0
for i in range(num_examples):
best_path+=hmm.best_path(i)
for j in range(N):
best_path_state+=hmm.get_best_path_state(i, j)
lik_example = hmm.get_log_likelihood()
lik_sample = hmm.get_log_likelihood_sample()
return lik_example, lik_sample, hmm
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
print('HMM')
distribution_hmm_modular(*parameter_list[0])
# Trains an inhomogeneous Markov chain of order 3 on a DNA string data set. Due
# to the structure of the Markov chain it is very similar to an HMM with just
# one chain of connected hidden states, which is why we term it a linear HMM.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
parameter_list = [[traindna,3,0,False],[traindna,4,0,False]]
def distribution_linearhmm_modular (fm_dna=traindna,order=3,gap=0,reverse=False):
from modshogun import StringWordFeatures, StringCharFeatures, DNA
from modshogun import LinearHMM
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_dna)
feats=StringWordFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, gap, reverse)
hmm=LinearHMM(feats)
hmm.train()
hmm.get_transition_probs()
num_examples=feats.get_num_vectors()
num_param=hmm.get_num_model_parameters()
for i in range(num_examples):
for j in range(num_param):
hmm.get_log_derivative(j, i)
out_likelihood = hmm.get_log_likelihood()
out_sample = hmm.get_log_likelihood_sample()
return hmm,out_likelihood ,out_sample
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
print('LinearHMM')
distribution_linearhmm_modular(*parameter_list[0])
# In this example the usage of the Positional PWM distribution is shown.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
parameter_list = [[traindna,3],[traindna,4]]
def distribution_ppwm_modular (fm_dna=traindna, order=3):
from modshogun import StringByteFeatures, StringCharFeatures, DNA
from modshogun import PositionalPWM
from numpy import array,e,log,exp
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_dna)
feats=StringByteFeatures(charfeat.get_alphabet())
feats.obtain_from_char(charfeat, order-1, order, 0, False)
L=20
k=3
sigma = 1
mu = 4
ppwm=PositionalPWM()
ppwm.set_sigma(sigma)
ppwm.set_mean(mu)
# note: this first PWM is unused and immediately overwritten below; it is kept
# commented out for reference
#pwm=array([[0.0, 0.5, 0.1, 1.0],
#           [0.0, 0.5, 0.5, 0.0],
#           [1.0, 0.0, 0.4, 0.0],
#           [0.0, 0.0, 0.0, 0.0]])
pwm=array([[0.01,0.09,0.1],[0.09,0.01,0.1],[0.85,0.4,0.1],[0.05,0.5,0.7]])
ppwm.set_pwm(log(pwm))
#print(ppwm.get_pwm())
ppwm.compute_w(L)
w=ppwm.get_w()
#print(w)
#from pylab import *
#figure(1)
#pcolor(exp(w))
#pcolor(w)
#colorbar()
#figure(2)
ppwm.compute_scoring(1)
u=ppwm.get_scoring(0)
#pcolor(exp(u))
#show()
#ppwm=PositionalPWM(feats)
#ppwm.train()
#out_likelihood = histo.get_log_likelihood()
#out_sample = histo.get_log_likelihood_sample()
return w,u
###########################################################################
# call functions
###########################################################################
if __name__=='__main__':
print('PositionalPWM')
distribution_ppwm_modular(*parameter_list[0])
# Example of how to evaluate clustering performance (given ground truth)
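# Since cluster indices are arbitrary, ClusteringAccuracy first finds the best
# one-to-one mapping between predicted clusters and ground-truth classes
# (via best_map) before computing the accuracy.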
#!/usr/bin/env python
def get_dataset():
from os.path import exists
filename = "../../../data/uci/optdigits/optdigits.tes"
if exists(filename):
return open(filename)
else:
# print("Retrieving data...")
try:
from urllib2 import urlopen
except ImportError:
from urllib.request import urlopen
return urlopen("http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes")
def prepare_data():
from numpy import loadtxt
stream = get_dataset()
# print("Loading data...")
data = loadtxt(stream, delimiter=',')
fea = data[:, :-1]
gnd = data[:, -1]
return (fea.T, gnd)
(fea, gnd_raw) = prepare_data()
parameter_list = [[fea, gnd_raw, 10]]
def run_clustering(data, k):
from modshogun import KMeans
from modshogun import EuclideanDistance
from modshogun import RealFeatures
fea = RealFeatures(data)
distance = EuclideanDistance(fea, fea)
kmeans=KMeans(k, distance)
# print("Running clustering...")
kmeans.train()
return kmeans.get_cluster_centers()
def assign_labels(data, centroids, ncenters):
from modshogun import EuclideanDistance
from modshogun import RealFeatures, MulticlassLabels
from modshogun import KNN
from numpy import arange
labels = MulticlassLabels(arange(0.,ncenters))
fea = RealFeatures(data)
fea_centroids = RealFeatures(centroids)
distance = EuclideanDistance(fea_centroids, fea_centroids)
knn = KNN(1, distance, labels)
knn.train()
return knn.apply(fea)
def evaluation_clustering (features=fea, ground_truth=gnd_raw, ncenters=10):
from modshogun import ClusteringAccuracy, ClusteringMutualInformation
from modshogun import MulticlassLabels
from modshogun import Math
# reproducible results
Math.init_random(1)
centroids = run_clustering(features, ncenters)
gnd_hat = assign_labels(features, centroids, ncenters)
gnd = MulticlassLabels(ground_truth)
AccuracyEval = ClusteringAccuracy()
AccuracyEval.best_map(gnd_hat, gnd)
accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
#print(('Clustering accuracy = %.4f' % accuracy))
MIEval = ClusteringMutualInformation()
mutual_info = MIEval.evaluate(gnd_hat, gnd)
#print(('Clustering mutual information = %.4f' % mutual_info))
# TODO mutual information does not work with serialization
#return gnd, gnd_hat, accuracy, MIEval, mutual_info
return gnd, gnd_hat, accuracy
if __name__ == '__main__':
print('Evaluation Clustering')
evaluation_clustering(*parameter_list[0])
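# Example of evaluating clustering on streamed Gaussian blob data, where an
# approximate ground truth is known by construction.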
#!/usr/bin/env python
parameter_list = [[1000,2,8],[1000,4,8]]
from numpy import *
#from pylab import *
def run_clustering(data, k):
from modshogun import KMeans
from modshogun import Math_init_random
from modshogun import EuclideanDistance
from modshogun import RealFeatures
fea = RealFeatures(data)
distance = EuclideanDistance(fea, fea)
kmeans=KMeans(k, distance)
#print("Running clustering...")
kmeans.train()
return kmeans.get_cluster_centers()
def assign_labels(data, centroids, ncenters):
from modshogun import EuclideanDistance
from modshogun import RealFeatures, MulticlassLabels
from modshogun import KNN
from numpy import arange
labels = MulticlassLabels(arange(0.,ncenters))
fea = RealFeatures(data)
fea_centroids = RealFeatures(centroids)
distance = EuclideanDistance(fea_centroids, fea_centroids)
knn = KNN(1, distance, labels)
knn.train()
return knn.apply(fea)
def evaluation_clustering_simple (n_data=100, sqrt_num_blobs=4, distance=5):
from modshogun import ClusteringAccuracy, ClusteringMutualInformation
from modshogun import MulticlassLabels, GaussianBlobsDataGenerator
from modshogun import Math
# reproducible results
Math.init_random(1)
# produce some Gaussian blobs to cluster
ncenters=sqrt_num_blobs**2
stretch=1
angle=1
gen=GaussianBlobsDataGenerator(sqrt_num_blobs, distance, stretch, angle)
features=gen.get_streamed_features(n_data)
X=features.get_feature_matrix()
# compute approximate "ground truth" labels by taking the closest blob mean
coords=array(range(0,sqrt_num_blobs*distance,distance))
idx_0=[abs(coords -x).argmin() for x in X[0]]
idx_1=[abs(coords -x).argmin() for x in X[1]]
ground_truth=array([idx_0[i]*sqrt_num_blobs + idx_1[i] for i in range(n_data)], dtype="float64")
#for label in unique(ground_truth):
# indices=ground_truth==label
# plot(X[0][indices], X[1][indices], 'o')
#show()
centroids = run_clustering(features, ncenters)
gnd_hat = assign_labels(features, centroids, ncenters)
gnd = MulticlassLabels(ground_truth)
AccuracyEval = ClusteringAccuracy()
AccuracyEval.best_map(gnd_hat, gnd)
accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
# in this case we know that the clustering has to be very good
#print(('Clustering accuracy = %.4f' % accuracy))
assert(accuracy>0.8)
MIEval = ClusteringMutualInformation()
mutual_info = MIEval.evaluate(gnd_hat, gnd)
#print(('Clustering mutual information = %.4f' % mutual_info))
return gnd, accuracy, mutual_info
if __name__ == '__main__':
print('Evaluation Clustering')
evaluation_clustering_simple(*parameter_list[0])
# In this example various measures (accuracy, error rate, ...) are computed
# for a pair of ground-truth toy labels and random predictions.
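# All of these measures derive from the 2x2 contingency table of true/false
# positives/negatives, e.g. accuracy = (TP+TN)/N, recall = TP/(TP+FN) and
# precision = TP/(TP+FP).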
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))
parameter_list = [[ground_truth,predicted]]
def evaluation_contingencytableevaluation_modular (ground_truth, predicted):
from modshogun import BinaryLabels
from modshogun import ContingencyTableEvaluation
from modshogun import AccuracyMeasure,ErrorRateMeasure,BALMeasure
from modshogun import WRACCMeasure,F1Measure,CrossCorrelationMeasure
from modshogun import RecallMeasure,PrecisionMeasure,SpecificityMeasure
ground_truth_labels = BinaryLabels(ground_truth)
predicted_labels = BinaryLabels(predicted)
base_evaluator = ContingencyTableEvaluation()
base_evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = AccuracyMeasure()
accuracy = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = ErrorRateMeasure()
errorrate = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = BALMeasure()
bal = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = WRACCMeasure()
wracc = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = F1Measure()
f1 = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = CrossCorrelationMeasure()
crosscorrelation = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = RecallMeasure()
recall = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = PrecisionMeasure()
precision = evaluator.evaluate(predicted_labels,ground_truth_labels)
evaluator = SpecificityMeasure()
specificity = evaluator.evaluate(predicted_labels,ground_truth_labels)
return accuracy, errorrate, bal, wracc, f1, crosscorrelation, recall, precision, specificity
if __name__=='__main__':
print('EvaluationContingencyTableEvaluation')
evaluation_contingencytableevaluation_modular(*parameter_list[0])
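# This example shows how to cross-validate a LibLinear binary classifier using
# stratified splitting and accuracy as the evaluation criterion, repeating the
# procedure 10 times.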
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
from numpy.random import randn
from numpy import *
# generate some overlapping training vectors
num_vectors=100
vec_distance=1
traindat=concatenate((randn(2,num_vectors)-vec_distance,
randn(2,num_vectors)+vec_distance), axis=1)
label_traindat=concatenate((-ones(num_vectors), ones(num_vectors)));
parameter_list = [[traindat,label_traindat]]
def evaluation_cross_validation_classification (traindat=traindat, label_traindat=label_traindat):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import ContingencyTableEvaluation, ACCURACY
from modshogun import StratifiedCrossValidationSplitting
from modshogun import BinaryLabels
from modshogun import RealFeatures
from modshogun import LibLinear, L2R_L2LOSS_SVC
# training data
features=RealFeatures(traindat)
labels=BinaryLabels(label_traindat)
# classifier
classifier=LibLinear(L2R_L2LOSS_SVC)
# splitting strategy for 5-fold cross-validation (for classification it is
# better to use "StratifiedCrossValidationSplitting", but the standard
# "CrossValidationSplitting" is also available)
splitting_strategy=StratifiedCrossValidationSplitting(labels, 5)
# evaluation method
evaluation_criterium=ContingencyTableEvaluation(ACCURACY)
# cross-validation instance
cross_validation=CrossValidation(classifier, features, labels,
splitting_strategy, evaluation_criterium)
cross_validation.set_autolock(False)
# (optional) repeat x-val 10 times
cross_validation.set_num_runs(10)
# perform cross-validation and print results
result=cross_validation.evaluate()
#print("mean:", result.mean)
if __name__=='__main__':
print('Evaluation CrossValidationClassification')
evaluation_cross_validation_classification(*parameter_list[0])
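# This example shows how to record the MKL kernel weights of every
# cross-validation fold via CrossValidationMKLStorage.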
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
from numpy.random import randn
from numpy import *
# generate some overlapping training vectors
num_vectors=5
vec_distance=1
traindat=concatenate((randn(2,num_vectors)-vec_distance,
randn(2,num_vectors)+vec_distance), axis=1)
label_traindat=concatenate((-ones(num_vectors), ones(num_vectors)));
parameter_list = [[traindat,label_traindat]]
def evaluation_cross_validation_mkl_weight_storage(traindat=traindat, label_traindat=label_traindat):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import CrossValidationPrintOutput
from modshogun import CrossValidationMKLStorage
from modshogun import ContingencyTableEvaluation, ACCURACY
from modshogun import StratifiedCrossValidationSplitting
from modshogun import BinaryLabels
from modshogun import RealFeatures, CombinedFeatures
from modshogun import GaussianKernel, CombinedKernel
from modshogun import LibSVM, MKLClassification
# training data, combined features all on same data
features=RealFeatures(traindat)
comb_features=CombinedFeatures()
comb_features.append_feature_obj(features)
comb_features.append_feature_obj(features)
comb_features.append_feature_obj(features)
labels=BinaryLabels(label_traindat)
# kernel, different Gaussians combined
kernel=CombinedKernel()
kernel.append_kernel(GaussianKernel(10, 0.1))
kernel.append_kernel(GaussianKernel(10, 1))
kernel.append_kernel(GaussianKernel(10, 2))
# create MKL using LibSVM (due to a memory bug, interleaved optimization is
# not possible)
svm=MKLClassification(LibSVM())
svm.set_interleaved_optimization_enabled(False)
svm.set_kernel(kernel)
# splitting strategy for 5-fold cross-validation (for classification it is
# better to use "StratifiedCrossValidationSplitting", but the standard
# "CrossValidationSplitting" is also available)
splitting_strategy=StratifiedCrossValidationSplitting(labels, 5)
# evaluation method
evaluation_criterium=ContingencyTableEvaluation(ACCURACY)
# cross-validation instance
cross_validation=CrossValidation(svm, comb_features, labels,
splitting_strategy, evaluation_criterium)
cross_validation.set_autolock(False)
# append cross-validation output classes
#cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
mkl_storage=CrossValidationMKLStorage()
cross_validation.add_cross_validation_output(mkl_storage)
cross_validation.set_num_runs(3)
# perform cross-validation
result=cross_validation.evaluate()
# print mkl weights
weights=mkl_storage.get_mkl_weights()
#print "mkl weights during cross--validation"
#print weights
if __name__=='__main__':
print('Evaluation CrossValidationClassification')
evaluation_cross_validation_mkl_weight_storage(*parameter_list[0])
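# This example shows how to store per-fold multiclass ROC curves and binary
# evaluation results (here F1) via CrossValidationMulticlassStorage.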
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
from numpy.random import randn, seed
from numpy import *
# generate some overlapping training vectors
seed(1)
num_vectors=7
vec_distance=1
traindat=concatenate((randn(2,num_vectors)-vec_distance,
randn(2,num_vectors)+vec_distance), axis=1)
label_traindat=concatenate((zeros(num_vectors), ones(num_vectors)));
parameter_list = [[traindat,label_traindat]]
def evaluation_cross_validation_multiclass_storage (traindat=traindat, label_traindat=label_traindat):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import CrossValidationPrintOutput
from modshogun import CrossValidationMKLStorage, CrossValidationMulticlassStorage
from modshogun import MulticlassAccuracy, F1Measure
from modshogun import StratifiedCrossValidationSplitting
from modshogun import MulticlassLabels
from modshogun import RealFeatures, CombinedFeatures
from modshogun import GaussianKernel, CombinedKernel
from modshogun import MKLMulticlass
from modshogun import Statistics, MSG_DEBUG, Math
Math.init_random(1)
# training data, combined features all on same data
features=RealFeatures(traindat)
comb_features=CombinedFeatures()
comb_features.append_feature_obj(features)
comb_features.append_feature_obj(features)
comb_features.append_feature_obj(features)
labels=MulticlassLabels(label_traindat)
# kernel, different Gaussians combined
kernel=CombinedKernel()
kernel.append_kernel(GaussianKernel(10, 0.1))
kernel.append_kernel(GaussianKernel(10, 1))
kernel.append_kernel(GaussianKernel(10, 2))
# create multiclass MKL with the combined kernel
svm=MKLMulticlass(1.0,kernel,labels)
svm.set_kernel(kernel)
# splitting strategy for cross-validation (for classification it is
# better to use "StratifiedCrossValidationSplitting", but the standard
# "CrossValidationSplitting" is also available)
splitting_strategy=StratifiedCrossValidationSplitting(labels, 3)
# evaluation method
evaluation_criterium=MulticlassAccuracy()
# cross-validation instance
cross_validation=CrossValidation(svm, comb_features, labels,
splitting_strategy, evaluation_criterium)
cross_validation.set_autolock(False)
# append cross-validation output classes
#cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
#mkl_storage=CrossValidationMKLStorage()
#cross_validation.add_cross_validation_output(mkl_storage)
multiclass_storage=CrossValidationMulticlassStorage()
multiclass_storage.append_binary_evaluation(F1Measure())
cross_validation.add_cross_validation_output(multiclass_storage)
cross_validation.set_num_runs(3)
# perform cross-validation
result=cross_validation.evaluate()
roc_0_0_0 = multiclass_storage.get_fold_ROC(0,0,0)
#print roc_0_0_0
auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0,0,0,0)
#print auc_0_0_0
return roc_0_0_0, auc_0_0_0
if __name__=='__main__':
print('Evaluation CrossValidationMulticlassStorage')
evaluation_cross_validation_multiclass_storage(*parameter_list[0])
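# This example shows how to cross-validate kernel ridge regression using the
# mean squared error as the evaluation criterion.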
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
traindat = '../data/fm_train_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,label_traindat,0.8,1e-6],[traindat,label_traindat,0.9,1e-7]]
def evaluation_cross_validation_regression (train_fname=traindat,label_fname=label_traindat,width=0.8,tau=1e-6):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import MeanSquaredError, CrossValidationSplitting
from modshogun import RegressionLabels, RealFeatures
from modshogun import GaussianKernel, KernelRidgeRegression, CSVFile
# training data
features=RealFeatures(CSVFile(train_fname))
labels=RegressionLabels(CSVFile(label_fname))
# kernel and predictor
kernel=GaussianKernel()
predictor=KernelRidgeRegression(tau, kernel, labels)
# splitting strategy for 5-fold cross-validation (for classification it is
# better to use "StratifiedCrossValidationSplitting", but here the standard
# splitting is used)
splitting_strategy=CrossValidationSplitting(labels, 5)
# evaluation method
evaluation_criterium=MeanSquaredError()
# cross-validation instance
cross_validation=CrossValidation(predictor, features, labels,
splitting_strategy, evaluation_criterium)
# (optional) repeat x-val 10 times
cross_validation.set_num_runs(10)
# (optional) tell the machine to precompute the kernel matrix
# (speeds things up, but may not work in all cases)
predictor.data_lock(labels, features)
# perform cross-validation and print results
result=cross_validation.evaluate()
#print("mean:", result.mean)
if __name__=='__main__':
print('Evaluation CrossValidationClassification')
evaluation_cross_validation_regression(*parameter_list[0])
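# This example implements a custom binary evaluation measure in Python by
# subclassing DirectorContingencyTableEvaluation (requires shogun built with
# swig directors); the custom score combines WRACC and BAL.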
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))
parameter_list = [[ground_truth,predicted]]
def evaluation_director_contingencytableevaluation_modular (ground_truth, predicted):
try:
from modshogun import DirectorContingencyTableEvaluation, ED_MAXIMIZE
except ImportError:
print("recompile shogun with --enable-swig-directors")
return
class SimpleWeightedBinaryEvaluator(DirectorContingencyTableEvaluation):
def __init__(self):
DirectorContingencyTableEvaluation.__init__(self)
def get_custom_direction(self):
return ED_MAXIMIZE
def get_custom_score(self):
return self.get_WRACC()+self.get_BAL()
from modshogun import BinaryLabels
evaluator = SimpleWeightedBinaryEvaluator()
r = evaluator.evaluate(BinaryLabels(ground_truth), BinaryLabels(predicted))
r2 = evaluator.get_custom_score()
print(r,r2)
return r,r2
if __name__=='__main__':
print('EvaluationDirectorContingencyTableEvaluation')
evaluation_director_contingencytableevaluation_modular(*parameter_list[0])
# In this example the mean squared error (MSE) is computed
# for a pair of random vectors of length N.
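# For reference, MSE = (1/N) * sum_i (predicted_i - ground_truth_i)^2.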
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
N = 100
random.seed(17)
ground_truth = random.randn(N)
predicted = random.randn(N)
parameter_list = [[ground_truth,predicted]]
def evaluation_meansquarederror_modular (ground_truth, predicted):
from modshogun import RegressionLabels
from modshogun import MeanSquaredError
ground_truth_labels = RegressionLabels(ground_truth)
predicted_labels = RegressionLabels(predicted)
evaluator = MeanSquaredError()
mse = evaluator.evaluate(predicted_labels,ground_truth_labels)
return mse
if __name__=='__main__':
print('MeanSquaredError')
evaluation_meansquarederror_modular(*parameter_list[0])
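# In this example the mean squared log error (MSLE) is computed for a pair of
# non-negative random vectors of length N (the absolute values below ensure
# the logarithms involved are well defined).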
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
N = 100
random.seed(17)
ground_truth = abs(random.randn(N))
predicted = abs(random.randn(N))
parameter_list = [[ground_truth,predicted]]
def evaluation_meansquaredlogerror_modular (ground_truth, predicted):
from modshogun import RegressionLabels
from modshogun import MeanSquaredLogError
ground_truth_labels = RegressionLabels(ground_truth)
predicted_labels = RegressionLabels(predicted)
evaluator = MeanSquaredLogError()
mse = evaluator.evaluate(predicted_labels,ground_truth_labels)
return mse
if __name__=='__main__':
print('EvaluationMeanSquaredLogError')
evaluation_meansquaredlogerror_modular(*parameter_list[0])
# In this example the multiclass accuracy is computed for toy data labels and
# the same labels multiplied by two.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
random.seed(17)
ground_truth = lm.load_labels('../data/label_train_multiclass.dat')
predicted = lm.load_labels('../data/label_train_multiclass.dat') * 2
parameter_list = [[ground_truth,predicted]]
def evaluation_multiclassaccuracy_modular (ground_truth, predicted):
from modshogun import MulticlassLabels
from modshogun import MulticlassAccuracy
ground_truth_labels = MulticlassLabels(ground_truth)
predicted_labels = MulticlassLabels(predicted)
evaluator = MulticlassAccuracy()
accuracy = evaluator.evaluate(predicted_labels,ground_truth_labels)
return accuracy
if __name__=='__main__':
print('MulticlassAccuracy')
evaluation_multiclassaccuracy_modular(*parameter_list[0])
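# This example trains a multiclass LibLinear machine and evaluates it with
# MulticlassOVREvaluation, which averages a binary measure (here ROC and
# accuracy) over all one-vs-rest subproblems.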
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
parameter_list = [[traindat, label_traindat]]
def evaluation_multiclassovrevaluation_modular(train_fname=traindat, label_fname=label_traindat):
from modshogun import MulticlassOVREvaluation,ROCEvaluation
from modshogun import MulticlassLibLinear,RealFeatures,ContingencyTableEvaluation,ACCURACY
from modshogun import MulticlassLabels, Math, CSVFile
Math.init_random(1)
ground_truth_labels = MulticlassLabels(CSVFile(label_fname))
svm = MulticlassLibLinear(1.0,RealFeatures(CSVFile(train_fname)),ground_truth_labels)
svm.parallel.set_num_threads(1)
svm.train()
predicted_labels = svm.apply()
binary_evaluator = ROCEvaluation()
evaluator = MulticlassOVREvaluation(binary_evaluator)
mean_roc = evaluator.evaluate(predicted_labels,ground_truth_labels)
#print mean_roc
binary_evaluator = ContingencyTableEvaluation(ACCURACY)
evaluator = MulticlassOVREvaluation(binary_evaluator)
mean_accuracy = evaluator.evaluate(predicted_labels,ground_truth_labels)
#print mean_accuracy
return mean_roc, mean_accuracy, predicted_labels, svm
if __name__=='__main__':
print('MulticlassOVREvaluation')
evaluation_multiclassovrevaluation_modular(*parameter_list[0])
# In this example the PRC (precision-recall curve) is computed
# for a pair of ground-truth toy labels and random predictions.
# The PRC curve (as a matrix) and the auPRC (area under the PRC) are returned.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))
parameter_list = [[ground_truth,predicted]]
def evaluation_prcevaluation_modular (ground_truth, predicted):
from modshogun import BinaryLabels
from modshogun import PRCEvaluation
ground_truth_labels = BinaryLabels(ground_truth)
predicted_labels = BinaryLabels(predicted)
evaluator = PRCEvaluation()
evaluator.evaluate(predicted_labels,ground_truth_labels)
return evaluator.get_PRC(), evaluator.get_auPRC()
if __name__=='__main__':
print('PRCEvaluation')
evaluation_prcevaluation_modular(*parameter_list[0])
# In this example the ROC (receiver operating characteristic) curve is computed
# for a pair of ground-truth toy labels and random predictions.
# The ROC curve (as a matrix) and the auROC (area under the ROC) are returned.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()
ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))
parameter_list = [[ground_truth,predicted]]
def evaluation_rocevaluation_modular (ground_truth, predicted):
from modshogun import BinaryLabels
from modshogun import ROCEvaluation
ground_truth_labels = BinaryLabels(ground_truth)
predicted_labels = BinaryLabels(predicted)
evaluator = ROCEvaluation()
evaluator.evaluate(predicted_labels,ground_truth_labels)
return evaluator.get_ROC(), evaluator.get_auROC()
if __name__=='__main__':
print('ROCEvaluation')
evaluation_rocevaluation_modular(*parameter_list[0])
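# This example relates ROC thresholds to the empirical true/false positive
# rates: for a given index on the ROC curve, applying the corresponding
# threshold to the outputs reproduces tp[index] and fp[index].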
#!/usr/bin/env python
parameter_list = [[1000]]
def evaluation_thresholds_modular (index):
from modshogun import BinaryLabels, ROCEvaluation
import numpy
numpy.random.seed(17)
output=numpy.arange(-1,1,0.001)
output=(0.3*output+0.7*(numpy.random.rand(len(output))-0.5))
label=[-1.0]*(len(output)//2)
label.extend([1.0]*(len(output)//2))
label=numpy.array(label)
pred=BinaryLabels(output)
truth=BinaryLabels(label)
evaluator=ROCEvaluation()
evaluator.evaluate(pred, truth)
[fp,tp]=evaluator.get_ROC()
thresh=evaluator.get_thresholds()
b=thresh[index]
#print("tpr", numpy.mean(output[label>0]>b), tp[index])
#print("fpr", numpy.mean(output[label<0]>b), fp[index])
return tp[index],fp[index],numpy.mean(output[label>0]>b),numpy.mean(output[label<0]>b)
if __name__=='__main__':
print('Evaluation with Thresholds')
evaluation_thresholds_modular(*parameter_list[0])
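# This example demonstrates BinnedDotFeatures, which represents each input
# dimension by indicators over the supplied bins; set_fill and set_norm_one
# toggle the filled and length-normalized variants, respectively.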
#!/usr/bin/env python
import numpy
matrix=numpy.array([[-1.0,0,1],[2,3,4],[5,6,7]])
bins=numpy.array([[0.0, 0.0, 0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0,3.0],[4.0,4.0,4.0]])
parameter_list = [(matrix,bins)]
def features_binned_dot_modular (matrix, bins):
from modshogun import RealFeatures, BinnedDotFeatures
rf=RealFeatures(matrix)
#print(rf.get_feature_matrix())
bf=BinnedDotFeatures(rf, bins)
filled=bf.get_computed_dot_feature_matrix()
bf.set_fill(False)
unfilled=bf.get_computed_dot_feature_matrix()
bf.set_norm_one(True)
unfilled_normed=bf.get_computed_dot_feature_matrix()
bf.set_fill(True)
filled_normed=bf.get_computed_dot_feature_matrix()
return bf,filled,unfilled,unfilled_normed,filled_normed
if __name__=='__main__':
print('BinnedDotFeatures')
features_binned_dot_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
# create dense matrix A
A=numpy.array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=numpy.uint8)
parameter_list=[[A]]
def features_dense_byte_modular (A):
from modshogun import ByteFeatures
# create dense features a
# ... of type Byte
a=ByteFeatures(A)
# print some statistics about a
#print(a.get_num_vectors())
#print(a.get_num_features())
# get first feature vector and set it
#print(a.get_feature_vector(0))
a.set_feature_vector(numpy.array([1,4,0,0,0,9], dtype=numpy.uint8), 0)
# get matrix
a_out = a.get_feature_matrix()
#print(type(a_out), a_out.dtype)
#print(a_out )
assert(numpy.all(a_out==A))
return a_out,a
if __name__=='__main__':
print('ByteFeatures')
features_dense_byte_modular(*parameter_list[0])
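# This example loads dense real-valued features from a CSV file, using a
# space character as the column delimiter.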
#!/usr/bin/env python
parameter_list=[[]]
def features_dense_io_modular():
from modshogun import RealFeatures, CSVFile
feats=RealFeatures()
f=CSVFile("../data/fm_train_real.dat","r")
f.set_delimiter(" ")
feats.load(f)
return feats
if __name__=='__main__':
print('Dense Real Features IO')
features_dense_io_modular(*parameter_list[0])
#!/usr/bin/env python
from modshogun import LongIntFeatures
from numpy import array, int64, all
# create dense matrix A
matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
parameter_list = [[matrix]]
# ... of type LongInt
def features_dense_longint_modular (A=matrix):
a=LongIntFeatures(A)
# get first feature vector and set it
a.set_feature_vector(array([1,4,0,0,0,9], dtype=int64), 0)
# get matrix
a_out = a.get_feature_matrix()
assert(all(a_out==A))
return a_out
if __name__=='__main__':
print('dense_longint')
features_dense_longint_modular(*parameter_list[0])
#!/usr/bin/env python
from modshogun import RealFeatures, LongIntFeatures, ByteFeatures
from numpy import array, float64, int64, uint8, all
# create dense matrices A,B,C
matrixA=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
matrixB=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
matrixC=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8)
# ... of type Real, LongInt and Byte
parameter_list = [[matrixA,matrixB,matrixC]]
def features_dense_modular (A=matrixA,B=matrixB,C=matrixC):
a=RealFeatures(A)
b=LongIntFeatures(B)
c=ByteFeatures(C)
# or 16bit wide ...
#feat1 = f.ShortFeatures(N.zeros((10,5),N.short))
#feat2 = f.WordFeatures(N.zeros((10,5),N.uint16))
# print some statistics about a
# get first feature vector and set it
a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)
# get matrices
a_out = a.get_feature_matrix()
b_out = b.get_feature_matrix()
c_out = c.get_feature_matrix()
assert(all(a_out==A))
assert(all(b_out==B))
assert(all(c_out==C))
return a_out,b_out,c_out,a,b,c
if __name__=='__main__':
print('dense')
features_dense_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
from modshogun import RealFeatures
from modshogun import LongIntFeatures
from numpy import array, float64, int64
# create dense matrix
data=[[1,2,3],[4,5,6],[7,8,9],[-1,-2,-3]]
parameter_list = [[data]]
def features_dense_protocols_modular (in_data=data):
m_real=array(in_data, dtype=float64, order='F')
f_real=RealFeatures(m_real)
#print m_real
#print f_real
#print f_real[-1]
#print f_real[1, 2]
#print f_real[-1:3]
#print f_real[2, 0:2]
#print f_real[0:3, 1]
#print f_real[0:3, 1:2]
#print f_real[:,1]
#print f_real[1,:]
#print m_real[-2]
f_real[-1]=m_real[-2]
#print f_real[-1]
#print m_real[0, 1]
f_real[1,2]=m_real[0,1]
#print f_real[1, 2]
#print m_real[0:2]
f_real[1:3]=m_real[0:2]
#print f_real[1:3]
#print m_real[0, 0:2]
f_real[2, 0:2]=m_real[0,0:2]
#print f_real[2, 0:2]
#print m_real[0:3, 2]
f_real[0:3,1]=m_real[0:3, 2]
#print f_real[0:3, 1]
#print m_real[0:3, 0:1]
f_real[0:3,1:2]=m_real[0:3,0:1]
#print f_real[0:3, 1:2]
f_real[:,0]=0
#print f_real.get_feature_matrix()
if tuple(map(int, numpy.__version__.split('.')[:2])) >= (1, 5):  # avoid lexicographic version comparison
f_real+=m_real
f_real*=m_real
f_real-=m_real
else:
print("numpy version >= 1.5 is needed")
return None
f_real+=f_real
f_real*=f_real
f_real-=f_real
#print f_real
#print f_real.get_feature_matrix()
try:
mem_real=memoryview(f_real)
except NameError:
print("Python2.7 and later is needed for memoryview class")
return
ret_real=array(f_real)
#print ret_real
return f_real[:,0]
if __name__=='__main__':
print('dense_protocols')
features_dense_protocols_modular(*parameter_list[0])
#!/usr/bin/env python
from modshogun import RealFeatures
from numpy import array, float64, all
# create dense matrix
matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
parameter_list = [[matrix]]
# ... of type Real
def features_dense_real_modular (A=matrix):
# ... of type Real
a=RealFeatures(A)
# print some statistics about a
#print(a.get_num_vectors())
#print(a.get_num_features())
# get first feature vector and set it
#print(a.get_feature_vector(0))
a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)
# get matrix
a_out = a.get_feature_matrix()
assert(all(a_out==A))
return a_out
if __name__=='__main__':
print('dense_real')
features_dense_real_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
from modshogun import RealFeatures
from numpy import array, float64, int64
# create dense matrix
data=[[1,2,3],[4,5,6],[7,8,9],[-1,-2,-3]]
parameter_list = [[data]]
def features_dense_zero_copy_modular (in_data=data):
feats = None
if tuple(map(int, numpy.__version__.split('.')[:2])) >= (1, 5):  # avoid lexicographic version comparison
feats=numpy.array(in_data, dtype=float64, order='F')
a=RealFeatures()
a.frombuffer(feats, False)
b=numpy.array(a, copy=False)
c=numpy.array(a, copy=True)
d=RealFeatures()
d.frombuffer(a, False)
e=RealFeatures()
e.frombuffer(a, True)
a[:,0]=0
#print a[0:4]
#print b[0:4]
#print c[0:4]
#print d[0:4]
#print e[0:4]
else:
print("numpy version >= 1.5 is needed")
return feats
if __name__=='__main__':
print('dense_zero_copy')
features_dense_zero_copy_modular(*parameter_list[0])
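# The zero-copy behaviour above mirrors plain numpy view semantics. The
# numpy-only sketch below shows the same effect: copy=False shares memory
# with the source array, while copy=True creates independent storage.
import numpy

m = numpy.ones((3, 4), dtype=numpy.float64)
view = numpy.array(m, copy=False)   # shares memory with m
dup = numpy.array(m, copy=True)     # independent storage
m[:, 0] = 0
assert view[0, 0] == 0.0            # the view reflects the change
assert dup[0, 0] == 1.0             # the copy does not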
#!/usr/bin/env python
import numpy
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,0.9,1e-3],[traindat,testdat,label_traindat,0.8,1e-2]]
def features_director_dot_modular (fm_train_real, fm_test_real,
label_train_twoclass, C, epsilon):
try:
from modshogun import DirectorDotFeatures
from modshogun import RealVector
except ImportError:
print("recompile shogun with --enable-swig-directors")
return
class NumpyFeatures(DirectorDotFeatures):
# variables
data=numpy.empty((1,1))
# constructor
def __init__(self, d):
DirectorDotFeatures.__init__(self)
self.data = d
# overloaded methods
def add_to_dense_sgvec(self, alpha, vec_idx1, vec2, abs):
if abs:
vec2+=alpha*numpy.abs(self.data[:,vec_idx1])
else:
vec2+=alpha*self.data[:,vec_idx1]
def dot(self, vec_idx1, df, vec_idx2):
return numpy.dot(self.data[:,vec_idx1], df.get_computed_dot_feature_vector(vec_idx2))
def dense_dot_sgvec(self, vec_idx1, vec2):
return numpy.dot(self.data[:,vec_idx1], vec2[0:vec2.vlen])
def get_num_vectors(self):
return self.data.shape[1]
def get_dim_feature_space(self):
return self.data.shape[0]
# operators
# def __add__(self, other):
# return NumpyFeatures(self.data+other.data)
# def __sub__(self, other):
# return NumpyFeatures(self.data-other.data)
# def __iadd__(self, other):
# return NumpyFeatures(self.data+other.data)
# def __isub__(self, other):
# return NumpyFeatures(self.data-other.data)
#from modshogun import RealFeatures, SparseRealFeatures, BinaryLabels
#from modshogun import LibLinear, L2R_L2LOSS_SVC_DUAL
#from modshogun import Math_init_random
#Math_init_random(17)
#feats_train=RealFeatures(fm_train_real)
#feats_test=RealFeatures(fm_test_real)
#labels=BinaryLabels(label_train_twoclass)
#dfeats_train=NumpyFeatures(fm_train_real)
#dfeats_test=NumpyFeatures(fm_test_real)
#dlabels=BinaryLabels(label_train_twoclass)
#print feats_train.get_computed_dot_feature_matrix()
#print dfeats_train.get_computed_dot_feature_matrix()
#svm=LibLinear(C, feats_train, labels)
#svm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)
#svm.set_epsilon(epsilon)
#svm.set_bias_enabled(True)
#svm.train()
#svm.set_features(feats_test)
#svm.apply().get_labels()
#predictions = svm.apply()
#dfeats_train.__disown__()
#dfeats_train.parallel.set_num_threads(1)
#dsvm=LibLinear(C, dfeats_train, dlabels)
#dsvm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)
#dsvm.set_epsilon(epsilon)
#dsvm.set_bias_enabled(True)
#dsvm.train()
#dfeats_test.__disown__()
#dfeats_test.parallel.set_num_threads(1)
#dsvm.set_features(dfeats_test)
#dsvm.apply().get_labels()
#dpredictions = dsvm.apply()
#return predictions, svm, predictions.get_labels()
if __name__=='__main__':
print('DirectorLinear')
features_director_dot_modular(*parameter_list[0])
#!/usr/bin/env python
strings=['hey','guys','i','am','a','string']
parameter_list=[[strings]]
def features_hasheddocdot_modular(strings):
from modshogun import StringCharFeatures, RAWBYTE
from modshogun import HashedDocDotFeatures
from modshogun import NGramTokenizer
from numpy import array
#create string features
f=StringCharFeatures(strings, RAWBYTE)
#set the number of bits of the target dimension
#means a dim of size 2^5=32
num_bits=5
#create the ngram tokenizer of size 8 to parse the strings
tokenizer=NGramTokenizer(8)
#normalize results
normalize=True
#create HashedDocDot features
hddf=HashedDocDotFeatures(num_bits, f, tokenizer, normalize)
#should expect 32
#print('Feature space dimensionality is', hddf.get_dim_feature_space())
#print('Self dot product of string 0', hddf.dot(0, hddf, 0))
return hddf
if __name__=='__main__':
print('HashedDocDotFeatures')
features_hasheddocdot_modular(*parameter_list[0])
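# The hashing trick behind HashedDocDotFeatures can be sketched in plain
# Python: every n-gram is hashed into one of 2^num_bits buckets, and the
# bucket counts form the implicit feature vector. This uses Python's built-in
# hash only to show the idea; it is not Shogun's internal hash function.
import numpy

def hashed_ngram_counts(s, n=8, num_bits=5):
    dim = 2 ** num_bits                       # target dimensionality (32 here)
    vec = numpy.zeros(dim)
    for i in range(max(len(s) - n + 1, 1)):
        vec[hash(s[i:i + n]) % dim] += 1.0    # hash the n-gram into a bucket
    return vec

# dot product of two documents in the hashed feature space:
# numpy.dot(hashed_ngram_counts('hey'), hashed_ngram_counts('guys'))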
# This example shows how to read and write plain ASCII files, binary files and
# HDF5 datasets.
#
# For ASCII files it shows how to obtain shogun's RealFeatures
# (a simple feature matrix of doubles with 1 column == 1 example, nr_columns ==
# number of examples) and also sparse features in SVM light format.
#
# Binary files use a custom native format, and datasets can be read from and
# written to HDF5 files under an arbitrary group/path.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data=lm.load_numbers('../data/fm_train_real.dat')
label=lm.load_numbers('../data/label_train_twoclass.dat')
parameter_list=[[data,label]]
def features_io_modular (fm_train_real, label_train_twoclass):
import numpy
from modshogun import SparseRealFeatures, RealFeatures, MulticlassLabels
from modshogun import GaussianKernel
from modshogun import LibSVMFile, CSVFile, BinaryFile, HDF5File
feats=SparseRealFeatures(fm_train_real)
feats2=SparseRealFeatures()
f=BinaryFile("tmp/fm_train_sparsereal.bin","w")
feats.save(f)
f=LibSVMFile("tmp/fm_train_sparsereal.ascii","w")
feats.save(f)
f=BinaryFile("tmp/fm_train_sparsereal.bin")
feats2.load(f)
f=LibSVMFile("tmp/fm_train_sparsereal.ascii")
feats2.load(f)
feats=RealFeatures(fm_train_real)
feats2=RealFeatures()
f=BinaryFile("tmp/fm_train_real.bin","w")
feats.save(f)
f=HDF5File("tmp/fm_train_real.h5","w", "/data/doubles")
feats.save(f)
f=CSVFile("tmp/fm_train_real.ascii","w")
feats.save(f)
f=BinaryFile("tmp/fm_train_real.bin")
feats2.load(f)
#print("diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))
f=CSVFile("tmp/fm_train_real.ascii")
feats2.load(f)
#print("diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))
lab=MulticlassLabels(numpy.array([0.0,1.0,2.0,3.0]))
lab2=MulticlassLabels()
f=CSVFile("tmp/label_train_twoclass.ascii","w")
lab.save(f)
f=BinaryFile("tmp/label_train_twoclass.bin","w")
lab.save(f)
f=HDF5File("tmp/label_train_real.h5","w", "/data/labels")
lab.save(f)
f=CSVFile("tmp/label_train_twoclass.ascii")
lab2.load(f)
f=BinaryFile("tmp/label_train_twoclass.bin")
lab2.load(f)
f=HDF5File("tmp/fm_train_real.h5","r", "/data/doubles")
feats2.load(f)
#print(feats2.get_feature_matrix())
f=HDF5File("tmp/label_train_real.h5","r", "/data/labels")
lab2.load(f)
#print(lab2.get_labels())
#clean up
import os
for f in ['tmp/fm_train_sparsereal.bin','tmp/fm_train_sparsereal.ascii',
'tmp/fm_train_real.bin','tmp/fm_train_real.h5','tmp/fm_train_real.ascii',
'tmp/label_train_real.h5', 'tmp/label_train_twoclass.ascii','tmp/label_train_twoclass.bin']:
os.unlink(f)
return feats, feats2, lab, lab2
if __name__=='__main__':
print('Features IO')
features_io_modular(*parameter_list[0])
# This example demonstrates how to read and write data in the SVMLight format
# with Shogun.
#
#!/usr/bin/env python
parameter_list=[['../data/train_sparsereal.light']]
def features_read_svmlight_format_modular (fname):
import os
from modshogun import SparseRealFeatures
from modshogun import LibSVMFile
f=SparseRealFeatures()
lab=f.load_with_labels(LibSVMFile(fname))
f.save_with_labels(LibSVMFile('tmp/testwrite.light', 'w'), lab)
os.unlink('tmp/testwrite.light')
if __name__=='__main__':
print('Reading SVMLIGHT format')
features_read_svmlight_format_modular(*parameter_list[0])
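# For reference, the SVMLight text format stores one example per line as
# '<label> <index>:<value> ...' with 1-based, increasing feature indices.
# A minimal sketch writing such a file by hand (file name is hypothetical):
def write_svmlight(fname, labels, sparse_rows):
    # sparse_rows: one list of (index, value) pairs per example
    with open(fname, 'w') as fh:
        for lab, row in zip(labels, sparse_rows):
            entries = ' '.join('%d:%g' % (i, v) for i, v in row)
            fh.write('%g %s\n' % (lab, entries))

# write_svmlight('tmp/tiny.light', [1, -1], [[(1, 0.5), (3, 2.0)], [(2, 1.5)]])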
# Creates features similar to the feature space of the SNP kernel. Useful when
# working with linear methods.
#!/usr/bin/env python
parameter_list=[['../data/snps.dat']]
def features_snp_modular (fname):
from modshogun import StringByteFeatures, SNPFeatures, SNP
sf=StringByteFeatures(SNP)
sf.load_ascii_file(fname, False, SNP, SNP)
#print(sf.get_features())
snps=SNPFeatures(sf)
#print(snps.get_feature_matrix())
#print(snps.get_minor_base_string())
#print(snps.get_major_base_string())
if __name__=='__main__':
print('SNP Features')
features_snp_modular(*parameter_list[0])
# This example demonstrates how to encode sparse (most entries zero),
# real-valued features in shogun using SparseRealFeatures.
#!/usr/bin/env python
import numpy
# create dense matrix A
A=numpy.array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=numpy.float64)
parameter_list=[[A]]
def features_sparse_modular (A):
from scipy.sparse import csc_matrix
from modshogun import SparseRealFeatures
from numpy import array, float64, all
# sparse representation X of dense matrix A
# note, will work with types other than float64 too,
# but requires recent scipy.sparse
X=csc_matrix(A)
#print(A)
# create sparse shogun features from dense matrix A
a=SparseRealFeatures(A)
a_out=a.get_full_feature_matrix()
#print(a_out)
assert(all(a_out==A))
#print(a_out)
# create sparse shogun features from sparse matrix X
a.set_sparse_feature_matrix(X)
a_out=a.get_full_feature_matrix()
#print(a_out)
assert(all(a_out==A))
# create sparse shogun features from sparse matrix X
a=SparseRealFeatures(X)
a_out=a.get_full_feature_matrix()
#print(a_out)
assert(all(a_out==A))
# obtain (data,indices,indptr) csc arrays of sparse shogun features
z=csc_matrix(a.get_sparse_feature_matrix())
z_out=z.todense()
#print(z_out)
assert(all(z_out==A))
if __name__=='__main__':
print('Sparse Features')
features_sparse_modular(*parameter_list[0])
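# The (data, indices, indptr) triplet of the CSC layout can be inspected
# directly in scipy: data holds the non-zero values column by column, indices
# the corresponding row numbers, and indptr[j]:indptr[j+1] delimits column j.
from scipy.sparse import csc_matrix
import numpy

B = numpy.array([[1, 0, 3], [0, 0, 0], [2, 0, 4]], dtype=numpy.float64)
X = csc_matrix(B)
# column 0 holds the values 1 and 2 in rows 0 and 2
assert list(X.data[X.indptr[0]:X.indptr[1]]) == [1.0, 2.0]
assert list(X.indices[X.indptr[0]:X.indptr[1]]) == [0, 2]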
# This example demonstrates how to use compressed strings with shogun.
# We currently support reading and writing compressed files using
# LZO, GZIP, BZIP2 and LZMA. Furthermore, we demonstrate how to decompress
# streams on-the-fly in order to work with data sets that would otherwise be
# too large to fit into memory.
#
#!/usr/bin/env python
parameter_list = [['features_string_char_compressed_modular.py']]
def features_string_char_compressed_modular (fname):
from modshogun import StringCharFeatures, StringFileCharFeatures, RAWBYTE
from modshogun import UNCOMPRESSED,SNAPPY,LZO,GZIP,BZIP2,LZMA, MSG_DEBUG
from modshogun import DecompressCharString
f=StringFileCharFeatures(fname, RAWBYTE)
#print("original strings", f.get_features())
#uncompressed
f.save_compressed("tmp/foo_uncompressed.str", UNCOMPRESSED, 1)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_uncompressed.str", True)
#print("uncompressed strings", f2.get_features())
#print
# load compressed data and uncompress on load
#snappy - not stable yet?!
#f.save_compressed("tmp/foo_snappy.str", SNAPPY, 9)
#f2=StringCharFeatures(RAWBYTE);
#f2.load_compressed("tmp/foo_snappy.str", True)
#print("snappy strings", f2.get_features())
#print
#lzo
f.save_compressed("tmp/foo_lzo.str", LZO, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_lzo.str", True)
#print("lzo strings", f2.get_features())
#print
##gzip
f.save_compressed("tmp/foo_gzip.str", GZIP, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_gzip.str", True)
#print("gzip strings", f2.get_features())
#print
#bzip2
f.save_compressed("tmp/foo_bzip2.str", BZIP2, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_bzip2.str", True)
#print("bzip2 strings", f2.get_features())
#print
#lzma
f.save_compressed("tmp/foo_lzma.str", LZMA, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_lzma.str", True)
#print("lzma strings", f2.get_features())
#print
# load compressed data and uncompress via preprocessor
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_lzo.str", False)
f2.add_preprocessor(DecompressCharString(LZO))
f2.apply_preprocessor()
#print("lzo strings", f2.get_features())
#print
# load compressed data and uncompress on-the-fly via preprocessor
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("tmp/foo_lzo.str", False)
#f2.io.set_loglevel(MSG_DEBUG)
f2.add_preprocessor(DecompressCharString(LZO))
f2.enable_on_the_fly_preprocessing()
#print("lzo strings", f2.get_features())
#print
#clean up
import os
for f in ['tmp/foo_uncompressed.str', 'tmp/foo_snappy.str', 'tmp/foo_lzo.str',
'tmp/foo_gzip.str', 'tmp/foo_bzip2.str', 'tmp/foo_lzma.str']:
if os.path.exists(f):
os.unlink(f)
##########################################################################################
# some perfectly compressible stuff follows
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
if __name__=='__main__':
print('Compressing StringCharFileFeatures')
features_string_char_compressed_modular(*parameter_list[0])
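# The compress/decompress round-trip itself can be reproduced with Python's
# standard gzip module, independent of Shogun (the temp file name below is
# just for illustration):
import gzip, os

payload = b'ACGT' * 1000                     # highly compressible content
with gzip.open('tmp/roundtrip_demo.gz', 'wb') as fh:
    fh.write(payload)
with gzip.open('tmp/roundtrip_demo.gz', 'rb') as fh:
    restored = fh.read()
assert restored == payload
os.unlink('tmp/roundtrip_demo.gz')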
# This example demonstrates how to encode raw byte strings (an alphabet of 256 symbols) in shogun.
#!/usr/bin/env python
strings=['hey','guys','i','am','a','string']
parameter_list=[[strings]]
def features_string_char_modular (strings):
from modshogun import StringCharFeatures, RAWBYTE
from numpy import array
#create string features
f=StringCharFeatures(strings, RAWBYTE)
#and output several stats
#print("max string length", f.get_max_vector_length())
#print("number of strings", f.get_num_vectors())
#print("length of first string", f.get_vector_length(0))
#print("string[5]", ''.join(f.get_feature_vector(5)))
#print("strings", f.get_features())
#replace string 0
f.set_feature_vector(array(['t','e','s','t']), 0)
#print("strings", f.get_features())
return f.get_features(), f
if __name__=='__main__':
print('StringCharFeatures')
features_string_char_modular(*parameter_list[0])
# This example demonstrates how to load ASCII features from a file into shogun.
#!/usr/bin/env python
parameter_list = [['features_string_file_char_modular.py']]
def features_string_file_char_modular (fname):
from modshogun import StringFileCharFeatures, RAWBYTE
f = StringFileCharFeatures(fname, RAWBYTE)
#print("strings", f.get_features())
return f
if __name__=='__main__':
print('StringFileCharFeatures')
features_string_file_char_modular(*parameter_list[0])
# This example demonstrates how to load string features from files.
# We cover two cases: First, we show how to obtain StringCharFeatures
# from a directory of text files (particularly useful in computational biology)
# and second, we demonstrate how to load StringCharFeatures from one (multi-line) file.
#
#!/usr/bin/env python
parameter_list=[[".", "features_string_char_modular.py"]]
def features_string_file_modular (directory, fname):
from modshogun import StringCharFeatures, RAWBYTE
from modshogun import CSVFile
# load features from directory
f=StringCharFeatures(RAWBYTE)
f.load_from_directory(directory)
#and output several stats
#print("max string length", f.get_max_vector_length())
#print("number of strings", f.get_num_vectors())
#print("length of first string", f.get_vector_length(0))
#print("str[0,0:3]", f.get_feature(0,0), f.get_feature(0,1), f.get_feature(0,2))
#print("len(str[0])", f.get_vector_length(0))
#print("str[0]", f.get_feature_vector(0))
#or load features from file (one string per line)
fil=CSVFile(fname)
f.load(fil)
#print(f.get_features())
#or load fasta file
#f.load_fasta('fasta.fa')
#print(f.get_features())
return f.get_features(), f
if __name__=='__main__':
print('StringFileFeatures')
features_string_file_modular(*parameter_list[0])
# This creates a HashedWDFeatures object, i.e. an approximation to the Weighted
# Degree kernel feature space via hashes. These features can be particularly fast
# in linear SVM solvers.
#!/usr/bin/env python
from modshogun import LongIntFeatures
from numpy import array, int64, all
# create dense matrix A
matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
parameter_list = [[matrix,3,1,2],[matrix,3,1,2]]
# ... of type LongInt
def features_string_hashed_wd_modular (A=matrix,order=3,start_order=1,hash_bits=2):
a=LongIntFeatures(A)
from numpy import array, uint8
from modshogun import HashedWDFeatures, StringByteFeatures, RAWDNA
from modshogun import MSG_DEBUG
x=[array([0,1,2,3,0,1,2,3,3,2,2,1,1],dtype=uint8)]
from_order=order
f=StringByteFeatures(RAWDNA)
#f.io.set_loglevel(MSG_DEBUG)
f.set_features(x)
y=HashedWDFeatures(f,start_order,order,from_order,hash_bits)
fm=y.get_computed_dot_feature_matrix()
return fm
if __name__=='__main__':
print('string_hashed_wd')
features_string_hashed_wd_modular(*parameter_list[0])
# In this example, we demonstrate how to obtain string features
# by using a sliding window in a memory-efficient way. Instead of copying
# the string for each position of the sliding window, we only store a reference
# with respect to the complete string. This is particularly useful when working
# with genomic data, where storing all explicitly copied strings in memory
# quickly becomes infeasible. In addition to a sliding window (of a particular
# length) over all positions, we also support defining a custom position
# list.
#!/usr/bin/env python
# create string features with a single string
s=10*'A' + 10*'C' + 10*'G' + 10*'T'
parameter_list=[[s]]
def features_string_sliding_window_modular (strings):
from modshogun import StringCharFeatures, DNA
from modshogun import DynamicIntArray
f=StringCharFeatures([strings], DNA)
# slide a window of length 5 over features
# (memory efficient, does not copy strings)
f.obtain_by_sliding_window(5,1)
#print(f.get_num_vectors())
#print(f.get_vector_length(0))
#print(f.get_vector_length(1))
#print(f.get_features())
# slide a window of length 4 over features
# (memory efficient, does not copy strings)
f.obtain_by_sliding_window(4,1)
#print(f.get_num_vectors())
#print(f.get_vector_length(0))
#print(f.get_vector_length(1))
#print(f.get_features())
# extract string-windows at positions 0,6,16,25 of window size 4
# (memory efficient, does not copy strings)
f.set_features([s])
positions=DynamicIntArray()
positions.append_element(0)
positions.append_element(6)
positions.append_element(16)
positions.append_element(25)
f.obtain_by_position_list(4,positions)
#print(f.get_features())
# now extract windows of size 8 from the same position list
f.obtain_by_position_list(8,positions)
#print(f.get_features())
return f
if __name__=='__main__':
print('Sliding Window')
features_string_sliding_window_modular(*parameter_list[0])
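# The memory saving can be mimicked in plain Python by storing (start, length)
# references into the one master string instead of substring copies; a window
# is only materialised on access. A minimal sketch:
def window_refs(s, win_len, step=1):
    # references into s -- no substring copies are made here
    return [(start, win_len) for start in range(0, len(s) - win_len + 1, step)]

master = 10*'A' + 10*'C' + 10*'G' + 10*'T'
refs = window_refs(master, 5)
start, length = refs[12]
assert master[start:start + length] == 'CCCCC'   # materialised on demand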
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C},
# using 1 char = 1 byte per symbol would be wasteful, as each
# of the 4 symbols can be encoded using only 2 bits.
# Here, this is done in chunks of 64 bits (ulong).
#!/usr/bin/env python
parameter_list = [[0,2,0,False],[0,3,0,False]]
def features_string_ulong_modular (start=0,order=2,gap=0,rev=False):
from modshogun import StringCharFeatures, StringUlongFeatures, RAWBYTE
from numpy import array, uint64
#create string features
cf=StringCharFeatures(['hey','guys','string'], RAWBYTE)
uf=StringUlongFeatures(RAWBYTE)
uf.obtain_from_char(cf, start,order,gap,rev)
#replace string 0
uf.set_feature_vector(array([1,2,3,4,5], dtype=uint64), 0)
return uf.get_features(),uf.get_feature_vector(2), uf.get_num_vectors()
if __name__=='__main__':
print('simple_longint')
features_string_ulong_modular(*parameter_list[0])
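# The 2-bit packing described above is easy to sketch in plain Python: map
# A,C,G,T to the codes 0..3 and shift them into one unsigned integer, so up
# to 32 DNA symbols fit into a single 64-bit word. This only illustrates the
# packing idea, not Shogun's exact bit layout.
CODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def pack_dna(s):
    assert len(s) <= 32          # two bits per symbol, 64 bits per word
    word = 0
    for c in s:
        word = (word << 2) | CODE[c]
    return word

assert pack_dna('ACGT') == 0b00011011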
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C},
# using 1 char = 1 byte per symbol would be wasteful, as each
# of the 4 symbols can be encoded using only 2 bits.
# Here, this is done in chunks of 16 bits (word).
#!/usr/bin/env python
strings=['hey','guys','string']
parameter_list=[[strings,0,2,0,False]]
def features_string_word_modular (strings, start, order, gap, rev):
from modshogun import StringCharFeatures, StringWordFeatures, RAWBYTE
from numpy import array, uint16
#create string features
cf=StringCharFeatures(strings, RAWBYTE)
wf=StringWordFeatures(RAWBYTE)
wf.obtain_from_char(cf, start, order, gap, rev)
#and output several stats
#print("max string length", wf.get_max_vector_length())
#print("number of strings", wf.get_num_vectors())
#print("length of first string", wf.get_vector_length(0))
#print("string[2]", wf.get_feature_vector(2))
#print("strings", wf.get_features())
#replace string 0
wf.set_feature_vector(array([1,2,3,4,5], dtype=uint16), 0)
#print("strings", wf.get_features())
return wf.get_features(), wf
if __name__=='__main__':
print('StringWordFeatures')
features_string_word_modular(*parameter_list[0])
# In this example the ANOVA kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat,2,10], [traindat,testdat,5,10]]
def kernel_anova_modular (train_fname=traindat,test_fname=testdat,cardinality=2, size_cache=10):
from modshogun import ANOVAKernel,RealFeatures,CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=ANOVAKernel(feats_train, feats_train, cardinality, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train, km_test, kernel
if __name__=='__main__':
print('ANOVA')
kernel_anova_modular(*parameter_list[0])
# This example demonstrates the use of the AUC Kernel, which
# can be used to maximize AUC instead of margin in SVMs.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list = [[traindat,label_traindat,1.7], [traindat,label_traindat,1.6]]
def kernel_auc_modular (train_fname=traindat,label_fname=label_traindat,width=1.7):
from modshogun import GaussianKernel, AUCKernel, RealFeatures
from modshogun import BinaryLabels, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
subkernel=GaussianKernel(feats_train, feats_train, width)
kernel=AUCKernel(0, subkernel)
kernel.setup_auc_maximization(BinaryLabels(CSVFile(label_fname)))
km_train=kernel.get_kernel_matrix()
return kernel
if __name__=='__main__':
print('AUC')
kernel_auc_modular(*parameter_list[0])
# In this example the Cauchy kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 10.0]]
def kernel_cauchy_modular (train_fname=traindat,test_fname=testdat, sigma=1.0):
from modshogun import RealFeatures, CauchyKernel, CSVFile, EuclideanDistance
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=CauchyKernel(feats_train, feats_train, sigma, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Cauchy')
kernel_cauchy_modular(*parameter_list[0])
# This is an example for the initialization of the chi2-kernel on real data, where
# each column of the matrices corresponds to one training/test example.
#!/usr/bin/env python
traindat = '../data/fm_train_hist.dat'
testdat = '../data/fm_test_hist.dat'
parameter_list = [[traindat,testdat,1.4,10], [traindat,testdat,1.5,10]]
def kernel_chi2_modular (train_fname=traindat,test_fname=testdat,width=1.4, size_cache=10):
from modshogun import RealFeatures, Chi2Kernel, CSVFile, NormOne
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Chi2')
kernel_chi2_modular(*parameter_list[0])
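# A single entry of the chi2 kernel can be cross-checked against the common
# definition k(x,y) = exp(-1/width * sum_i (x_i-y_i)^2 / (x_i+y_i)) -- a
# numpy sketch, assuming this definition and strictly positive histogram bins:
import numpy

def chi2_entry(x, y, width):
    d = numpy.sum((x - y) ** 2 / (x + y))
    return numpy.exp(-d / width)

# chi2_entry(numpy.array([0.3, 0.7]), numpy.array([0.5, 0.5]), 1.4)
# corresponds to one entry of the kernel matrices above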
# In this example the circular kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_circular_modular(train_fname=traindat,test_fname=testdat, sigma=1.0):
from modshogun import RealFeatures, CircularKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=CircularKernel(feats_train, feats_train, sigma, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Circular')
kernel_circular_modular(*parameter_list[0])
# In this example a combined kernel, composed of a custom kernel and a polynomial kernel, is computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'
parameter_list= [[traindat,testdat,label_traindat],[traindat,testdat,label_traindat]]
def kernel_combined_custom_poly_modular (train_fname = traindat,test_fname = testdat,train_label_fname=label_traindat):
from modshogun import CombinedFeatures, RealFeatures, BinaryLabels
from modshogun import CombinedKernel, PolyKernel, CustomKernel
from modshogun import LibSVM, CSVFile
kernel = CombinedKernel()
feats_train = CombinedFeatures()
tfeats = RealFeatures(CSVFile(train_fname))
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, tfeats)
K = tkernel.get_kernel_matrix()
kernel.append_kernel(CustomKernel(K))
subkfeats_train = RealFeatures(CSVFile(train_fname))
feats_train.append_feature_obj(subkfeats_train)
subkernel = PolyKernel(10,2)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
labels = BinaryLabels(CSVFile(train_label_fname))
svm = LibSVM(1.0, kernel, labels)
svm.train()
kernel = CombinedKernel()
feats_pred = CombinedFeatures()
pfeats = RealFeatures(CSVFile(test_fname))
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, pfeats)
K = tkernel.get_kernel_matrix()
kernel.append_kernel(CustomKernel(K))
subkfeats_test = RealFeatures(CSVFile(test_fname))
feats_pred.append_feature_obj(subkfeats_test)
subkernel = PolyKernel(10, 2)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_pred)
svm.set_kernel(kernel)
svm.apply()
km_train=kernel.get_kernel_matrix()
return km_train,kernel
if __name__=='__main__':
kernel_combined_custom_poly_modular(*parameter_list[0])
# This is an example for the initialization of a combined kernel, which is a weighted sum of,
# in this case, three kernels on real-valued data. The sub-kernel weights are all set to 1.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
traindat = double(lm.load_numbers('../data/fm_train_real.dat'))
testdat = double(lm.load_numbers('../data/fm_test_real.dat'))
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,traindna,testdna],[traindat,testdat,traindna,testdna]]
def kernel_combined_modular (fm_train_real=traindat,fm_test_real=testdat,fm_train_dna=traindna,fm_test_dna=testdna ):
from modshogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
from modshogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA
kernel=CombinedKernel()
feats_train=CombinedFeatures()
feats_test=CombinedFeatures()
subkfeats_train=RealFeatures(fm_train_real)
subkfeats_test=RealFeatures(fm_test_real)
subkernel=GaussianKernel(10, 1.1)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
degree=3
subkernel=FixedDegreeStringKernel(10, degree)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
subkernel=LocalAlignmentStringKernel(10)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Combined')
kernel_combined_modular(*parameter_list[0])
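# Conceptually, a combined kernel is just the (weighted) sum of its sub-kernel
# matrices; with all weights at 1, summing the individual matrices reproduces
# the combined result. A numpy sketch of the principle:
import numpy

def combine(kernel_matrices, weights=None):
    if weights is None:
        weights = numpy.ones(len(kernel_matrices))
    return sum(w * km for w, km in zip(weights, kernel_matrices))

# combine([km_gaussian, km_fixed_degree, km_local_alignment]) corresponds to
# the km_train returned above when all sub-kernel weights are 1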
# This is an example for the initialization of the CommUlongString-kernel. This kernel
# sums over k-mer matches (k='order'). For efficient computation a preprocessor is used
# that extracts and sorts all k-mers. If 'use_sign' is set, each k-mer is counted
# only once.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat =lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3,0,False ],[traindat,testdat,4,0,False]]
def kernel_comm_ulong_string_modular (fm_train_dna=traindat,fm_test_dna=testdat, order=3, gap=0, reverse = False):
from modshogun import CommUlongStringKernel
from modshogun import StringUlongFeatures, StringCharFeatures, DNA
from modshogun import SortUlongString
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringUlongFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortUlongString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringUlongFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
use_sign=False
kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('CommUlongString')
kernel_comm_ulong_string_modular(*parameter_list[0])
# This is an example for the initialization of the CommWordString-kernel (aka
# Spectrum or n-gram kernel; its name is derived from the unix command comm). This kernel
# sums over k-mer matches (k='order'). For efficient computation a preprocessor is used
# that extracts and sorts all k-mers. If 'use_sign' is set, each k-mer is counted
# only once.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,4,0,False, False],[traindat,testdat,4,0,False,False]]
def kernel_comm_word_string_modular (fm_train_dna=traindat, fm_test_dna=testdat, order=3, gap=0, reverse = False, use_sign = False):
from modshogun import CommWordStringKernel
from modshogun import StringWordFeatures, StringCharFeatures, DNA
from modshogun import SortWordString
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('CommWordString')
kernel_comm_word_string_modular(*parameter_list[0])
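# The spectrum kernel value for one pair of strings can be sketched directly:
# count all k-mers in each string and take the dot product of the two count
# vectors (or of their signs when use_sign is enabled). Plain Python:
from collections import Counter

def kmer_counts(s, k):
    return Counter(s[i:i + k] for i in range(len(s) - k + 1))

def spectrum_kernel(s1, s2, k, use_sign=False):
    c1, c2 = kmer_counts(s1, k), kmer_counts(s2, k)
    if use_sign:
        return float(sum(1 for m in c1 if m in c2))
    return float(sum(c1[m] * c2[m] for m in c1))

# spectrum_kernel('ACGTACGT', 'ACGTTT', 3) counts shared 3-mer occurrences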
# The constant kernel gives a trivial kernel matrix with all entries set to the same value
# defined by the argument 'c'.
#
#!/usr/bin/env python
parameter_list =[[23],[24]]
def kernel_const_modular (c=23):
from modshogun import DummyFeatures
from modshogun import ConstKernel
feats_train=DummyFeatures(10)
feats_test=DummyFeatures(17)
kernel=ConstKernel(feats_train, feats_train, c)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Const')
kernel_const_modular(*parameter_list[0])
# A user-defined custom kernel is assigned in this example. The kernel matrix may be given
# as the lower triangle only (set_triangle_kernel_matrix_from_triangle), as
# a full matrix (set_full_kernel_matrix_from_full), or as a full matrix which is then internally stored as a
# triangle (set_triangle_kernel_matrix_from_full). In addition, row and column subsets of the
# kernel matrix are selected via IndexFeatures.
#
#!/usr/bin/env python
from numpy.random import seed
seed(42)
parameter_list=[[7],[8]]
def kernel_custom_modular (dim=7):
from numpy.random import rand, seed
from numpy import array, float32, int32
from modshogun import RealFeatures
from modshogun import CustomKernel
from modshogun import IndexFeatures
seed(17)
data=rand(dim, dim)
feats=RealFeatures(data)
symdata=data+data.T
lowertriangle=array([symdata[(x,y)] for x in range(symdata.shape[1])
for y in range(symdata.shape[0]) if y<=x])
kernel=CustomKernel()
# once with float64's
kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
km_triangletriangle=kernel.get_kernel_matrix()
kernel.set_triangle_kernel_matrix_from_full(symdata)
km_fulltriangle=kernel.get_kernel_matrix()
kernel.set_full_kernel_matrix_from_full(symdata)
km_fullfull=kernel.get_kernel_matrix()
# get subset of kernel
row_idx=array(range(3),dtype=int32)
col_idx=array(range(2),dtype=int32)
row_idx_feat=IndexFeatures(row_idx)
col_idx_feat=IndexFeatures(col_idx)
kernel.init(row_idx_feat, col_idx_feat)
km_sub_kernel=kernel.get_kernel_matrix()
# print('Subkernel(3x2):\n%s'%km_sub_kernel)
kernel.remove_all_col_subsets()
kernel.remove_all_row_subsets()
# now once with float32's
data=array(data,dtype=float32)
kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
km_triangletriangle=kernel.get_kernel_matrix()
kernel.set_triangle_kernel_matrix_from_full(symdata)
km_fulltriangle=kernel.get_kernel_matrix()
kernel.set_full_kernel_matrix_from_full(symdata)
km_fullfull=kernel.get_kernel_matrix()
return km_fullfull,kernel,km_sub_kernel
if __name__=='__main__':
print('Custom')
kernel_custom_modular(*parameter_list[0])
# This is an example for the initialization of the diag-kernel.
# The diag kernel produces a kernel matrix in which all entries off
# the main diagonal are zero and the diagonal entries are set to 'diag'.
#!/usr/bin/env python
parameter_list =[[23],[24]]
def kernel_diag_modular (diag=23):
from modshogun import DummyFeatures
from modshogun import DiagKernel
feats_train=DummyFeatures(10)
feats_test=DummyFeatures(17)
kernel=DiagKernel(feats_train, feats_train, diag)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Diag')
kernel_diag_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
from modshogun import RealFeatures, MSG_DEBUG
traindat = numpy.random.random_sample((10,10))
testdat = numpy.random.random_sample((10,10))
parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.4]]
def kernel_director_linear_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.2):
try:
from modshogun import DirectorKernel
except ImportError:
print("recompile shogun with --enable-swig-directors")
return
class DirectorLinearKernel(DirectorKernel):
def __init__(self):
DirectorKernel.__init__(self, True)
def kernel_function(self, idx_a, idx_b):
seq1 = self.get_lhs().get_feature_vector(idx_a)
seq2 = self.get_rhs().get_feature_vector(idx_b)
return numpy.dot(seq1, seq2)
from modshogun import LinearKernel, AvgDiagKernelNormalizer
from modshogun import Time
feats_train=RealFeatures(fm_train_real)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_train.parallel.set_num_threads(1)
feats_test=RealFeatures(fm_test_real)
kernel=LinearKernel()
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
dkernel=DirectorLinearKernel()
dkernel.set_normalizer(AvgDiagKernelNormalizer(scale))
dkernel.init(feats_train, feats_train)
#print "km_train"
t=Time()
km_train=kernel.get_kernel_matrix()
#t1=t.cur_time_diff(True)
#print "dkm_train"
t=Time()
dkm_train=dkernel.get_kernel_matrix()
#t2=t.cur_time_diff(True)
#print "km_train", km_train
#print "dkm_train", dkm_train
return km_train, dkm_train
if __name__=='__main__':
print('DirectorLinear')
kernel_director_linear_modular(*parameter_list[0])
# With the distance kernel one can use any of the following distance metrics:
# BrayCurtisDistance()
# CanberraMetric()
# CanberraWordDistance()
# ChebyshewMetric()
# ChiSquareDistance()
# CosineDistance()
# Distance()
# EuclidianDistance()
# GeodesicMetric()
# HammingWordDistance()
# JensenMetric()
# ManhattanMetric()
# ManhattanWordDistance()
# MinkowskiMetric()
# RealDistance()
# SimpleDistance()
# SparseDistance()
# SparseEuclidianDistance()
# StringDistance()
# TanimotoDistance()
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat,1.7],[traindat,testdat,1.8]]
def kernel_distance_modular (train_fname=traindat,test_fname=testdat,width=1.7):
from modshogun import RealFeatures, DistanceKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance()
kernel=DistanceKernel(feats_train, feats_train, width, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Distance')
kernel_distance_modular(*parameter_list[0])
# In this example the distant segments kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,5,5],[traindat,testdat,6,6]]
def kernel_distantsegments_modular (fm_train_dna=traindat,fm_test_dna=testdat,delta=5, theta=5):
from modshogun import StringCharFeatures, DNA
from modshogun import DistantSegmentsKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=DistantSegmentsKernel(feats_train, feats_train, 10, delta, theta)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train, km_test, kernel
if __name__=='__main__':
print('DistantSegments')
kernel_distantsegments_modular(*parameter_list[0])
# In this example the exponential kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_exponential_modular (train_fname=traindat,test_fname=testdat, tau_coef=1.0):
from modshogun import RealFeatures, ExponentialKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance = EuclideanDistance(feats_train, feats_train)
kernel=ExponentialKernel(feats_train, feats_train, tau_coef, distance, 10)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Exponential')
kernel_exponential_modular(*parameter_list[0])
# The class FKFeatures implements Fisher kernel features obtained from
# two Hidden Markov models.
#
# It was used in
#
# K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new
# discriminative kernel from probabilistic models. Neural Computation,
# 14:2397-2414, 2002.
#
# which also has the details.
#
# Note that FK-features are computed on the fly, so to be effective feature
# caching should be enabled.
#
# It inherits its functionality from CSimpleFeatures, which should be
# consulted for further reference.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list = [[traindat,testdat,label_traindat,1,4,1e-1,1,0,False,[1,False,True]],[traindat,testdat,label_traindat,3,4,1e-1,1,0,False,[1,False,True]]]
fm_hmm_pos=[ traindat[i] for i in where([label_traindat==1])[1] ]
fm_hmm_neg=[ traindat[i] for i in where([label_traindat==-1])[1] ]
def kernel_fisher_modular (fm_train_dna=traindat, fm_test_dna=testdat,
label_train_dna=label_traindat,
N=1,M=4,pseudo=1e-1,order=1,gap=0,reverse=False,
kargs=[1,False,True]):
from modshogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
from modshogun import PolyKernel
from modshogun import HMM, BW_NORMAL#, MSG_DEBUG
# train HMM for positive class
charfeat=StringCharFeatures(fm_hmm_pos, DNA)
#charfeat.io.set_loglevel(MSG_DEBUG)
hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
pos=HMM(hmm_pos_train, N, M, pseudo)
pos.baum_welch_viterbi_train(BW_NORMAL)
# train HMM for negative class
charfeat=StringCharFeatures(fm_hmm_neg, DNA)
hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
neg=HMM(hmm_neg_train, N, M, pseudo)
neg.baum_welch_viterbi_train(BW_NORMAL)
# Kernel training data
charfeat=StringCharFeatures(fm_train_dna, DNA)
wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
# Kernel testing data
charfeat=StringCharFeatures(fm_test_dna, DNA)
wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
# get kernel on training data
pos.set_observations(wordfeats_train)
neg.set_observations(wordfeats_train)
feats_train=FKFeatures(10, pos, neg)
feats_train.set_opt_a(-1) #estimate prior
kernel=PolyKernel(feats_train, feats_train, *kargs)
km_train=kernel.get_kernel_matrix()
# get kernel on testing data
pos_clone=HMM(pos)
neg_clone=HMM(neg)
pos_clone.set_observations(wordfeats_test)
neg_clone.set_observations(wordfeats_test)
feats_test=FKFeatures(10, pos_clone, neg_clone)
feats_test.set_a(feats_train.get_a()) #use prior from training data
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print("Fisher Kernel")
kernel_fisher_modular(*parameter_list[0])
# The FixedDegree String kernel takes as input two strings of the same size and counts the number of matches of length d.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat, testdat,3],[traindat,testdat,4]]
def kernel_fixed_degree_string_modular (fm_train_dna=traindat, fm_test_dna=testdat,degree=3):
from modshogun import StringCharFeatures, DNA
from modshogun import FixedDegreeStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=FixedDegreeStringKernel(feats_train, feats_train, degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('FixedDegreeString')
kernel_fixed_degree_string_modular(*parameter_list[0])
# The well-known Gaussian kernel (the swiss army knife for SVMs) on dense real-valued features.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.3],[traindat,testdat, 1.4]]
def kernel_gaussian_modular (train_fname=traindat,test_fname=testdat, width=1.3):
from modshogun import RealFeatures, GaussianKernel, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Gaussian')
kernel_gaussian_modular(*parameter_list[0])
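# The result can be cross-checked with a direct numpy computation. Note that
# Shogun's GaussianKernel computes k(x,y) = exp(-||x-y||^2 / width), i.e. the
# 'width' parameter is the full denominator (not 2*sigma^2) -- worth keeping
# in mind when comparing against other libraries.
import numpy

def gaussian_km(X, Y, width):
    # X, Y: dim x n matrices with one example per column
    sq = (numpy.sum(X**2, 0)[:, None] + numpy.sum(Y**2, 0)[None, :]
          - 2.0 * numpy.dot(X.T, Y))
    return numpy.exp(-sq / width)

# gaussian_km(train, train, 1.3) should match the km_train computed above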
# An experimental kernel inspired by the WeightedDegreePositionStringKernel and the Gaussian kernel.
# The idea is to shift the dimensions of the input vectors against each other. 'shift_step' is the step
# size of the shifts and 'max_shift' is the maximal shift.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat,1.8,2,1],[traindat,testdat,1.9,2,1]]
def kernel_gaussian_shift_modular (train_fname=traindat,test_fname=testdat,width=1.8,max_shift=2,shift_step=1):
from modshogun import RealFeatures, GaussianShiftKernel, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=GaussianShiftKernel(feats_train, feats_train, width, max_shift, shift_step)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('GaussianShift')
kernel_gaussian_shift_modular(*parameter_list[0])
# The HistogramWordString kernel computes the TOP kernel on inhomogeneous Markov chains.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list=[[traindat,testdat,label_traindat,1,1e1, 1e0],[traindat,testdat,label_traindat,1,1e4,1e4]]
def kernel_histogram_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,order=3,ppseudo_count=1,npseudo_count=1):
from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
from modshogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
from modshogun import PluginEstimate#, MSG_DEBUG
charfeat=StringCharFeatures(DNA)
#charfeat.io.set_loglevel(MSG_DEBUG)
charfeat.set_features(fm_train_dna)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, 0, False)
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, 0, False)
pie=PluginEstimate(ppseudo_count,npseudo_count)
labels=BinaryLabels(label_train_dna)
pie.set_labels(labels)
pie.set_features(feats_train)
pie.train()
kernel=HistogramWordStringKernel(feats_train, feats_train, pie)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
pie.set_features(feats_test)
pie.apply().get_labels()
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('PluginEstimate w/ HistogramWord')
kernel_histogram_word_string_modular(*parameter_list[0])
# In this example the inverse multiquadic kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_inversemultiquadric_modular (train_fname=traindat,test_fname=testdat, shift_coef=1.0):
from modshogun import RealFeatures, InverseMultiQuadricKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=InverseMultiQuadricKernel(feats_train, feats_train, shift_coef, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('InverseMultiquadric')
kernel_inversemultiquadric_modular(*parameter_list[0])
# This example shows how to save a computed kernel matrix to a file.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat,1.9],[traindat,testdat,1.7]]
def kernel_io_modular (train_fname=traindat,test_fname=testdat,width=1.9):
from modshogun import RealFeatures, GaussianKernel, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
f=CSVFile("tmp/gaussian_train.csv","w")
kernel.save(f)
del f
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
f=CSVFile("tmp/gaussian_test.csv","w")
kernel.save(f)
del f
#clean up
import os
os.unlink("tmp/gaussian_test.csv")
os.unlink("tmp/gaussian_train.csv")
return km_train, km_test, kernel
if __name__=='__main__':
print('Gaussian')
kernel_io_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on raw byte
# data.
#!/usr/bin/env python
traindat = '../data/fm_train_byte.dat'
testdat = '../data/fm_test_byte.dat'
parameter_list=[[traindat,testdat],[traindat,testdat]]
def kernel_linear_byte_modular (train_fname=traindat,test_fname=testdat):
from modshogun import LinearKernel, ByteFeatures, CSVFile
feats_train=ByteFeatures(CSVFile(train_fname))
feats_test=ByteFeatures(CSVFile(test_fname))
kernel=LinearKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return kernel
if __name__=='__main__':
print('LinearByte')
kernel_linear_byte_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on real valued
# data using scaling factor 1.2.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.4]]
def kernel_linear_modular (train_fname=traindat,test_fname=testdat,scale=1.2):
from modshogun import RealFeatures, LinearKernel, AvgDiagKernelNormalizer, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=LinearKernel()
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Linear')
kernel_linear_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on string data. The
# strings are all of the same length and consist of the characters 'ACGT' corresponding
# to the DNA-alphabet. Each column of the matrices of type char corresponds to
# one training/test example.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]
def kernel_linear_string_modular (fm_train_dna=traindat,fm_test_dna=testdat):
from modshogun import StringCharFeatures, DNA
from modshogun import LinearStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=LinearStringKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('LinearString')
kernel_linear_string_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on word (2byte)
# data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import ushort
lm=LoadMatrix()
traindat = ushort(lm.load_numbers('../data/fm_train_word.dat'))
testdat = ushort(lm.load_numbers('../data/fm_test_word.dat'))
parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.2]]
def kernel_linear_word_modular (fm_train_word=traindat,fm_test_word=testdat,scale=1.2):
from modshogun import LinearKernel, AvgDiagKernelNormalizer
from modshogun import WordFeatures
feats_train=WordFeatures(fm_train_word)
feats_test=WordFeatures(fm_test_word)
kernel=LinearKernel(feats_train, feats_train)
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return kernel
if __name__=='__main__':
print('LinearWord')
kernel_linear_word_modular(*parameter_list[0])
# This is an example for the initialization of the local alignment kernel on
# DNA sequences, where each column of the matrices of type char corresponds to
# one training/test example.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]
def kernel_local_alignment_string_modular (fm_train_dna=traindat,fm_test_dna=testdat):
from modshogun import StringCharFeatures, DNA
from modshogun import LocalAlignmentStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=LocalAlignmentStringKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('LocalAlignmentString')
kernel_local_alignment_string_modular(*parameter_list[0])
# The LocalityImprovedString kernel is inspired by the polynomial kernel.
# By comparing neighboring characters it puts emphasis on local features.
#
# It can be defined as
# K({\bf x},{\bf x'})=\left(\sum_{i=0}^{T-1}\left(\sum_{j=-l}^{+l}w_jI_{i+j}({\bf x},{\bf x'})\right)^{d_1}\right)^{d_2},
# where
# I_i({\bf x},{\bf x'})=1 if x_i=x'_i and 0 otherwise.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindat,testdat,5,5,7],[traindat,testdat,5,5,7]]
def kernel_locality_improved_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,length=5,inner_degree=5,outer_degree=7):
from modshogun import StringCharFeatures, DNA
from modshogun import LocalityImprovedStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=LocalityImprovedStringKernel(
feats_train, feats_train, length, inner_degree, outer_degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('LocalityImprovedString')
kernel_locality_improved_string_modular(*parameter_list[0])
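# A minimal pure-Python sketch of the formula above for two equal-length
# strings, assuming uniform window weights w_j=1 (the Shogun implementation
# uses its own weighting, so values will differ; this only illustrates the
# structure of the double sum):
def toy_locality_improved(x, xp, l=1, d1=2, d2=1):
    I=[1.0 if a==b else 0.0 for a, b in zip(x, xp)]  # match indicators I_i
    T=len(I)
    total=0.0
    for i in range(T):
        window=sum(I[max(0, i-l):min(T, i+l+1)])  # inner sum over j=-l..+l
        total+=window**d1
    return total**d2
print(toy_locality_improved('ACGTACGT', 'ACGTTGCA'))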
# In this example the log kernel (logarithm of the distance powered by degree plus one) is being computed for toy data.
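# Concretely, the kernel entry computed here should be
# k({\bf x},{\bf x'}) = -\log(\Vert{\bf x}-{\bf x'}\Vert^{degree}+1);
# treat the leading minus sign as an assumption about the implementation.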
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 2.0],[traindat,testdat, 3.0]]
def kernel_log_modular (train_fname=traindat,test_fname=testdat, degree=2.0):
from modshogun import RealFeatures, LogKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=LogKernel(feats_train, feats_train, degree, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Log')
kernel_log_modular(*parameter_list[0])
# In this example the match word string kernel is being computed for toy data
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat, 3,1.4,10,3,0,False],[
traindat,testdat, 3,1.4,10,3,0,False]]
def kernel_match_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,
degree=3,scale=1.4,size_cache=10,order=3,gap=0,reverse=False):
from modshogun import MatchWordStringKernel, AvgDiagKernelNormalizer
from modshogun import StringWordFeatures, StringCharFeatures, DNA
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(DNA)
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(DNA)
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
kernel=MatchWordStringKernel(size_cache, degree)
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('MatchWordString')
kernel_match_word_string_modular(*parameter_list[0])
# In this example the multiquadric kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_multiquadric_modular (train_fname=traindat,test_fname=testdat, shift_coef=1.0):
from modshogun import RealFeatures, MultiquadricKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=MultiquadricKernel(feats_train, feats_train, shift_coef, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Multiquadric')
kernel_multiquadric_modular(*parameter_list[0])
# This is an example initializing the oligo string kernel, which takes distances
# between matching oligos (k-mers) into account via a Gaussian. Variable 'k' defines the length
# of the oligo and variable 'w' the width of the Gaussian. The oligo string kernel is
# implemented for the DNA-alphabet 'ACGT'.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3,1.2,10],[traindat,testdat,4,1.3,10]]
def kernel_oligo_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,k=3,width=1.2,size_cache=10):
from modshogun import StringCharFeatures, DNA
from modshogun import OligoStringKernel
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=OligoStringKernel(size_cache, k, width)
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('OligoString')
kernel_oligo_string_modular(*parameter_list[0])
# In this example the poly match string kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3,False],[traindat,testdat,4,False]]
def kernel_poly_match_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=3,inhomogene=False):
from modshogun import PolyMatchStringKernel
from modshogun import StringCharFeatures, DNA
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=PolyMatchStringKernel(feats_train, feats_train, degree, inhomogene)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('PolyMatchString')
kernel_poly_match_string_modular(*parameter_list[0])
# This is an example for the initialization of the PolyMatchString kernel on string data.
# The PolyMatchString kernel sums over the matches of two strings of the same length and
# takes the sum to the power of 'degree'. The strings consist of the characters 'ACGT' corresponding
# to the DNA-alphabet. Each column of the matrices of type char corresponds to
# one training/test example.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,2,True,3,0,False],[traindat,testdat,2,True,3,0,False]]
def kernel_poly_match_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,
degree=2,inhomogene=True,order=3,gap=0,reverse=False):
from modshogun import PolyMatchWordStringKernel
from modshogun import StringWordFeatures, StringCharFeatures, DNA
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(DNA)
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(DNA)
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
kernel=PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('PolyMatchWordString')
kernel_poly_match_word_string_modular(*parameter_list[0])
# This example initializes the polynomial kernel with real data.
# If variable 'inhomogene' is 'True', +1 is added to the scalar product
# before taking it to the power of 'degree'. If 'use_normalization' is
# set to 'True', the kernel matrix will be normalized by the square roots
# of the diagonal entries.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat,4,False,True],[traindat,testdat,5,False,True]]
def kernel_poly_modular (train_fname=traindat,test_fname=testdat,degree=4,inhomogene=False,
use_normalization=True):
from modshogun import RealFeatures, PolyKernel, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=PolyKernel(
feats_train, feats_train, degree, inhomogene, use_normalization)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Poly')
kernel_poly_modular (*parameter_list[0])
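# A quick property check (sketch, assumes numpy): with use_normalization=True
# every training example satisfies K(x,x)/sqrt(K(x,x)*K(x,x))=1, so the
# diagonal of the train kernel matrix should be all ones.
import numpy
km_n,_,_=kernel_poly_modular(*parameter_list[0])
print(numpy.allclose(numpy.diag(km_n), 1.0))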
# In this example the power kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 2.0],[traindat,testdat, 3.0]]
def kernel_power_modular (train_fname=traindat,test_fname=testdat, degree=2.0):
from modshogun import RealFeatures, PowerKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=PowerKernel(feats_train, feats_train, degree, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Power')
kernel_power_modular(*parameter_list[0])
# In this example the rational quadratic kernel is being computed for toy data.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_rationalquadratic_modular (train_fname=traindat,test_fname=testdat, shift_coef=1.0):
from modshogun import RealFeatures, RationalQuadraticKernel, EuclideanDistance, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
distance=EuclideanDistance(feats_train, feats_train)
kernel=RationalQuadraticKernel(feats_train, feats_train, shift_coef, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('RationalQuadratic')
kernel_rationalquadratic_modular(*parameter_list[0])
# The SalzbergWordString kernel implements the Salzberg kernel.
#
# It is described in
#
# Engineering Support Vector Machine Kernels That Recognize Translation Initiation Sites
# A. Zien, G. Raetsch, S. Mika, B. Schoelkopf, T. Lengauer, K.-R. Mueller
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
parameter_list = [[traindat,testdat,label_traindat,3,0,False],[traindat,testdat,label_traindat,3,0,False]]
def kernel_salzberg_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,
order=3,gap=0,reverse=False):
from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
from modshogun import SalzbergWordStringKernel
from modshogun import PluginEstimate
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
pie=PluginEstimate()
labels=BinaryLabels(label_train_dna)
pie.set_labels(labels)
pie.set_features(feats_train)
pie.train()
kernel=SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
pie.set_features(feats_test)
pie.apply().get_labels()
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('PluginEstimate w/ SalzbergWord')
kernel_salzberg_word_string_modular(*parameter_list[0])
# The standard Sigmoid kernel computed on dense real valued features.
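# It computes the standard form k({\bf x},{\bf x'}) = \tanh(\gamma\,{\bf x}\cdot{\bf x'} + coef0).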
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
parameter_list = [[traindat,testdat,10,1.2,1.3],[traindat,testdat,10,1.2,1.3]]
def kernel_sigmoid_modular (train_fname=traindat,test_fname=testdat,size_cache=10,gamma=1.2,coef0=1.3):
from modshogun import RealFeatures, SigmoidKernel, CSVFile
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
kernel=SigmoidKernel(feats_train, feats_train, size_cache, gamma, coef0)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Sigmoid')
kernel_sigmoid_modular(*parameter_list[0])
# The SimpleLocalityImprovedString kernel is a 'simplified' and better performing version of the LocalityImprovedString kernel.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,5,5,1],[traindat,testdat,5,3,2]]
def kernel_simple_locality_improved_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,
length=5,inner_degree=5,outer_degree=1 ):
from modshogun import StringCharFeatures, DNA
from modshogun import SimpleLocalityImprovedStringKernel, MSG_DEBUG
feats_train=StringCharFeatures(fm_train_dna, DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=SimpleLocalityImprovedStringKernel(
feats_train, feats_train, length, inner_degree, outer_degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('SimpleLocalityImprovedString')
kernel_simple_locality_improved_string_modular(*parameter_list[0])
# This example demonstrates how to use the Gaussian Kernel with sparse features.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.1],[traindat,testdat,1.2]]
def kernel_sparse_gaussian_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.1 ):
from modshogun import SparseRealFeatures
from modshogun import GaussianKernel
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
kernel=GaussianKernel(feats_train, feats_train, width)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('SparseGaussian')
kernel_sparse_gaussian_modular (*parameter_list[0])
# This example demonstrates how to use the Linear Kernel with sparse features.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.1],[traindat,testdat,1.2]]
def kernel_sparse_linear_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.1):
from modshogun import SparseRealFeatures
from modshogun import LinearKernel, AvgDiagKernelNormalizer
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
kernel=LinearKernel()
kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
kernel.init(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('SparseLinear')
kernel_sparse_linear_modular(*parameter_list[0])
# This example shows how to use the polynomial kernel with sparse features.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,10,3,True],[traindat,testdat,10,4,True]]
def kernel_sparse_poly_modular (fm_train_real=traindat,fm_test_real=testdat,
size_cache=10,degree=3,inhomogene=True ):
from modshogun import SparseRealFeatures
from modshogun import PolyKernel
feats_train=SparseRealFeatures(fm_train_real)
feats_test=SparseRealFeatures(fm_test_real)
kernel=PolyKernel(feats_train, feats_train, size_cache,
inhomogene, degree)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('SparsePoly')
kernel_sparse_poly_modular(*parameter_list[0])
# In this example the spherical kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]]
def kernel_spherical_modular (fm_train_real=traindat,fm_test_real=testdat, sigma=1.0):
from modshogun import RealFeatures
from modshogun import SphericalKernel
from modshogun import EuclideanDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclideanDistance(feats_train, feats_train)
kernel=SphericalKernel(feats_train, feats_train, sigma, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Spherical')
kernel_spherical_modular(*parameter_list[0])
# In this example the spline kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]
def kernel_spline_modular (fm_train_real=traindat,fm_test_real=testdat):
from modshogun import RealFeatures
from modshogun import SplineKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=SplineKernel(feats_train, feats_train)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Spline')
kernel_spline_modular(*parameter_list[0])
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2014 Soumyajit De
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,2,0.75],[traindat,testdat,3,0.75]]
def kernel_ssk_string_modular (fm_train_dna=traindat, fm_test_dna=testdat, maxlen=1, decay=1):
from modshogun import SubsequenceStringKernel
from modshogun import StringCharFeatures, DNA
feats_train=StringCharFeatures(fm_train_dna, DNA)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=SubsequenceStringKernel(feats_train, feats_train, maxlen, decay)
km_train=kernel.get_kernel_matrix()
# print(km_train)
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
# print(km_test)
return km_train,km_test,kernel
if __name__=='__main__':
print('SubsequenceStringKernel DNA')
kernel_ssk_string_modular(*parameter_list[0])
kernel_ssk_string_modular(*parameter_list[1])
# The class TOPFeatures implements TOP kernel features obtained from
# two Hidden Markov models.
#
# It was used in
#
# K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new
# discriminative kernel from probabilistic models. Neural Computation,
# 14:2397-2414, 2002.
#
# which also has the details.
#
# Note that TOP-features are computed on the fly, so feature caching
# should be enabled for them to be used efficiently.
#
# It inherits its functionality from CSimpleFeatures, which should be
# consulted for further reference.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')
fm_hmm_pos=[traindat[i] for i in where([label_traindat==1])[1] ]
fm_hmm_neg=[traindat[i] for i in where([label_traindat==-1])[1] ]
parameter_list = [[traindat,testdat,label_traindat,1e-1,1,0,False,[1, False, True]], \
[traindat,testdat,label_traindat,1e-1,1,0,False,[1, False, True] ]]
def kernel_top_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,pseudo=1e-1,
order=1,gap=0,reverse=False,kargs=[1, False, True]):
from modshogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
from modshogun import PolyKernel
from modshogun import HMM, BW_NORMAL
N=1 # toy HMM with 1 state
M=4 # 4 observations -> DNA
# train HMM for positive class
charfeat=StringCharFeatures(fm_hmm_pos, DNA)
hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
pos=HMM(hmm_pos_train, N, M, pseudo)
pos.baum_welch_viterbi_train(BW_NORMAL)
# train HMM for negative class
charfeat=StringCharFeatures(fm_hmm_neg, DNA)
hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
neg=HMM(hmm_neg_train, N, M, pseudo)
neg.baum_welch_viterbi_train(BW_NORMAL)
# Kernel training data
charfeat=StringCharFeatures(fm_train_dna, DNA)
wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
# Kernel testing data
charfeat=StringCharFeatures(fm_test_dna, DNA)
wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
# get kernel on training data
pos.set_observations(wordfeats_train)
neg.set_observations(wordfeats_train)
feats_train=TOPFeatures(10, pos, neg, False, False)
kernel=PolyKernel(feats_train, feats_train, *kargs)
km_train=kernel.get_kernel_matrix()
# get kernel on testing data
pos_clone=HMM(pos)
neg_clone=HMM(neg)
pos_clone.set_observations(wordfeats_test)
neg_clone.set_observations(wordfeats_test)
feats_test=TOPFeatures(10, pos_clone, neg_clone, False, False)
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print("TOP Kernel")
kernel_top_modular(*parameter_list[0])
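# As a sketch of the construction from the paper above: for a sequence x and the
# two trained HMMs 'pos' and 'neg', the TOP feature vector is assumed to consist
# of the posterior log-odds log p(x|pos) - log p(x|neg) followed by the partial
# derivatives of that log-odds w.r.t. all parameters of both models; the kernel
# is then simply the chosen kernel (here PolyKernel) between these vectors.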
# In this example the t-Student's kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat, 2.0],[traindat,testdat, 3.0]]
def kernel_tstudent_modular (fm_train_real=traindat,fm_test_real=testdat, degree=2.0):
from modshogun import RealFeatures
from modshogun import TStudentKernel
from modshogun import EuclideanDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclideanDistance(feats_train, feats_train)
kernel=TStudentKernel(feats_train, feats_train, degree, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('TStudent')
kernel_tstudent_modular(*parameter_list[0])
# In this example the wave kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 10.0]]
def kernel_wave_modular (fm_train_real=traindat,fm_test_real=testdat, theta=1.0):
from modshogun import RealFeatures
from modshogun import WaveKernel
from modshogun import EuclideanDistance
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
distance=EuclideanDistance(feats_train, feats_train)
kernel=WaveKernel(feats_train, feats_train, theta, distance)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Wave')
kernel_wave_modular(*parameter_list[0])
# In this example the wavelet kernel is being computed for toy data.
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat, 1.5, 1.0],[traindat,testdat, 1.0, 1.5]]
def kernel_wavelet_modular (fm_train_real=traindat,fm_test_real=testdat, dilation=1.5, translation=1.0):
from modshogun import RealFeatures
from modshogun import WaveletKernel
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
kernel=WaveletKernel(feats_train, feats_train, 10, dilation, translation)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('Wavelet')
kernel_wavelet_modular(*parameter_list[0])
# The WeightedCommWordString kernel may be used to compute the weighted
# spectrum kernel (i.e. a spectrum kernel for 1 to K-mers, where each k-mer
# length is weighted by some coefficient \f$\beta_k\f$) from strings that have
# been mapped into unsigned 16bit integers.
#
# These 16bit integers correspond to k-mers. To be usable with this kernel they
# need to be sorted (e.g. via the SortWordString pre-processor).
#
# It basically uses the algorithm in the unix "comm" command (hence the name)
# to compute:
#
# k({\bf x},{\bf x'})= \sum_{k=1}^K\beta_k\Phi_k({\bf x})\cdot \Phi_k({\bf x'})
#
# where \f$\Phi_k\f$ maps a sequence \f${\bf x}\f$ that consists of letters in
# \f$\Sigma\f$ to a feature vector of size \f$|\Sigma|^k\f$. In this feature
# vector each entry denotes how often the k-mer appears in that \f${\bf x}\f$.
#
# Note that this representation is especially tuned to small alphabets
# (like the 2-bit alphabet DNA), for which it enables spectrum kernels
# of order 8.
#
# For this kernel the linadd speedups are quite efficiently implemented using
# direct maps.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat],[traindat,testdat]]
def kernel_weighted_comm_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,order=3,gap=0,reverse=True ):
from modshogun import WeightedCommWordStringKernel
from modshogun import StringWordFeatures, StringCharFeatures, DNA
from modshogun import SortWordString
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
use_sign=False
kernel=WeightedCommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('WeightedCommWordString')
kernel_weighted_comm_word_string_modular(*parameter_list[0])
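# A tiny pure-Python sketch of a single-k spectrum entry; the weighted spectrum
# kernel above then corresponds to sum_k beta_k * spectrum_entry(x, x', k).
# The uniform beta_k used below is a hypothetical choice for illustration only.
from collections import Counter
def spectrum_entry(x, y, k):
    cx=Counter(x[i:i+k] for i in range(len(x)-k+1))
    cy=Counter(y[i:i+k] for i in range(len(y)-k+1))
    return sum(cx[m]*cy[m] for m in cx)  # dot product of k-mer count vectors
K=3
print(sum((1.0/K)*spectrum_entry('ACGTACGT', 'ACGTTGCA', k) for k in range(1, K+1)))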
# The Weighted Degree Position String kernel (Weighted Degree kernel with shifts).
#
# The WD-shift kernel of order d compares two sequences X and
# Y of length L by summing all contributions of k-mer matches of
# lengths k in 1...d, weighted by coefficients beta_k, and
# allowing for a positional tolerance of up to shift s.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,20],[traindat,testdat,22]]
def kernel_weighted_degree_position_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=20):
from modshogun import StringCharFeatures, DNA
from modshogun import WeightedDegreePositionStringKernel, MSG_DEBUG
feats_train=StringCharFeatures(fm_train_dna, DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=WeightedDegreePositionStringKernel(feats_train, feats_train, degree)
from numpy import zeros,ones,float64,int32
kernel.set_shifts(10*ones(len(fm_train_dna[0]), dtype=int32))
kernel.set_position_weights(ones(len(fm_train_dna[0]), dtype=float64))
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('WeightedDegreePositionString')
kernel_weighted_degree_position_string_modular(*parameter_list[0])
# This example shows how to create a Weighted Degree String Kernel from data
# and how to compute the kernel matrix from the resulting object.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindat,testdat,3],[traindat,testdat,20]]
def kernel_weighted_degree_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=20):
from modshogun import StringCharFeatures, DNA
from modshogun import WeightedDegreeStringKernel, MSG_DEBUG
feats_train=StringCharFeatures(fm_train_dna, DNA)
#feats_train.io.set_loglevel(MSG_DEBUG)
feats_test=StringCharFeatures(fm_test_dna, DNA)
kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)
from numpy import arange,double
weights=arange(1,degree+1,dtype=double)[::-1]/ \
sum(arange(1,degree+1,dtype=double))
kernel.set_wd_weights(weights)
#from numpy import ones,float64,int32
#kernel.set_position_weights(ones(len(fm_train_dna[0]), dtype=float64))
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
#this is how to serialize the kernel
#import pickle
#pickle.dump(kernel, file('tmp/kernel_obj.dump','w'), protocol=2)
#k=pickle.load(file('tmp/kernel_obj.dump','r'))
return km_train, km_test, kernel
if __name__=='__main__':
print('WeightedDegreeString')
kernel_weighted_degree_string_modular(*parameter_list[0])
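# The weights above form a decreasing, normalized sequence: for degree=3 they
# are [3,2,1]/6 = [0.5, 0.3333..., 0.1666...], so shorter k-mer matches receive
# the largest weight. A quick check:
from numpy import arange, double
d=3
print(arange(1, d+1, dtype=double)[::-1]/sum(arange(1, d+1, dtype=double)))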
#!/usr/bin/env python
parameter_list=[[]]
def labels_io_modular():
from modshogun import RegressionLabels, CSVFile
lab=RegressionLabels()
f=CSVFile("../data/label_train_regression.dat","r")
f.set_delimiter(" ")
lab.load(f)
#print lab.get_labels()
return lab
if __name__=='__main__':
print('Labels IO')
labels_io_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import *
from modshogun import *
x=array([[20.0,15,15],[10,20,20]])
y=array([[21.0,21,18],[19,19,22]])
z=array([[15.0,27,18],[32,5,23]])
parameter_list = [[x,concatenate((x,y,z),1)]]
def library_fisher2x3_modular (table, tables):
pval=Statistics_fishers_exact_test_for_2x3_table(table)
pvals=Statistics_fishers_exact_test_for_multiple_2x3_tables(tables)
return (pval,pvals)
if __name__=='__main__':
print('Fisher 2x3')
library_fisher2x3_modular(*parameter_list[0])
#!/usr/bin/env python
import time
from modshogun import Time
parameter_list = [[5],[1.0]]
def library_time (sleep_secs):
# measure wall clock time difference
t=Time()
time.sleep(sleep_secs)
diff=t.cur_time_diff()
# measure CPU time required
cpu_diff=t.cur_runtime_diff_sec()
# wall clock time should be above sleep_secs
# but cpu time should be tiny
#print diff, cpu_diff
return diff>sleep_secs, cpu_diff<0.5
if __name__=='__main__':
print('Time')
library_time(*parameter_list[0])
#!/usr/bin/env python
import numpy
from scipy.io import mmread
# Loading an example sparse matrix of dimension 479x479, real, unsymmetric
mtx=mmread('../../../data/logdet/west0479.mtx')
parameter_list=[[mtx,6000,10]]
def mathematics_linsolver_cg (matrix=mtx,max_iter=1000,seed=10):
# Create a Hermitian sparse matrix
from scipy.sparse import eye
rows=matrix.shape[0]
cols=matrix.shape[1]
A=matrix.transpose()*matrix+eye(rows, cols)
# Create a random vector (b) of the system Ax=b
numpy.random.seed(seed)
b=numpy.array(numpy.random.randn(rows))
# create linear system with linear operator and vector
from scipy.sparse import csc_matrix
try:
from shogun.Mathematics import RealSparseMatrixOperator
from shogun.Mathematics import ConjugateGradientSolver
op=RealSparseMatrixOperator(A.tocsc())
solver=ConjugateGradientSolver()
# set the iteration limit higher for poorly conditioned matrices
solver.set_iteration_limit(max_iter)
x=solver.solve(op, b)
# verifying the solution via direct solving
from scipy.sparse.linalg import spsolve
y=spsolve(A,b)
print(numpy.linalg.norm(x-y))
return x
except ImportError:
print('Shogun not installed with Eigen3!')
if __name__=='__main__':
print('CG')
mathematics_linsolver_cg (*parameter_list[0])
#!/usr/bin/env python
from numpy import *
from scipy.io import mmread
# Loading an example sparse matrix of dimension 479x479, real, unsymmetric
mtx=mmread('../../../data/logdet/west0479.mtx')
parameter_list=[[mtx,100,60,1]]
def mathematics_logdet (matrix=mtx,max_iter_eig=1000,max_iter_lin=1000,num_samples=1):
from scipy.sparse import eye
# Create a Hermitian sparse matrix
rows=matrix.shape[0]
cols=matrix.shape[1]
A=matrix.transpose()*matrix+eye(rows, cols)
from scipy.sparse import csc_matrix
try:
from shogun.Mathematics import RealSparseMatrixOperator
from shogun.Mathematics import LanczosEigenSolver
from shogun.Mathematics import CGMShiftedFamilySolver
from shogun.Mathematics import LogRationalApproximationCGM
from shogun.Mathematics import ProbingSampler
from shogun.Mathematics import LogDetEstimator
from shogun.Mathematics import Statistics
from shogun.Library import SerialComputationEngine
# creating the linear operator, eigen-solver
op=RealSparseMatrixOperator(A.tocsc())
eig_solver=LanczosEigenSolver(op)
# we can set the iteration limit high for poorly conditioned matrices
eig_solver.set_max_iteration_limit(max_iter_eig)
# alternatively, if the matrix is small, we can compute eigenvalues externally
# and set min/max eigenvalues into the eigensolver
# from scipy.sparse.linalg import eigsh
# eigenvalues=eigsh(A, rows-1)
# eig_solver.set_min_eigenvalue(eigenvalues[0][0])
# eig_solver.set_max_eigenvalue(eigenvalues[0][-1])
# create the shifted-family linear solver which solves for all the shifts
# using as many matrix-vector products as one shift in CG iterations
lin_solver=CGMShiftedFamilySolver()
lin_solver.set_iteration_limit(max_iter_lin)
# computation engine
engine=SerialComputationEngine()
# set the desired accuracy tighter to obtain better results
# this determines the number of contour points in the conformal mapping of
# the rational approximation of Cauchy's integral of f(A)*s, f=log
desired_accuracy=1E-5
# creating the log-linear-operator function
op_func=LogRationalApproximationCGM(op, engine, eig_solver, lin_solver,\
desired_accuracy)
# set the trace sampler to be probing sampler, in which samples are obtained
# by greedy graph coloring of the power of sparse matrix (default is power=1,
# 2-distance coloring)
trace_sampler=ProbingSampler(op)
# estimating log-det
log_det_estimator=LogDetEstimator(trace_sampler, op_func, engine)
# set the number of samples as required
estimates=log_det_estimator.sample(num_samples)
estimated_logdet=sum(estimates)/len(estimates)
actual_logdet=Statistics.log_det(A)
print(actual_logdet, estimated_logdet)
return estimates
except ImportError:
print('One or more of the dependencies (Eigen3/LAPACK/ColPack) not found!')
if __name__=='__main__':
print('LogDetEstimator')
mathematics_logdet (*parameter_list[0])
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')
parameter_list = [[data,0.0],[data,1.0]]
def mathematics_sparseinversecovariance_modular (data,lc):
try:
from modshogun import SparseInverseCovariance
except ImportError:
print("SparseInverseCovariance not available")
exit(0)
from numpy import dot
sic = SparseInverseCovariance()
S = dot(data,data.T)
Si = sic.estimate(S,lc)
return Si
if __name__=='__main__':
print('SparseInverseCovariance')
mathematics_sparseinversecovariance_modular(*parameter_list[0])
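# Note (an assumption for orientation, not stated in the example): S=dot(data,data.T)
# plays the role of an unnormalized empirical covariance matrix and 'lc' acts as
# the sparsity regularizer; lc=0.0 yields an essentially dense estimate, while
# larger values drive off-diagonal entries of the estimated inverse covariance
# towards zero.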
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
parameter_list = [[traindat,testdat,label_traindat,3]]
def metric_lmnn_modular(train_fname=traindat,test_fname=testdat,label_train_fname=label_traindat,k=3):
try:
from modshogun import RealFeatures,MulticlassLabels,LMNN,KNN,CSVFile
except ImportError:
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(CSVFile(train_fname))
feats_test=RealFeatures(CSVFile(test_fname))
labels=MulticlassLabels(CSVFile(label_train_fname))
# LMNN
lmnn=LMNN(feats_train,labels,k)
lmnn.train()
lmnn_distance=lmnn.get_distance()
# perform classification with KNN
knn=KNN(k,lmnn_distance,labels)
knn.train()
output=knn.apply(feats_test).get_labels()
return lmnn,output
if __name__=='__main__':
print('LMNN')
metric_lmnn_modular(*parameter_list[0])
# In this example we show how to perform Multiple Kernel Learning (MKL)
# with the modular interface. First, we create a number of base kernels.
# These kernels can capture different views of the same features, or actually
# consider entirely different features associated with the same example
# (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample).
# The base kernels are then subsequently added to a CombinedKernel, which
# contains a weight for each kernel and encapsulates the base kernels
# from the training procedure. When the CombinedKernel between two examples is
# evaluated it computes the corresponding linear combination of kernels according to their weights.
# We then show how to create an MKLClassifier that trains an SVM and learns the optimal
# weighting of kernels (w.r.t. a given norm q) at the same time.
# Finally, the example shows how to classify with a trained MKLClassifier.
#
#!/usr/bin/env python
from modshogun import CombinedFeatures, RealFeatures, BinaryLabels
from modshogun import CombinedKernel, PolyKernel, CustomKernel
from modshogun import MKLClassification
from tools.load import LoadMatrix
lm=LoadMatrix()
# only run the example if SVMLight is included, as the LibSVM solver crashes in MKLClassification
try:
from modshogun import SVMLight
except ImportError:
print("SVMLight not available")
exit(0)
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat],[traindat,testdat,label_traindat]]
# fm_train_real.shape
# fm_test_real.shape
# combined_custom()
def mkl_binclass_modular (fm_train_real=traindat,fm_test_real=testdat,fm_label_twoclass = label_traindat):
##################################
# set up and train
# create some poly train/test matrix
tfeats = RealFeatures(fm_train_real)
tkernel = PolyKernel(10,3)
tkernel.init(tfeats, tfeats)
K_train = tkernel.get_kernel_matrix()
pfeats = RealFeatures(fm_test_real)
tkernel.init(tfeats, pfeats)
K_test = tkernel.get_kernel_matrix()
# create combined train features
feats_train = CombinedFeatures()
feats_train.append_feature_obj(RealFeatures(fm_train_real))
# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(CustomKernel(K_train))
kernel.append_kernel(PolyKernel(10,2))
kernel.init(feats_train, feats_train)
# train mkl
labels = BinaryLabels(fm_label_twoclass)
mkl = MKLClassification()
# which norm to use for MKL
mkl.set_mkl_norm(1) #2,3
# set cost (neg, pos)
mkl.set_C(1, 1)
# set kernel and labels
mkl.set_kernel(kernel)
mkl.set_labels(labels)
# train
mkl.train()
#w=kernel.get_subkernel_weights()
#kernel.set_subkernel_weights(w)
##################################
# test
# create combined test features
feats_pred = CombinedFeatures()
feats_pred.append_feature_obj(RealFeatures(fm_test_real))
# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(CustomKernel(K_test))
kernel.append_kernel(PolyKernel(10, 2))
kernel.init(feats_train, feats_pred)
# and classify
mkl.set_kernel(kernel)
mkl.apply()
return mkl.apply(),kernel
if __name__=='__main__':
mkl_binclass_modular (*parameter_list[0])
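# A short usage sketch: the predictions and the learned subkernel weights can be
# read off the returned objects (assumes the data files above are present):
out,used_kernel=mkl_binclass_modular(*parameter_list[0])
print(used_kernel.get_subkernel_weights())
print(out.get_labels())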
# In this example we show how to perform Multiple Kernel Learning (MKL)
# with the modular interface for multi-class classification.
# First, we create a number of base kernels and features.
# These kernels can capture different views of the same features, or actually
# consider entirely different features associated with the same example
# (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample).
# The base kernels are then subsequently added to a CombinedKernel, which
# contains a weight for each kernel and encapsulates the base kernels
# from the training procedure. When the CombinedKernel between two examples is
# evaluated it computes the corresponding linear combination of kernels according to their weights.
# We then show how to create an MKLMultiClass classifier that trains an SVM and learns the optimal
# weighting of kernels (w.r.t. a given norm q) at the same time. The main difference to the binary
# classification version of MKL is that we can use more than two values as labels when training
# the classifier.
# Finally, the example shows how to classify with a trained MKLMultiClass classifier.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm = LoadMatrix()
fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')
parameter_list=[
[ fm_train_real, fm_test_real, label_train_multiclass, 1.2, 1.2, 1e-5, 1, 0.001, 1.5],
[ fm_train_real, fm_test_real, label_train_multiclass, 5, 1.2, 1e-2, 1, 0.001, 2]]
def mkl_multiclass_modular (fm_train_real, fm_test_real, label_train_multiclass,
width, C, epsilon, num_threads, mkl_epsilon, mkl_norm):
from modshogun import CombinedFeatures, RealFeatures, MulticlassLabels
from modshogun import CombinedKernel, GaussianKernel, LinearKernel,PolyKernel
from modshogun import MKLMulticlass
kernel = CombinedKernel()
feats_train = CombinedFeatures()
feats_test = CombinedFeatures()
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = GaussianKernel(10, width)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = LinearKernel()
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
subkfeats_train = RealFeatures(fm_train_real)
subkfeats_test = RealFeatures(fm_test_real)
subkernel = PolyKernel(10,2)
feats_train.append_feature_obj(subkfeats_train)
feats_test.append_feature_obj(subkfeats_test)
kernel.append_kernel(subkernel)
kernel.init(feats_train, feats_train)
labels = MulticlassLabels(label_train_multiclass)
mkl = MKLMulticlass(C, kernel, labels)
mkl.set_epsilon(epsilon)
mkl.parallel.set_num_threads(num_threads)
mkl.set_mkl_epsilon(mkl_epsilon)
mkl.set_mkl_norm(mkl_norm)
mkl.train()
kernel.init(feats_train, feats_test)
out = mkl.apply().get_labels()
return out
if __name__ == '__main__':
print('mkl_multiclass')
mkl_multiclass_modular(*parameter_list[0])
#!/usr/bin/env python
#
# This program is free software you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
from numpy import array
from numpy import random
import math
from modshogun import CrossValidation, CrossValidationResult
from modshogun import ContingencyTableEvaluation, ACCURACY
from modshogun import StratifiedCrossValidationSplitting
from modshogun import BinaryLabels
from modshogun import RealFeatures
from modshogun import GaussianKernel, PowerKernel
from modshogun import LibSVM
from modshogun import MinkowskiMetric
from modshogun import GridSearchModelSelection
from modshogun import ModelSelectionParameters, R_EXP, R_LINEAR
from modshogun import ParameterCombination
from modshogun import Math
def create_param_tree():
root=ModelSelectionParameters()
c1=ModelSelectionParameters("C1")
root.append_child(c1)
c1.build_values(-1.0, 1.0, R_EXP)
c2=ModelSelectionParameters("C2")
root.append_child(c2)
c2.build_values(-1.0, 1.0, R_EXP)
gaussian_kernel=GaussianKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#gaussian_kernel.print_modsel_params()
param_gaussian_kernel=ModelSelectionParameters("kernel", gaussian_kernel)
gaussian_kernel_width=ModelSelectionParameters("log_width")
gaussian_kernel_width.build_values(-math.log(2.0), 0.0, R_EXP, 1.0, 2.0)
param_gaussian_kernel.append_child(gaussian_kernel_width)
root.append_child(param_gaussian_kernel)
power_kernel=PowerKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#power_kernel.print_modsel_params()
param_power_kernel=ModelSelectionParameters("kernel", power_kernel)
root.append_child(param_power_kernel)
param_power_kernel_degree=ModelSelectionParameters("degree")
param_power_kernel_degree.build_values(1.0, 2.0, R_LINEAR)
param_power_kernel.append_child(param_power_kernel_degree)
metric=MinkowskiMetric(10)
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#metric.print_modsel_params()
param_power_kernel_metric1=ModelSelectionParameters("distance", metric)
param_power_kernel.append_child(param_power_kernel_metric1)
param_power_kernel_metric1_k=ModelSelectionParameters("k")
param_power_kernel_metric1_k.build_values(1.0, 2.0, R_LINEAR)
param_power_kernel_metric1.append_child(param_power_kernel_metric1_k)
return root
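# As an illustration (assuming R_EXP's defaults of step 1.0 and base 2.0):
# build_values(-1.0, 1.0, R_EXP) enumerates 2^-1, 2^0, 2^1 = 0.5, 1.0, 2.0,
# while build_values(1.0, 2.0, R_LINEAR) enumerates 1.0 and 2.0.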
parameter_list = [[3,20,3]]
def modelselection_grid_search_kernel (num_subsets, num_vectors, dim_vectors):
# init seed for reproducibility
Math.init_random(1)
random.seed(1)
# create some (non-sense) data
matrix=random.rand(dim_vectors, num_vectors)
# create num_vectors random vectors of dimension dim_vectors
features=RealFeatures()
features.set_feature_matrix(matrix)
# create labels, two classes
labels=BinaryLabels(num_vectors)
for i in range(num_vectors):
labels.set_label(i, 1 if i%2==0 else -1)
# create svm
classifier=LibSVM()
# splitting strategy
splitting_strategy=StratifiedCrossValidationSplitting(labels, num_subsets)
# accuracy evaluation
evaluation_criterion=ContingencyTableEvaluation(ACCURACY)
# cross validation class for evaluation in model selection
cross=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterion)
cross.set_num_runs(1)
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#classifier.print_modsel_params()
# model parameter selection
param_tree=create_param_tree()
#param_tree.print_tree()
grid_search=GridSearchModelSelection(cross, param_tree)
print_state=False
best_combination=grid_search.select_model(print_state)
#print("best parameter(s):")
#best_combination.print_tree()
best_combination.apply_to_machine(classifier)
# larger number of runs to have less variance
cross.set_num_runs(10)
result=cross.evaluate()
casted=CrossValidationResult.obtain_from_generic(result)
#print "result mean:", casted.mean
return classifier,result,casted.mean
if __name__=='__main__':
print('ModelselectionGridSearchKernel')
modelselection_grid_search_kernel(*parameter_list[0])
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2], \
[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2]]
def modelselection_grid_search_krr_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat,\
width=2.1,C=1,epsilon=1e-5,tube_epsilon=1e-2):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import MeanSquaredError
from modshogun import CrossValidationSplitting
from modshogun import RegressionLabels
from modshogun import RealFeatures
from modshogun import KernelRidgeRegression
from modshogun import GridSearchModelSelection
from modshogun import ModelSelectionParameters
# training data
features_train=RealFeatures(fm_train)
features_test=RealFeatures(fm_test)
# labels
labels=RegressionLabels(label_train)
# predictor (the initial tau doesn't matter; it is chosen by model selection below)
predictor=KernelRidgeRegression()
# splitting strategy for 5-fold cross-validation (for classification it is
# better to use StratifiedCrossValidationSplitting; for regression the
# standard CrossValidationSplitting used here is appropriate)
splitting_strategy=CrossValidationSplitting(labels, 5)
# evaluation method
evaluation_criterium=MeanSquaredError()
# cross-validation instance
cross_validation=CrossValidation(predictor, features_train, labels,
splitting_strategy, evaluation_criterium)
# (optional) repeat x-val (set larger to get better estimates)
cross_validation.set_num_runs(2)
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#predictor.print_modsel_params()
# build parameter tree to select regularization parameter
param_tree_root=create_param_tree()
# model selection instance
model_selection=GridSearchModelSelection(cross_validation, param_tree_root)
# perform model selection with selected methods
#print "performing model selection of"
#print "parameter tree:"
#param_tree_root.print_tree()
#print "starting model selection"
# print the current parameter combination (nothing is printed if there are no parameters)
print_state=False
best_parameters=model_selection.select_model(print_state)
# print best parameters
#print "best parameters:"
#best_parameters.print_tree()
# apply them and print result
best_parameters.apply_to_machine(predictor)
result=cross_validation.evaluate()
#print "mean:", result.mean
# creates all the parameters to optimize
def create_param_tree():
from modshogun import ModelSelectionParameters, R_EXP, R_LINEAR
from modshogun import ParameterCombination
from modshogun import GaussianKernel, PolyKernel
import math
root=ModelSelectionParameters()
tau=ModelSelectionParameters("tau")
root.append_child(tau)
# also R_LINEAR/R_LOG is available as type
min=-1
max=1
type=R_EXP
step=1.5
base=2
tau.build_values(min, max, type, step, base)
# gaussian kernel with width
gaussian_kernel=GaussianKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#gaussian_kernel.print_modsel_params()
param_gaussian_kernel=ModelSelectionParameters("kernel", gaussian_kernel)
gaussian_kernel_width=ModelSelectionParameters("log_width")
gaussian_kernel_width.build_values(2.0*math.log(2.0), 2.5*math.log(2.0), R_LINEAR, 1.0)
param_gaussian_kernel.append_child(gaussian_kernel_width)
root.append_child(param_gaussian_kernel)
# polynomial kernel with degree
poly_kernel=PolyKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; simply write to the mailing list
#poly_kernel.print_modsel_params()
param_poly_kernel=ModelSelectionParameters("kernel", poly_kernel)
root.append_child(param_poly_kernel)
# note that integers are used here
param_poly_kernel_degree=ModelSelectionParameters("degree")
param_poly_kernel_degree.build_values(1, 2, R_LINEAR)
param_poly_kernel.append_child(param_poly_kernel_degree)
return root
if __name__=='__main__':
print('ModelselectionGridSearchKRR')
modelselection_grid_search_krr_modular(*parameter_list[0])
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2011 Heiko Strathmann
# Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
#
from numpy.random import randn
from numpy import *
# generate some overlapping training vectors
num_vectors=100
vec_distance=1
traindat=concatenate((randn(2,num_vectors)-vec_distance,
randn(2,num_vectors)+vec_distance), axis=1)
label_traindat=concatenate((-ones(num_vectors), ones(num_vectors)))
parameter_list = [[traindat,label_traindat]]
def modelselection_grid_search_liblinear_modular (traindat=traindat, label_traindat=label_traindat):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import ContingencyTableEvaluation, ACCURACY
from modshogun import StratifiedCrossValidationSplitting
from modshogun import GridSearchModelSelection
from modshogun import ModelSelectionParameters, R_EXP
from modshogun import ParameterCombination
from modshogun import BinaryLabels
from modshogun import RealFeatures
from modshogun import LibLinear, L2R_L2LOSS_SVC
# build parameter tree to select C1 and C2
param_tree_root=ModelSelectionParameters()
c1=ModelSelectionParameters("C1");
param_tree_root.append_child(c1)
c1.build_values(-1.0, 0.0, R_EXP);
c2=ModelSelectionParameters("C2");
param_tree_root.append_child(c2);
c2.build_values(-1.0, 0.0, R_EXP);
# training data
features=RealFeatures(traindat)
labels=BinaryLabels(label_traindat)
# classifier
classifier=LibLinear(L2R_L2LOSS_SVC)
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#classifier.print_modsel_params()
# splitting strategy for cross-validation
splitting_strategy=StratifiedCrossValidationSplitting(labels, 10)
# evaluation method
evaluation_criterium=ContingencyTableEvaluation(ACCURACY)
# cross-validation instance
cross_validation=CrossValidation(classifier, features, labels,
splitting_strategy, evaluation_criterium)
cross_validation.set_autolock(False)
# model selection instance
model_selection=GridSearchModelSelection(cross_validation, param_tree_root)
# perform model selection with selected methods
#print "performing model selection of"
#param_tree_root.print_tree()
best_parameters=model_selection.select_model()
# print best parameters
#print "best parameters:"
#best_parameters.print_tree()
# apply them and print result
best_parameters.apply_to_machine(classifier)
result=cross_validation.evaluate()
#result.print_result()
if __name__=='__main__':
print('ModelSelectionGridSearchLibLinear')
modelselection_grid_search_liblinear_modular(*parameter_list[0])
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2012 Heiko Strathmann
# Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society
#
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2], \
[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2]]
def modelselection_grid_search_libsvr_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat,\
width=2.1,C=1,epsilon=1e-5,tube_epsilon=1e-2):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import MeanSquaredError
from modshogun import CrossValidationSplitting
from modshogun import RegressionLabels
from modshogun import RealFeatures
from modshogun import GaussianKernel
from modshogun import LibSVR
from modshogun import GridSearchModelSelection
from modshogun import ModelSelectionParameters, R_EXP
from modshogun import ParameterCombination
# training data
features_train=RealFeatures(fm_train)
labels=RegressionLabels(label_train)
# kernel
kernel=GaussianKernel(features_train, features_train, width)
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#kernel.print_modsel_params()
# predictor
predictor=LibSVR(C, tube_epsilon, kernel, labels)
predictor.set_epsilon(epsilon)
# splitting strategy for 5-fold cross-validation (for classification it is
# better to use "StratifiedCrossValidationSplitting", but the standard
# "CrossValidationSplitting" is used here since this is a regression task)
splitting_strategy=CrossValidationSplitting(labels, 5)
# evaluation method
evaluation_criterium=MeanSquaredError()
# cross-validation instance
cross_validation=CrossValidation(predictor, features_train, labels,
splitting_strategy, evaluation_criterium)
# (optional) repeat x-val (set larger to get better estimates)
cross_validation.set_num_runs(2)
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#predictor.print_modsel_params()
# build parameter tree to select C1 and C2
param_tree_root=ModelSelectionParameters()
c1=ModelSelectionParameters("C1");
param_tree_root.append_child(c1)
c1.build_values(-1.0, 0.0, R_EXP);
c2=ModelSelectionParameters("C2");
param_tree_root.append_child(c2);
c2.build_values(-1.0, 0.0, R_EXP);
# model selection instance
model_selection=GridSearchModelSelection(cross_validation, param_tree_root)
# perform model selection with selected methods
#print "performing model selection of"
#print "parameter tree"
#param_tree_root.print_tree()
#print "starting model selection"
# whether to print each parameter combination as it is evaluated; if False nothing is printed
print_state=False
# lock the data beforehand, since model selection will not change the kernel
# matrix (use with care); this avoids recomputing the kernel matrix in every
# iteration of the model search
predictor.data_lock(labels, features_train)
best_parameters=model_selection.select_model(print_state)
# print best parameters
#print "best parameters:"
#best_parameters.print_tree()
# apply them and print result
best_parameters.apply_to_machine(predictor)
result=cross_validation.evaluate()
#print "mean:", result.mean
if __name__=='__main__':
print('ModelselectionGridSearchLibSVR')
modelselection_grid_search_libsvr_modular(*parameter_list[0])
# In this example a complex model parameter selection tree
# is constructed
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2011-2012 Heiko Strathmann
# Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
#
parameter_list=[[None]]
def modelselection_parameter_tree_modular (dummy):
from modshogun import ParameterCombination
from modshogun import ModelSelectionParameters, R_EXP, R_LINEAR
from modshogun import PowerKernel
from modshogun import GaussianKernel
from modshogun import DistantSegmentsKernel
from modshogun import MinkowskiMetric
import math
root=ModelSelectionParameters()
combinations=root.get_combinations()
combinations.get_num_elements()
c=ModelSelectionParameters('C');
root.append_child(c)
c.build_values(1, 11, R_EXP)
power_kernel=PowerKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#power_kernel.print_modsel_params()
param_power_kernel=ModelSelectionParameters('kernel', power_kernel)
root.append_child(param_power_kernel)
param_power_kernel_degree=ModelSelectionParameters('degree')
param_power_kernel_degree.build_values(1, 1, R_EXP)
param_power_kernel.append_child(param_power_kernel_degree)
metric1=MinkowskiMetric(10)
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#metric1.print_modsel_params()
param_power_kernel_metric1=ModelSelectionParameters('distance', metric1)
param_power_kernel.append_child(param_power_kernel_metric1)
param_power_kernel_metric1_k=ModelSelectionParameters('k')
param_power_kernel_metric1_k.build_values(1, 12, R_LINEAR)
param_power_kernel_metric1.append_child(param_power_kernel_metric1_k)
gaussian_kernel=GaussianKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#gaussian_kernel.print_modsel_params()
param_gaussian_kernel=ModelSelectionParameters('kernel', gaussian_kernel)
root.append_child(param_gaussian_kernel)
param_gaussian_kernel_width=ModelSelectionParameters('log_width')
param_gaussian_kernel_width.build_values(0.0, 0.5*math.log(2.0), R_LINEAR)
param_gaussian_kernel.append_child(param_gaussian_kernel_width)
ds_kernel=DistantSegmentsKernel()
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#ds_kernel.print_modsel_params()
param_ds_kernel=ModelSelectionParameters('kernel', ds_kernel)
root.append_child(param_ds_kernel)
param_ds_kernel_delta=ModelSelectionParameters('delta')
param_ds_kernel_delta.build_values(1, 2, R_EXP)
param_ds_kernel.append_child(param_ds_kernel_delta)
param_ds_kernel_theta=ModelSelectionParameters('theta')
param_ds_kernel_theta.build_values(1, 2, R_EXP)
param_ds_kernel.append_child(param_ds_kernel_theta)
# root.print_tree()
combinations=root.get_combinations()
# for i in range(combinations.get_num_elements()):
# params = ParameterCombination.obtain_from_generic(combinations.get_element(i))
# params.print_tree()
return
if __name__=='__main__':
print('ModelSelection ParameterTree')
modelselection_parameter_tree_modular(*parameter_list[0])
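# A hedged back-of-the-envelope count of the combinations such a tree encodes,
# assuming R_EXP/R_LINEAR ranges enumerate inclusive unit steps (base 2 for
# R_EXP) and that every combination pairs one C value with exactly one kernel
# branch. The authoritative number is what
# root.get_combinations().get_num_elements() returns in the example above.
n_C     = 11        # C: exponents 1..11
n_power = 1 * 12    # power kernel: one degree value times k in 1..12
n_gauss = 1         # gaussian kernel: one log_width value in [0, 0.5*log(2)]
n_ds    = 2 * 2     # distant segments kernel: delta times theta, 2 values each
#print('combinations:', n_C * (n_power + n_gauss + n_ds))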
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Copyright (C) 2012 Sergey Lisitsyn
from numpy import *
from numpy.random import randn
# generate some overlapping training vectors
num_vectors=100
vec_distance=1
traindat=concatenate((randn(2,num_vectors)-vec_distance,
randn(2,num_vectors)+vec_distance), axis=1)
label_traindat=concatenate((-ones(num_vectors), ones(num_vectors)));
parameter_list = [[traindat,label_traindat]]
def modelselection_random_search_liblinear_modular (traindat=traindat, label_traindat=label_traindat):
from modshogun import CrossValidation, CrossValidationResult
from modshogun import ContingencyTableEvaluation, ACCURACY
from modshogun import StratifiedCrossValidationSplitting
from modshogun import RandomSearchModelSelection
from modshogun import ModelSelectionParameters, R_EXP
from modshogun import ParameterCombination
from modshogun import BinaryLabels
from modshogun import RealFeatures
from modshogun import LibLinear, L2R_L2LOSS_SVC
# build parameter tree to select C1 and C2
param_tree_root=ModelSelectionParameters()
c1=ModelSelectionParameters("C1");
param_tree_root.append_child(c1)
c1.build_values(-2.0, 2.0, R_EXP);
c2=ModelSelectionParameters("C2");
param_tree_root.append_child(c2);
c2.build_values(-2.0, 2.0, R_EXP);
# training data
features=RealFeatures(traindat)
labels=BinaryLabels(label_traindat)
# classifier
classifier=LibLinear(L2R_L2LOSS_SVC)
# print all parameters available for model selection
# Don't worry if yours is not included; write to the mailing list
#classifier.print_modsel_params()
# splitting strategy for cross-validation
splitting_strategy=StratifiedCrossValidationSplitting(labels, 10)
# evaluation method
evaluation_criterium=ContingencyTableEvaluation(ACCURACY)
# cross-validation instance
cross_validation=CrossValidation(classifier, features, labels,
splitting_strategy, evaluation_criterium)
cross_validation.set_autolock(False)
# model selection instance
model_selection=RandomSearchModelSelection(cross_validation, param_tree_root, 0.5)
# perform model selection with selected methods
#print "performing model selection of"
#param_tree_root.print_tree()
best_parameters=model_selection.select_model()
# print best parameters
#print "best parameters:"
#best_parameters.print_tree()
# apply them and print result
best_parameters.apply_to_machine(classifier)
result=cross_validation.evaluate()
#result.print_result()
if __name__=='__main__':
print('ModelSelectionRandomSearchLibLinear')
modelselection_random_search_liblinear_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
# set both input attributes as not nominal (i.e. continuous)
feattypes = array([False, False])
parameter_list = [[traindat,testdat,label_traindat,feattypes]]
def multiclass_c45classifiertree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
try:
from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
from numpy import random, int32
except ImportError:
print("Could not import Shogun and/or numpy modules")
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(CSVFile(train))
feats_test=RealFeatures(CSVFile(test))
train_labels=MulticlassLabels(CSVFile(labels))
# divide train dataset into training and validation subsets in the ratio 2/3 to 1/3
subset=int32(random.permutation(feats_train.get_num_vectors()))
vsubset=subset[0:subset.size//3]
trsubset=subset[subset.size//3:subset.size]
# C4.5 Tree formation using training subset
train_labels.add_subset(trsubset)
feats_train.add_subset(trsubset)
c=C45ClassifierTree()
c.set_labels(train_labels)
c.set_feature_types(ft)
c.train(feats_train)
train_labels.remove_subset()
feats_train.remove_subset()
# prune tree using validation subset
train_labels.add_subset(vsubset)
feats_train.add_subset(vsubset)
c.prune_tree(feats_train,train_labels)
train_labels.remove_subset()
feats_train.remove_subset()
# Classify test data
output=c.apply_multiclass(feats_test).get_labels()
output_certainty=c.get_certainty_vector()
return c,output,output_certainty
if __name__=='__main__':
print('C45ClassifierTree')
multiclass_c45classifiertree_modular(*parameter_list[0])
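# A short hedged sketch of how the returned certainty vector might be used:
# keep only the test predictions the pruned tree is confident about. Purely
# illustrative numpy code; the 0.8 threshold is an arbitrary assumption and
# not part of the example above.
import numpy
def confident_predictions_sketch(output, output_certainty, threshold=0.8):
    output = numpy.asarray(output)
    certainty = numpy.asarray(output_certainty)
    mask = certainty >= threshold     # boolean mask of confident predictions
    return output[mask], mask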
#!/usr/bin/env python
from numpy import array
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
# set both input attributes as not nominal (i.e. continuous)
feattypes = array([False, False])
parameter_list = [[traindat,testdat,label_traindat,feattypes]]
def multiclass_cartree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
try:
from modshogun import RealFeatures, MulticlassLabels, CSVFile, CARTree, PT_MULTICLASS
except ImportError:
print("Could not import Shogun modules")
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(CSVFile(train))
feats_test=RealFeatures(CSVFile(test))
train_labels=MulticlassLabels(CSVFile(labels))
# CART Tree formation with 5 fold cross-validation pruning
c=CARTree(ft,PT_MULTICLASS,5,True)
c.set_labels(train_labels)
c.train(feats_train)
# Classify test data
output=c.apply_multiclass(feats_test).get_labels()
return c,output
if __name__=='__main__':
print('CARTree')
multiclass_cartree_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array, dtype, int32
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
# set both input attributes as continuous (i.e. type 2)
feattypes = array([2, 2],dtype=int32)
parameter_list = [[traindat,testdat,label_traindat,feattypes]]
def multiclass_chaidtree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
try:
from modshogun import RealFeatures, MulticlassLabels, CSVFile, CHAIDTree
except ImportError:
print("Could not import Shogun modules")
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(CSVFile(train))
feats_test=RealFeatures(CSVFile(test))
train_labels=MulticlassLabels(CSVFile(labels))
# CHAID Tree formation with nominal dependent variable
c=CHAIDTree(0,feattypes,10)
c.set_labels(train_labels)
c.train(feats_train)
# Classify test data
output=c.apply_multiclass(feats_test).get_labels()
return c,output
if __name__=='__main__':
print('CHAIDTree')
multiclass_chaidtree_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array
# create data
train_data = array([[1.0, 2.0, 1.0, 3.0, 1.0, 3.0, 2.0, 2.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 2.0],
[2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0],
[3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 2.0, 1.0, 3.0, 1.0, 2.0, 1.0, 3.0, 1.0, 2.0],
[1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0]])
train_labels = array([1.0, 2.0, 1.0, 3.0, 1.0, 2.0, 2.0, 1.0, 3.0, 1.0, 2.0, 1.0, 3.0, 1.0, 2.0])
test_data = array([[2.0, 2.0, 1.0, 3.0, 3.0],
[2.0, 1.0, 2.0, 1.0, 2.0],
[3.0, 2.0, 1.0, 3.0, 2.0],
[1.0, 2.0, 1.0, 2.0, 1.0]])
parameter_list = [[train_data, train_labels, test_data]]
def multiclass_id3classifiertree_modular(train=train_data,labels=train_labels,test=test_data):
try:
from modshogun import RealFeatures, MulticlassLabels, ID3ClassifierTree
except ImportError:
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(train)
feats_test=RealFeatures(test)
feats_labels=MulticlassLabels(labels)
# ID3 Tree formation
id3=ID3ClassifierTree()
id3.set_labels(feats_labels)
id3.train(feats_train)
# Classify test data
output=id3.apply_multiclass(feats_test).get_labels()
return id3,output
if __name__=='__main__':
print('ID3ClassifierTree')
multiclass_id3classifiertree_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
# set both input attributes as not nominal (i.e. continuous)
feattypes = array([False, False])
parameter_list = [[traindat,testdat,label_traindat,feattypes]]
def multiclass_randomforest_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
try:
from modshogun import RealFeatures, MulticlassLabels, CSVFile, RandomForest, MajorityVote
except ImportError:
print("Could not import Shogun modules")
return
# wrap features and labels into Shogun objects
feats_train=RealFeatures(CSVFile(train))
feats_test=RealFeatures(CSVFile(test))
train_labels=MulticlassLabels(CSVFile(labels))
# Random Forest formation
rand_forest=RandomForest(feats_train,train_labels,20,1)
rand_forest.set_feature_types(ft)
rand_forest.set_combination_rule(MajorityVote())
rand_forest.train()
# Classify test data
output=rand_forest.apply_multiclass(feats_test).get_labels()
return rand_forest,output
if __name__=='__main__':
print('RandomForest')
multiclass_randomforest_modular(*parameter_list[0])
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')
parameter_list = [[data, 20], [data, 30]]
def preprocessor_dimensionreductionpreprocessor_modular (data, k):
from modshogun import RealFeatures
from modshogun import DimensionReductionPreprocessor
try:
from modshogun import LocallyLinearEmbedding
except ImportError:
print("LocallyLinearEmbedding not available")
exit(0)
features = RealFeatures(data)
converter = LocallyLinearEmbedding()
converter.set_k(k)
preprocessor = DimensionReductionPreprocessor(converter)
preprocessor.init(features)
preprocessor.apply_to_feature_matrix(features)
return features
if __name__=='__main__':
print('DimensionReductionPreprocessor')
preprocessor_dimensionreductionpreprocessor_modular(*parameter_list[0])
#!/usr/bin/env python
from tools.load import LoadMatrix
from modshogun import *
lm=LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')
labels = lm.load_numbers('../data/label_train_multiclass.dat')
parameter_list = [[data, labels, CANVAR_FLDA], [data, labels, CLASSIC_FLDA]]
def preprocessor_fisherlda_modular (data, labels, method):
from modshogun import RealFeatures, MulticlassLabels, CANVAR_FLDA
from modshogun import FisherLda
sg_features = RealFeatures(data)
sg_labels = MulticlassLabels(labels)
preprocessor=FisherLda(method)
preprocessor.fit(sg_features, sg_labels, 1)
yn=preprocessor.apply_to_feature_matrix(sg_features)
return yn
if __name__=='__main__':
print('FisherLda')
preprocessor_fisherlda_modular(*parameter_list[0])
# In this example toy data is being processed using the kernel PCA algorithm
# as described in
#
# Schölkopf, B., Smola, A. J., & Muller, K. R. (1999).
# Kernel Principal Component Analysis.
# Advances in kernel methods support vector learning, 1327(3), 327-352. MIT Press.
# Retrieved from http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.32.8744
#
# A Gaussian kernel is used for the processing.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')
parameter_list = [[data, 0.01, 1.0], [data, 0.05, 2.0]]
def preprocessor_kernelpca_modular (data, threshold, width):
from modshogun import RealFeatures
from modshogun import KernelPCA
from modshogun import GaussianKernel
features = RealFeatures(data)
kernel = GaussianKernel(features,features,width)
preprocessor = KernelPCA(kernel)
preprocessor.init(features)
preprocessor.set_target_dim(2)
preprocessor.apply_to_feature_matrix(features)
return features
if __name__=='__main__':
print('KernelPCA')
preprocessor_kernelpca_modular(*parameter_list[0])
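# For reference, a compact numpy sketch of the computation kernel PCA performs
# internally: double-center the kernel matrix and project onto its top
# eigenvectors. Scaling and sign conventions are assumptions here and may
# differ from Shogun's implementation.
import numpy
def kernel_pca_sketch(K, target_dim=2):
    n = K.shape[0]
    one = numpy.ones((n, n)) / n
    Kc = K - one.dot(K) - K.dot(one) + one.dot(K).dot(one)  # double-centering
    w, V = numpy.linalg.eigh(Kc)                  # ascending eigenvalues
    idx = numpy.argsort(w)[::-1][:target_dim]     # top components
    return V[:, idx] * numpy.sqrt(numpy.maximum(w[idx], 0.0))  # n x target_dim scores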
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# LogPlusOne adds one to a dense real-valued vector and takes the logarithm of
# each component of it. It is most useful in situations where the inputs are
# counts: when comparing small counts, any difference may matter a lot, while
# the same difference between large counts matters little. This is what the
# log transformation controls for.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat+10,testdat+10,1.4,10],[traindat+10,testdat+10,1.5,10]]
def preprocessor_logplusone_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
from modshogun import Chi2Kernel
from modshogun import RealFeatures
from modshogun import LogPlusOne
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=LogPlusOne()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('LogPlusOne')
preprocessor_logplusone_modular(*parameter_list[0])
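# A tiny numpy illustration of the motivation stated above: after log(1+x), a
# difference between small counts stays visible while the same absolute
# difference between large counts nearly vanishes. Not part of the example
# itself.
import numpy
small = numpy.log1p(numpy.array([1.0, 3.0]))         # counts 1 vs 3
large = numpy.log1p(numpy.array([1001.0, 1003.0]))   # counts 1001 vs 1003
#print(small[1]-small[0], large[1]-large[0])         # ~0.69 vs ~0.002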
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# NormOne normalizes vectors to have norm 1.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.4,10],[traindat,testdat,1.5,10]]
def preprocessor_normone_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
from modshogun import Chi2Kernel
from modshogun import RealFeatures
from modshogun import NormOne
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preprocessor=NormOne()
preprocessor.init(feats_train)
feats_train.add_preprocessor(preprocessor)
feats_train.apply_preprocessor()
feats_test.add_preprocessor(preprocessor)
feats_test.apply_preprocessor()
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('NormOne')
preprocessor_normone_modular(*parameter_list[0])
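# For comparison, a one-line numpy equivalent of what NormOne is described to
# do above: scale every feature vector to unit Euclidean norm. This sketch
# assumes columns are examples, as in RealFeatures.
import numpy
def norm_one_sketch(X):
    norms = numpy.linalg.norm(X, axis=0)
    return X / numpy.where(norms == 0, 1.0, norms)   # guard against zero columns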
# In this example toy data is processed using
# Principal Component Analysis.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')
parameter_list = [[data]]
def preprocessor_pca_modular (data):
from modshogun import RealFeatures
from modshogun import PCA
features = RealFeatures(data)
preprocessor = PCA()
preprocessor.init(features)
preprocessor.apply_to_feature_matrix(features)
return features
if __name__=='__main__':
print('PCA')
preprocessor_pca_modular(*parameter_list[0])
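# A compact numpy sketch of classic PCA as the preprocessor is described to
# perform it: center the data, eigendecompose the covariance and project onto
# the top axes. Component count and sign conventions are assumptions here.
import numpy
def pca_sketch(X, n_components=2):
    Xc = X - X.mean(axis=1, keepdims=True)    # center each feature (row)
    C = Xc.dot(Xc.T) / (X.shape[1] - 1)       # feature covariance matrix
    w, V = numpy.linalg.eigh(C)
    idx = numpy.argsort(w)[::-1][:n_components]
    return V[:, idx].T.dot(Xc)                # projected examples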
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# PruneVarSubMean subtracts the mean from each feature and removes features that
# have zero variance.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.5,10],[traindat,testdat,1.5,10]]
def preprocessor_prunevarsubmean_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
from modshogun import Chi2Kernel
from modshogun import RealFeatures
from modshogun import PruneVarSubMean
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=PruneVarSubMean()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('PruneVarSubMean')
preprocessor_prunevarsubmean_modular(*parameter_list[0])
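# A numpy sketch of the PruneVarSubMean behaviour described above: subtract the
# per-feature mean and drop zero-variance features. Rows are features and
# columns are examples, matching RealFeatures; testing for exactly zero
# variance is an assumption.
import numpy
def prune_var_sub_mean_sketch(X):
    Xc = X - X.mean(axis=1, keepdims=True)
    keep = Xc.var(axis=1) > 0                 # features with nonzero variance
    return Xc[keep, :]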
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
parameter_list = [[traindat,testdat,1.5,10],[traindat,testdat,1.5,10]]
from modshogun import Math_init_random;
Math_init_random(12345);
def preprocessor_randomfouriergausspreproc_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
from modshogun import Chi2Kernel
from modshogun import RealFeatures
from modshogun import RandomFourierGaussPreproc
feats_train=RealFeatures(fm_train_real)
feats_test=RealFeatures(fm_test_real)
preproc=RandomFourierGaussPreproc()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('RandomFourierGaussPreproc')
preprocessor_randomfouriergausspreproc_modular(*parameter_list[0])
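# A hedged numpy sketch of the random Fourier feature idea behind this
# preprocessor (Rahimi & Recht): with Gaussian W, the map
# z(x) = sqrt(2/D) * cos(W^T x + b) satisfies z(x).z(y) ~ exp(-||x-y||^2/(2*sigma^2)).
# Shogun's exact parametrization and default dimensionality are not reproduced.
import numpy
def rff_sketch(X, D=100, sigma=1.0, seed=0):
    rng = numpy.random.RandomState(seed)
    W = rng.randn(X.shape[0], D) / sigma          # frequencies ~ N(0, 1/sigma^2)
    b = rng.uniform(0, 2*numpy.pi, size=D)        # random phases
    return numpy.sqrt(2.0/D) * numpy.cos(W.T.dot(X) + b[:, None])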
# In this example a kernel matrix is computed for a given string data set. The
# CommUlongString kernel is used to compute the spectrum kernel from strings that
# have been mapped into unsigned 64bit integers. These 64bit integers correspond
# to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted.
# This is done using the SortUlongString preprocessor, which sorts the individual
# strings in ascending order. The kernel function basically uses the algorithm in
# the unix "comm" command (hence the name). Note that this representation enables
# spectrum kernels of order 8 for 8bit alphabets (like binaries) and order 32 for
# 2-bit alphabets like DNA. For this kernel the linadd speedups are implemented
# (though there is room for improvement here when a whole set of sequences is
# ADDed) using sorted lists.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindna,testdna,4,0,False,False],[traindna,testdna,3,0,False,False]]
def preprocessor_sortulongstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False):
from modshogun import CommUlongStringKernel
from modshogun import StringCharFeatures, StringUlongFeatures, DNA
from modshogun import SortUlongString
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_train_dna)
feats_train=StringUlongFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
charfeat=StringCharFeatures(DNA)
charfeat.set_features(fm_test_dna)
feats_test=StringUlongFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortUlongString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('CommUlongString')
preprocessor_sortulongstring_modular(*parameter_list[0])
# In this example a kernel matrix is computed for a given string data set. The
# CommWordString kernel is used to compute the spectrum kernel from strings that
# have been mapped into unsigned 16bit integers. These 16bit integers correspond
# to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted.
# This is done using the SortWordString preprocessor, which sorts the individual
# strings in ascending order. The kernel function basically uses the algorithm in
# the unix "comm" command (hence the name). Note that this representation is
# especially tuned to small alphabets (like the 2-bit alphabet DNA), for which it
# enables spectrum kernels of order up to 8. For this kernel the linadd speedups
# are quite efficiently implemented using direct maps.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
parameter_list = [[traindna,testdna,3,0,False,False],[traindna,testdna,3,0,False,False]]
def preprocessor_sortwordstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False):
from modshogun import CommWordStringKernel
from modshogun import StringCharFeatures, StringWordFeatures, DNA
from modshogun import SortWordString
charfeat=StringCharFeatures(fm_train_dna, DNA)
feats_train=StringWordFeatures(charfeat.get_alphabet())
feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc=SortWordString()
preproc.init(feats_train)
feats_train.add_preprocessor(preproc)
feats_train.apply_preprocessor()
charfeat=StringCharFeatures(fm_test_dna, DNA)
feats_test=StringWordFeatures(charfeat.get_alphabet())
feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
feats_test.add_preprocessor(preproc)
feats_test.apply_preprocessor()
kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
km_train=kernel.get_kernel_matrix()
kernel.init(feats_train, feats_test)
km_test=kernel.get_kernel_matrix()
return km_train,km_test,kernel
if __name__=='__main__':
print('CommWordString')
preprocessor_sortwordstring_modular(*parameter_list[0])
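# To make the "comm"-style computation above concrete, a small pure-Python
# sketch: split strings into sorted k-mer lists, then take the dot product of
# their k-mer count vectors with a two-pointer merge, which is exactly what
# sorting enables. Shogun's integer packing and normalization are omitted.
def sorted_kmers(s, k=3):
    return sorted(s[i:i+k] for i in range(len(s) - k + 1))
def comm_dot(a, b):
    i = j = total = 0
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            kmer, ca, cb = a[i], 0, 0
            while i < len(a) and a[i] == kmer:    # count run in a
                ca += 1; i += 1
            while j < len(b) and b[j] == kmer:    # count run in b
                cb += 1; j += 1
            total += ca * cb
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return total
#print(comm_dot(sorted_kmers("ACGTACGT"), sorted_kmers("ACGTTTTT")))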
#!/usr/bin/env python
from numpy import array
# set the input attribute as not nominal (i.e. continuous)
feattypes = array([False])
parameter_list = [[50,5,15,0.2,feattypes]]
def regression_cartree_modular(num_train=500,num_test=50,x_range=15,noise_var=0.2,ft=feattypes):
try:
from modshogun import RealFeatures, RegressionLabels, CSVFile, CARTree, PT_REGRESSION
from numpy import random
except ImportError:
print("Could not import Shogun and/or numpy modules")
return
random.seed(1)
# form training dataset : y=x with noise
X_train=random.rand(1,num_train)*x_range;
Y_train=X_train+random.randn(num_train)*noise_var
# form test dataset
X_test=array([[float(i)/num_test*x_range for i in range(num_test)]])
# wrap features and labels into Shogun objects
feats_train=RealFeatures(X_train)
feats_test=RealFeatures(X_test)
train_labels=RegressionLabels(Y_train[0])
# CART Tree formation
c=CARTree(ft,PT_REGRESSION,5,True)
c.set_labels(train_labels)
c.train(feats_train)
# Classify test data
output=c.apply_regression(feats_test).get_labels()
return c,output
if __name__=='__main__':
print('CARTree')
regression_cartree_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array, dtype, int32
# set the input attribute as continuous (i.e. type 2)
feattypes = array([2],dtype=int32)
parameter_list = [[500,50,15,0.2,feattypes]]
def regression_chaidtree_modular(num_train=500,num_test=50,x_range=15,noise_var=0.2,ft=feattypes):
try:
from modshogun import RealFeatures, RegressionLabels, CSVFile, CHAIDTree, PT_REGRESSION
from numpy import random
except ImportError:
print("Could not import Shogun and/or numpy modules")
return
random.seed(1)
# form training dataset : y=x with noise
X_train=random.rand(1,num_train)*x_range;
Y_train=X_train+random.randn(num_train)*noise_var
# form test dataset
X_test=array([[float(i)/num_test*x_range for i in range(num_test)]])
# wrap features and labels into Shogun objects
feats_train=RealFeatures(X_train)
feats_test=RealFeatures(X_test)
train_labels=RegressionLabels(Y_train[0])
# CHAID Tree formation
c=CHAIDTree(2,feattypes,50)
c.set_labels(train_labels)
c.train(feats_train)
# Regress on test data
output=c.apply_regression(feats_test).get_labels()
return c,output
if __name__=='__main__':
print('CHAIDTree')
regression_chaidtree_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array, random
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'
# set the input attribute as not nominal (i.e. continuous)
feattypes = array([False])
parameter_list = [[500,50,15,0.2,feattypes]]
def regression_randomforest_modular(num_train=500,num_test=50,x_range=15,noise_var=0.2,ft=feattypes):
try:
from modshogun import RealFeatures, RegressionLabels, CSVFile, RandomForest, MeanRule, PT_REGRESSION
except ImportError:
print("Could not import Shogun modules")
return
random.seed(1)
# form training dataset : y=x with noise
X_train=random.rand(1,num_train)*x_range;
Y_train=X_train+random.randn(num_train)*noise_var
# form test dataset
X_test=array([[float(i)/num_test*x_range for i in range(num_test)]])
# wrap features and labels into Shogun objects
feats_train=RealFeatures(X_train)
feats_test=RealFeatures(X_test)
train_labels=RegressionLabels(Y_train[0])
# Random Forest formation
rand_forest=RandomForest(feats_train,train_labels,20,1)
rand_forest.set_feature_types(ft)
rand_forest.set_machine_problem_type(PT_REGRESSION)
rand_forest.set_combination_rule(MeanRule())
rand_forest.train()
# Regress test data
output=rand_forest.apply_regression(feats_test).get_labels()
return rand_forest,output
if __name__=='__main__':
print('RandomForest')
regression_randomforest_modular(*parameter_list[0])
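# A hedged usage sketch: since the toy data above is y=x plus noise, the
# returned predictions can be compared against the noise-free target. The test
# grid is recomputed exactly as inside the example; this assumes the Shogun
# imports succeeded (otherwise the example returns None).
import numpy
def randomforest_mse_sketch(num_test=50, x_range=15):
    rand_forest, output = regression_randomforest_modular()
    truth = numpy.array([float(i)/num_test*x_range for i in range(num_test)])
    return numpy.mean((numpy.asarray(output) - truth)**2)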
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# SVM^light. The SVR is trained with regularization parameter C=1 and a Gaussian
# kernel with width=1.2. The labels of both the train and the test data are
# fetched via svr.apply().get_labels().
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#!/usr/bin/env python
###########################################################################
# svm light based support vector regression
###########################################################################
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat,1.2,1,1e-5,1e-2,1],[traindat,testdat,label_traindat,2.3,0.5,1e-5,1e-6,1]]
def regression_svrlight_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat, \
width=1.2,C=1,epsilon=1e-5,tube_epsilon=1e-2,num_threads=3):
from modshogun import RegressionLabels, RealFeatures
from modshogun import GaussianKernel
try:
from modshogun import SVRLight
except ImportError:
print('No support for SVRLight available.')
return
feats_train=RealFeatures(fm_train)
feats_test=RealFeatures(fm_test)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=RegressionLabels(label_train)
svr=SVRLight(C, epsilon, kernel, labels)
svr.set_tube_epsilon(tube_epsilon)
svr.parallel.set_num_threads(num_threads)
svr.train()
kernel.init(feats_train, feats_test)
out = svr.apply().get_labels()
return out, kernel
if __name__=='__main__':
print('SVRLight')
regression_svrlight_modular(*parameter_list[0])
# In this example serialization of SVM (Support Vector Machine) is shown
#!/usr/bin/env python
parameter_list=[[10,0.3,2, 1.0, 0.1]]
def check_status(status,suffix):
# silent...
assert status, "ERROR reading/writing status:%s/suffix:%s\n" % (status,suffix)
def serialization_complex_example (num=5, dist=1, dim=10, C=2.0, width=10):
import os
from numpy import concatenate, zeros, ones
from numpy.random import randn, seed
from modshogun import RealFeatures, MulticlassLabels
from modshogun import GMNPSVM
from modshogun import GaussianKernel
from modshogun import SerializableHdf5File,SerializableAsciiFile, \
SerializableJsonFile,SerializableXmlFile,MSG_DEBUG
from modshogun import NormOne, LogPlusOne
seed(17)
data=concatenate((randn(dim, num), randn(dim, num) + dist,
randn(dim, num) + 2*dist,
randn(dim, num) + 3*dist), axis=1)
lab=concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))
feats=RealFeatures(data)
#feats.io.set_loglevel(MSG_DEBUG)
#feats.io.enable_file_and_line()
kernel=GaussianKernel(feats, feats, width)
labels=MulticlassLabels(lab)
svm = GMNPSVM(C, kernel, labels)
feats.add_preprocessor(NormOne())
feats.add_preprocessor(LogPlusOne())
feats.set_preprocessed(1)
svm.train(feats)
bias_ref = svm.get_svm(0).get_bias()
#svm.print_serializable()
fstream = SerializableHdf5File("tmp/blaah.h5", "w")
status = svm.save_serializable(fstream)
check_status(status,'h5')
fstream = SerializableAsciiFile("tmp/blaah.asc", "w")
status = svm.save_serializable(fstream)
check_status(status,'asc')
fstream = SerializableJsonFile("tmp/blaah.json", "w")
status = svm.save_serializable(fstream)
check_status(status,'json')
fstream = SerializableXmlFile("tmp/blaah.xml", "w")
status = svm.save_serializable(fstream)
check_status(status,'xml')
fstream = SerializableHdf5File("tmp/blaah.h5", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status,'h5')
new_svm.train()
bias_h5 = new_svm.get_svm(0).get_bias()
fstream = SerializableAsciiFile("tmp/blaah.asc", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status,'asc')
new_svm.train()
bias_asc = new_svm.get_svm(0).get_bias()
fstream = SerializableJsonFile("tmp/blaah.json", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status,'json')
new_svm.train()
bias_json = new_svm.get_svm(0).get_bias()
fstream = SerializableXmlFile("tmp/blaah.xml", "r")
new_svm=GMNPSVM()
status = new_svm.load_serializable(fstream)
check_status(status,'xml')
new_svm.train()
bias_xml = new_svm.get_svm(0).get_bias()
os.unlink("tmp/blaah.h5")
os.unlink("tmp/blaah.asc")
os.unlink("tmp/blaah.json")
os.unlink("tmp/blaah.xml")
return svm,new_svm, bias_ref, bias_h5, bias_asc, bias_json, bias_xml
if __name__=='__main__':
print('Serialization SVMLight')
serialization_complex_example(*parameter_list[0])
# In this example dense toy features are serialized
#!/usr/bin/env python
from modshogun import *
from numpy import array
import os
parameter_list=[[[[1.0,2,3],[4,5,6]]]]
def serialization_matrix_modular (m):
feats=RealFeatures(array(m))
#feats.io.set_loglevel(0)
fstream = SerializableAsciiFile("tmp/foo.asc", "w")
feats.save_serializable(fstream)
l=MulticlassLabels(array([1.0,2,3]))
fstream = SerializableAsciiFile("tmp/foo2.asc", "w")
l.save_serializable(fstream)
os.unlink("tmp/foo.asc")
os.unlink("tmp/foo2.asc")
if __name__=='__main__':
print('Serialization Matrix Modular')
serialization_matrix_modular(*parameter_list[0])
#!/usr/bin/env python
from modshogun import WeightedDegreeStringKernel, WeightedDegreePositionStringKernel, WeightedCommWordStringKernel
from modshogun import LinearKernel, PolyKernel, GaussianKernel, CombinedKernel, WeightedDegreeRBFKernel
from modshogun import StringCharFeatures, StringWordFeatures, RealFeatures, CombinedFeatures, SortWordString
from modshogun import DNA, PROTEIN, Alphabet, Labels, BinaryLabels, CTaxonomy
from modshogun import MSG_DEBUG
try:
from modshogun import SVMLight
except ImportError:
print("SVMLight is not available")
exit(0)
from numpy import concatenate, ones
from numpy.random import randn, seed
import numpy
import sys
import types
import random
import bz2
import pickle
import inspect
###################################################
# Random Data
###################################################
def generate_random_string(length, number):
"""
generate sample over alphabet
"""
dat = []
alphabet = "AGTC"
for i in range(number):
dat.append("".join([random.choice(alphabet) for j in range(length)]))
return dat
def generate_random_data(number):
"""
create random examples and labels
"""
labels = numpy.array([random.choice([-1.0, 1.0]) for i in range(number)])
examples = numpy.array(generate_random_string(22, number))
return examples, labels
def save(filename, myobj):
"""
save object to file using pickle
@param filename: name of destination file
@type filename: str
@param myobj: object to save (has to be pickleable)
@type myobj: obj
"""
try:
f = bz2.BZ2File(filename, 'wb')
except IOError as details:
sys.stderr.write('File ' + filename + ' cannot be written\n')
sys.stderr.write(str(details))
return
pickle.dump(myobj, f, protocol=2)
f.close()
def load(filename):
"""
Load from filename using pickle
@param filename: name of file to load from
@type filename: str
"""
try:
f = bz2.BZ2File(filename, 'rb')
except IOError as details:
sys.stderr.write('File ' + filename + ' cannot be read\n')
sys.stderr.write(str(details))
return
myobj = pickle.load(f)
f.close()
return myobj
def get_spectrum_features(data, order=3, gap=0, reverse=True):
"""
create feature object used by spectrum kernel
"""
charfeat = StringCharFeatures(data, DNA)
feat = StringWordFeatures(charfeat.get_alphabet())
feat.obtain_from_char(charfeat, order-1, order, gap, reverse)
preproc = SortWordString()
preproc.init(feat)
feat.add_preprocessor(preproc)
feat.apply_preprocessor()
return feat
def get_wd_features(data, feat_type="dna"):
"""
create feature object for wdk
"""
if feat_type == "dna":
feat = StringCharFeatures(DNA)
elif feat_type == "protein":
feat = StringCharFeatures(PROTEIN)
else:
raise Exception("unknown feature type")
feat.set_features(data)
return feat
def construct_features(features):
"""
build combined features from full strings plus their left and right halves
"""
feat_all = [inst for inst in features]
feat_lhs = [inst[0:15] for inst in features]
feat_rhs = [inst[15:] for inst in features]
feat_wd = get_wd_features(feat_all)
feat_spec_1 = get_spectrum_features(feat_lhs, order=3)
feat_spec_2 = get_spectrum_features(feat_rhs, order=3)
feat_comb = CombinedFeatures()
feat_comb.append_feature_obj(feat_wd)
feat_comb.append_feature_obj(feat_spec_1)
feat_comb.append_feature_obj(feat_spec_2)
return feat_comb
parameter_list = [[200, 1, 100]]
def serialization_string_kernels_modular(n_data, num_shifts, size):
"""
serialize svm with string kernels
"""
##################################################
# set up toy data and svm
train_xt, train_lt = generate_random_data(n_data)
test_xt, test_lt = generate_random_data(n_data)
feats_train = construct_features(train_xt)
feats_test = construct_features(test_xt)
max_len = len(train_xt[0])
kernel_wdk = WeightedDegreePositionStringKernel(size, 5)
shifts_vector = numpy.ones(max_len, dtype=numpy.int32)*num_shifts
kernel_wdk.set_shifts(shifts_vector)
########
# set up spectrum
use_sign = False
kernel_spec_1 = WeightedCommWordStringKernel(size, use_sign)
kernel_spec_2 = WeightedCommWordStringKernel(size, use_sign)
########
# combined kernel
kernel = CombinedKernel()
kernel.append_kernel(kernel_wdk)
kernel.append_kernel(kernel_spec_1)
kernel.append_kernel(kernel_spec_2)
# init kernel
labels = BinaryLabels(train_lt);
svm = SVMLight(1.0, kernel, labels)
#svm.io.set_loglevel(MSG_DEBUG)
svm.train(feats_train)
##################################################
# serialize to file
fn = "serialized_svm.bz2"
#print("serializing SVM to file", fn)
save(fn, svm)
##################################################
# unserialize and sanity check
#print("unserializing SVM")
svm2 = load(fn)
#print("comparing predictions")
out = svm.apply(feats_test).get_labels()
out2 = svm2.apply(feats_test).get_labels()
# assert outputs are close
for i in range(len(out)):
assert abs(out[i] - out2[i]) < 0.000001
#print("all checks passed.")
return out,out2
if __name__=='__main__':
serialization_string_kernels_modular(*parameter_list[0])
# This example shows how to serialize/deserialize an SVMLight object using
# Python's pickle module. Note that this code is in alpha state.
#!/usr/bin/env python
parameter_list=[[10, 1, 2.1, 2.0]]
def serialization_svmlight_modular (num, dist, width, C):
from modshogun import MSG_DEBUG
from modshogun import RealFeatures, BinaryLabels, DNA, Alphabet
from modshogun import WeightedDegreeStringKernel, GaussianKernel
try:
from modshogun import SVMLight
except ImportError:
print("SVMLight not available")
exit(0)
from numpy import concatenate, ones
from numpy.random import randn, seed
import sys
import types
import random
import bz2
import pickle
import inspect
def save(filename, myobj):
"""
save object to file using pickle
@param filename: name of destination file
@type filename: str
@param myobj: object to save (has to be pickleable)
@type myobj: obj
"""
try:
f = bz2.BZ2File(filename, 'wb')
except IOError as details:
sys.stderr.write('File ' + filename + ' cannot be written\n')
sys.stderr.write(str(details))
return
pickle.dump(myobj, f, protocol=2)
f.close()
def load(filename):
"""
Load from filename using pickle
@param filename: name of file to load from
@type filename: str
"""
try:
f = bz2.BZ2File(filename, 'rb')
except IOError as details:
sys.stderr.write('File ' + filename + ' cannot be read\n')
sys.stderr.write(str(details))
return
myobj = pickle.load(f)
f.close()
return myobj
##################################################
# set up toy data and svm
traindata_real = concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
testdata_real = concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1);
trainlab = concatenate((-ones(num), ones(num)));
testlab = concatenate((-ones(num), ones(num)));
feats_train = RealFeatures(traindata_real);
feats_test = RealFeatures(testdata_real);
kernel = GaussianKernel(feats_train, feats_train, width);
#kernel.io.set_loglevel(MSG_DEBUG)
labels = BinaryLabels(trainlab);
svm = SVMLight(C, kernel, labels)
svm.train()
#svm.io.set_loglevel(MSG_DEBUG)
##################################################
# serialize to file
fn = "serialized_svm.bz2"
#print("serializing SVM to file", fn)
save(fn, svm)
##################################################
# unserialize and sanity check
#print("unserializing SVM")
svm2 = load(fn)
#print("comparing objectives")
svm2.train()
#print("objective before serialization:", svm.get_objective())
#print("objective after serialization:", svm2.get_objective())
#print("comparing predictions")
out = svm.apply(feats_test).get_labels()
out2 = svm2.apply(feats_test).get_labels()
# assert outputs are close
for i in range(len(out)):
assert abs(out[i] - out2[i]) < 0.000001
#print("all checks passed.")
return True
if __name__=='__main__':
print('Serialization SVMLight')
serialization_svmlight_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy as np
def gen_data(num_classes,num_samples,dim):
np.random.seed(0)
covs = np.array([[[0., -1. ], [2.5, .7]],
[[3., -1.5], [1.2, .3]],
[[ 2, 0 ], [ .0, 1.5 ]]])
X = np.r_[np.dot(np.random.randn(num_samples, dim), covs[0]) + np.array([0, 10]),
np.dot(np.random.randn(num_samples, dim), covs[1]) + np.array([-10, -10]),
np.dot(np.random.randn(num_samples, dim), covs[2]) + np.array([10, -10])];
Y = np.hstack((np.zeros(num_samples), np.ones(num_samples), 2*np.ones(num_samples)))
return X, Y
# Number of classes
M = 3
# Number of samples of each class
N = 50
# Dimension of the data
dim = 2
traindat, label_traindat = gen_data(M,N,dim)
parameter_list = [[traindat,label_traindat]]
def so_multiclass (fm_train_real=traindat,label_train_multiclass=label_traindat):
try:
from modshogun import RealFeatures
from modshogun import MulticlassModel, MulticlassSOLabels, PrimalMosekSOSVM, RealNumber
except ImportError:
print("Mosek not available")
return
labels = MulticlassSOLabels(label_train_multiclass)
features = RealFeatures(fm_train_real.T)
model = MulticlassModel(features, labels)
sosvm = PrimalMosekSOSVM(model, labels)
sosvm.train()
out = sosvm.apply()
count = 0
for i in range(out.get_num_labels()):
yi_pred = RealNumber.obtain_from_generic(out.get_label(i))
if yi_pred.value == label_train_multiclass[i]:
count = count + 1
print("Correct classification rate: %0.2f" % ( 100.0*count/out.get_num_labels() ))
if __name__=='__main__':
print('SO multiclass')
so_multiclass(*parameter_list[0])
# In this example, HSIC, a kernel-based test for independence, is used to detect
# dependence between a mixture of Gaussians and a rotated version of the same data.
# The HSIC statistic is computed and available methods for computing a threshold
# of the null distribution are used. In addition, p-values of the test are
# computed. Note that these methods require more iterations than used here. A
# Gaussian kernel is selected via the median heuristic.
# See tutorial and Class documentation for more details.
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
import numpy as np
from math import pi
parameter_list = [[150,3,3]]
def statistics_hsic (n, difference, angle):
from modshogun import RealFeatures
from modshogun import DataGenerator
from modshogun import GaussianKernel
from modshogun import HSIC
from modshogun import PERMUTATION, HSIC_GAMMA
from modshogun import EuclideanDistance
from modshogun import Statistics, Math
# for reproducible results (the numpy seed might not be reproducible across
# different OS/Python distributions)
Math.init_random(1)
np.random.seed(1)
# note that HSIC has to store full kernel matrices in memory,
# which upper-bounds the feasible sample size
# use data generator class to produce example data
data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
#plot(data[0], data[1], 'x');show()
# create shogun feature representation
features_x=RealFeatures(np.array([data[0]]))
features_y=RealFeatures(np.array([data[1]]))
# compute the median data distance to use as the Gaussian kernel width
# normally 0.5*median_distance (factor two in the Gaussian kernel)
# However, Shogun's kernel width uses a different parametrization,
# therefore 0.5*2*median_distance^2
# Use only a subset of the data (200 elements); the median is stable
subset=np.random.permutation(features_x.get_num_vectors()).astype(np.int32)
subset=subset[0:200]
features_x.add_subset(subset)
dist=EuclideanDistance(features_x, features_x)
distances=dist.get_distance_matrix()
features_x.remove_subset()
median_distance=np.median(distances)
sigma_x=median_distance**2
features_y.add_subset(subset)
dist=EuclideanDistance(features_y, features_y)
distances=dist.get_distance_matrix()
features_y.remove_subset()
median_distance=np.median(distances)
sigma_y=median_distance**2
#print "median distance for Gaussian kernel on x:", sigma_x
#print "median distance for Gaussian kernel on y:", sigma_y
kernel_x=GaussianKernel(10,sigma_x)
kernel_y=GaussianKernel(10,sigma_y)
hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05 using different methods to approximate
# null-distribution
statistic=hsic.compute_statistic()
#print "HSIC:", statistic
alpha=0.05
#print "computing p-value using sampling null"
hsic.set_null_approximation_method(PERMUTATION)
# normally, at least 250 iterations should be done, but that takes long
hsic.set_num_null_samples(100)
# sampling null allows usage of unbiased or biased statistic
p_value_boot=hsic.compute_p_value(statistic)
thresh_boot=hsic.compute_threshold(alpha)
#print "p_value:", p_value_boot
#print "threshold for 0.05 alpha:", thresh_boot
#print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_boot<alpha
#print "computing p-value using gamma method"
hsic.set_null_approximation_method(HSIC_GAMMA)
p_value_gamma=hsic.compute_p_value(statistic)
thresh_gamma=hsic.compute_threshold(alpha)
#print "p_value:", p_value_gamma
#print "threshold for 0.05 alpha:", thresh_gamma
#print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_gamma<alpha
# sample from null distribution (these may be plotted or whatsoever)
# mean should be close to zero, variance strongly depends on data/kernel
# sampling null, biased statistic
#print "sampling null distribution using sample_null"
hsic.set_null_approximation_method(PERMUTATION)
hsic.set_num_null_samples(100)
null_samples=hsic.sample_null()
#print "null mean:", np.mean(null_samples)
#print "null variance:", np.var(null_samples)
#hist(null_samples, 100); show()
return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
if __name__=='__main__':
print('HSIC')
statistics_hsic(*parameter_list[0])
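# For reference, a numpy sketch of the biased HSIC statistic the test above is
# built on: HSIC_b = (1/n^2) * trace(K H L H) with the centering matrix
# H = I - (1/n) 1 1^T. Shogun's exact scaling conventions are assumptions here.
import numpy as np
def hsic_biased_sketch(K, L):
    n = K.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n       # centering matrix
    return np.trace(K.dot(H).dot(L).dot(H)) / (n * n)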
#!/usr/bin/env python
from numpy import *
from numpy import random
parameter_list = [[10,3]]
def statistics_kmm (n,d):
from modshogun import RealFeatures
from modshogun import DataGenerator
from modshogun import GaussianKernel, MSG_DEBUG
try:
from modshogun import KernelMeanMatching
except ImportError:
print("KernelMeanMatching not available")
exit(0)
from modshogun import Math
# init seed for reproducibility
Math.init_random(1)
random.seed(1);
data = random.randn(d,n)
# create shogun feature representation
features=RealFeatures(data)
# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)
kernel.init(features,features)
kmm = KernelMeanMatching(kernel,array([0,1,2,3,7,8,9],dtype=int32),array([4,5,6],dtype=int32))
w = kmm.compute_weights()
#print w
return w
if __name__=='__main__':
print('KernelMeanMatching')
statistics_kmm(*parameter_list[0])
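# A tiny numpy check of the width convention stated above: GaussianKernel
# (cache, tau) is assumed to compute exp(-||x-y||^2 / tau), so tau=8
# corresponds to sigma=2 in the standard parametrization. A sketch of the
# manual formula on one pair of vectors:
import numpy
def gauss_width_sketch(x, y, tau=8.0):
    return numpy.exp(-numpy.sum((x - y)**2) / tau)
#print(gauss_width_sketch(numpy.zeros(3), numpy.ones(3)))  # exp(-3/8)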
# In this example, the linear time MMD statistic for kernel-based two-sample
# testing is illustrated. It is a streaming-based statistic for large amounts
# of data. The dataset used is a bunch of standard Gaussian vectors where the
# first dimension differs between the two distributions p and q. The test
# statistic is computed and available methods for computing a threshold of
# the null distribution are used. In addition, p-values for the test are
# computed. Note that these methods require more iterations/samples than used
# here. A Gaussian kernel is selected via the median heuristic. There are
# more clever kernel selection methods available.
# See tutorial and Class documentation for more details.
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
from numpy import *
parameter_list = [[1000,2,0.5]]
def statistics_linear_time_mmd (n,dim,difference):
from modshogun import RealFeatures
from modshogun import MeanShiftDataGenerator
from modshogun import GaussianKernel
from modshogun import LinearTimeMMD
from modshogun import PERMUTATION, MMD1_GAUSSIAN
from modshogun import EuclideanDistance
from modshogun import Statistics, Math
# init seed for reproducibility
Math.init_random(1)
# note that the linear time statistic is designed for much larger datasets,
# so increase n to get reasonable results
# streaming data generator for mean shift distributions
gen_p=MeanShiftDataGenerator(0, dim)
gen_q=MeanShiftDataGenerator(difference, dim)
# compute median data distance in order to use for Gaussian kernel width
# normally 0.5*median_distance (the factor two in the Gaussian kernel)
# However, shogun's kernel width differs from the usual parametrization,
# therefore use 0.5*2*median_distance^2
# Use a subset of the data for that, only 200 elements; the median is stable
# Stream examples and merge them in order to compute median on joint sample
features=gen_p.get_streamed_features(100)
features=features.create_merged_copy(gen_q.get_streamed_features(100))
# compute all pairwise distances
dist=EuclideanDistance(features, features)
distances=dist.get_distance_matrix()
# compute median and determine kernel width
median_distance=median(distances)
sigma=median_distance**2
#print "median distance for Gaussian kernel:", sigma
kernel=GaussianKernel(10,sigma)
# mmd instance using streaming features, blocksize of 10000
mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)
# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05
statistic=mmd.compute_statistic()
#print "test statistic:", statistic
# do the same thing using two different ways to approximate the null distribution
# sampling the null and the Gaussian approximation (only for really large samples)
alpha=0.05
#print "computing p-value using sampling null"
mmd.set_null_approximation_method(PERMUTATION)
mmd.set_num_null_samples(50) # normally, far more iterations are needed
p_value_boot=mmd.compute_p_value(statistic)
#print "p_value_boot:", p_value_boot
#print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha
#print "computing p-value using gaussian approximation"
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
p_value_gaussian=mmd.compute_p_value(statistic)
#print "p_value_gaussian:", p_value_gaussian
#print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha
# sample from null distribution (these may be plotted, for example)
# mean should be close to zero; variance strongly depends on data/kernel
mmd.set_null_approximation_method(PERMUTATION)
mmd.set_num_null_samples(10) # normally, far more iterations are needed
null_samples=mmd.sample_null()
#print "null mean:", mean(null_samples)
#print "null variance:", var(null_samples)
# compute type I and type II errors for Gaussian approximation
# number of trials should be larger to compute tight confidence bounds
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
num_trials=5;
alpha=0.05 # test level
typeIerrors=[0 for x in range(num_trials)]
typeIIerrors=[0 for x in range(num_trials)]
for i in range(num_trials):
# this effectively means that p=q - rejecting is a type I error
mmd.set_simulate_h0(True)
typeIerrors[i]=mmd.perform_test()>alpha
mmd.set_simulate_h0(False)
typeIIerrors[i]=mmd.perform_test()>alpha
#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)
return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
if __name__=='__main__':
print('LinearTimeMMD')
statistics_linear_time_mmd(*parameter_list[0])
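# The linear time MMD^2 estimator used above can be sketched in a few lines
# of numpy (an illustrative addition, not shogun API): average the kernel
# h-statistic h((x,x'),(y,y')) = k(x,x') + k(y,y') - k(x,y') - k(x',y) over
# disjoint consecutive pairs, which is what makes it streamable.
import numpy as np

def k_gauss(a, b, tau):
    return np.exp(-np.sum((a - b)**2, axis=0)/tau)

def linear_time_mmd2(X, Y, tau):
    # X, Y: dim x m data matrices; truncate to an even number of columns
    m = min(X.shape[1], Y.shape[1])//2*2
    x1, x2 = X[:, 0:m:2], X[:, 1:m:2]
    y1, y2 = Y[:, 0:m:2], Y[:, 1:m:2]
    h = k_gauss(x1, x2, tau) + k_gauss(y1, y2, tau) \
        - k_gauss(x1, y2, tau) - k_gauss(x2, y1, tau)
    return h.mean()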
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
from numpy import *
#from pylab import *
parameter_list = [[1000,10,5,3,pi/4, "opt"], [1000,10,5,3,pi/4, "l2"]]
def statistics_mmd_kernel_selection_combined(m,distance,stretch,num_blobs,angle,selection_method):
from modshogun import RealFeatures
from modshogun import GaussianBlobsDataGenerator
from modshogun import GaussianKernel, CombinedKernel
from modshogun import LinearTimeMMD
try:
from modshogun import MMDKernelSelectionCombMaxL2
except ImportError:
print("MMDKernelSelectionCombMaxL2 not available")
exit(0)
try:
from modshogun import MMDKernelSelectionCombOpt
except ImportError:
print("MMDKernelSelectionCombOpt not available")
exit(0)
from modshogun import PERMUTATION, MMD1_GAUSSIAN
from modshogun import EuclideanDistance
from modshogun import Statistics, Math
# init seed for reproducibility
Math.init_random(1)
# note that the linear time statistic is designed for much larger datasets
# results for this low number will be bad (unstable, type I error wrong)
# streaming data generator
gen_p=GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
gen_q=GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)
# stream some data and plot
num_plot=1000
features=gen_p.get_streamed_features(num_plot)
features=features.create_merged_copy(gen_q.get_streamed_features(num_plot))
data=features.get_feature_matrix()
#figure()
#subplot(2,2,1)
#grid(True)
#plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
#title('$X\sim p$')
#subplot(2,2,2)
#grid(True)
#plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
#title('$Y\sim q$')
# create combined kernel with Gaussian kernels inside (shogun's Gaussian
# kernel differs from the standard form, see documentation)
sigmas=[2**x for x in range(-3,10)]
widths=[x*x*2 for x in sigmas]
combined=CombinedKernel()
for i in range(len(sigmas)):
combined.append_kernel(GaussianKernel(10, widths[i]))
# mmd instance using streaming features, blocksize of 10000
block_size=10000
mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size)
# kernel selection instance (this can easily be replaced by the other
# methods for selecting combined kernels)
if selection_method=="opt":
selection=MMDKernelSelectionCombOpt(mmd)
elif selection_method=="l2":
selection=MMDKernelSelectionCombMaxL2(mmd)
# perform kernel selection (kernel is automatically set)
kernel=selection.select_kernel()
kernel=CombinedKernel.obtain_from_generic(kernel)
#print "selected kernel weights:", kernel.get_subkernel_weights()
#subplot(2,2,3)
#plot(kernel.get_subkernel_weights())
#title("Kernel weights")
# compute type I and II error (use many more trials). Type I error is only
# estimated to check the MMD1_GAUSSIAN method for estimating the null
# distribution. Note that testing has to happen on different data than
# kernel selection, but the linear time mmd does this implicitly
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
# number of trials should be larger to compute tight confidence bounds
num_trials=5;
alpha=0.05 # test level
typeIerrors=[0 for x in range(num_trials)]
typeIIerrors=[0 for x in range(num_trials)]
for i in range(num_trials):
# this effectively means that p=q - rejecting is a type I error
mmd.set_simulate_h0(True)
typeIerrors[i]=mmd.perform_test()>alpha
mmd.set_simulate_h0(False)
typeIIerrors[i]=mmd.perform_test()>alpha
#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)
return kernel,typeIerrors,typeIIerrors
if __name__=='__main__':
print('MMDKernelSelectionCombined')
statistics_mmd_kernel_selection_combined(*parameter_list[0])
#show()
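# MMDKernelSelectionCombOpt/CombMaxL2 choose convex-combination weights for
# the subkernels via constrained optimization. As a heavily simplified
# stand-in (an assumption for illustration, not shogun's exact objective),
# one can weight subkernels by the positive part of their individual MMD
# estimates, normalized to unit L2 norm:
import numpy as np

def combination_weights(mmd_estimates):
    eta = np.maximum(np.asarray(mmd_estimates, dtype=float), 0.0)
    norm = np.linalg.norm(eta)
    return eta/norm if norm > 0 else np.ones_like(eta)/len(eta)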
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
from numpy import *
#from pylab import *
parameter_list = [[1000,10,5,3,pi/4, "opt"], [1000,10,5,3,pi/4, "max"], [1000,10,5,3,pi/4, "median"]]
def statistics_mmd_kernel_selection_single(m,distance,stretch,num_blobs,angle,selection_method):
from modshogun import RealFeatures
from modshogun import GaussianBlobsDataGenerator
from modshogun import GaussianKernel, CombinedKernel
from modshogun import LinearTimeMMD
from modshogun import MMDKernelSelectionMedian
from modshogun import MMDKernelSelectionMax
from modshogun import MMDKernelSelectionOpt
from modshogun import PERMUTATION, MMD1_GAUSSIAN
from modshogun import EuclideanDistance
from modshogun import Statistics, Math
# init seed for reproducibility
Math.init_random(1)
# note that the linear time statistic is designed for much larger datasets
# results for this low number will be bad (unstable, type I error wrong)
# streaming data generator
gen_p=GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
gen_q=GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)
# stream some data and plot
num_plot=1000
features=gen_p.get_streamed_features(num_plot)
features=features.create_merged_copy(gen_q.get_streamed_features(num_plot))
data=features.get_feature_matrix()
#figure()
#subplot(2,2,1)
#grid(True)
#plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
#title('$X\sim p$')
#subplot(2,2,2)
#grid(True)
#plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
#title('$Y\sim q$')
# create combined kernel with Gaussian kernels inside (shogun's Gaussian
# kernel differs from the standard form, see documentation)
sigmas=[2**x for x in range(-3,10)]
widths=[x*x*2 for x in sigmas]
combined=CombinedKernel()
for i in range(len(sigmas)):
combined.append_kernel(GaussianKernel(10, widths[i]))
# mmd instance using streaming features, blocksize of 1000
block_size=1000
mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size)
# kernel selection instance (this can easily be replaced by the other
# methods for selecting single kernels)
if selection_method=="opt":
selection=MMDKernelSelectionOpt(mmd)
elif selection_method=="max":
selection=MMDKernelSelectionMax(mmd)
elif selection_method=="median":
selection=MMDKernelSelectionMedian(mmd)
# print measures (just for information)
# for Opt: ratios of MMD and standard deviation
# for Max: MMDs for each kernel
# not available for the median method
if selection_method!="median":
ratios=selection.compute_measures()
#print "Measures:", ratios
#subplot(2,2,3)
#plot(ratios)
#title('Measures')
# perform kernel selection
kernel=selection.select_kernel()
kernel=GaussianKernel.obtain_from_generic(kernel)
#print "selected kernel width:", kernel.get_width()
# compute type I and II error (use many more trials). Type I error is only
# estimated to check the MMD1_GAUSSIAN method for estimating the null
# distribution. Note that testing has to happen on different data than
# kernel selection, but the linear time mmd does this implicitly
mmd.set_kernel(kernel)
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
# number of trials should be larger to compute tight confidence bounds
num_trials=5;
alpha=0.05 # test level
typeIerrors=[0 for x in range(num_trials)]
typeIIerrors=[0 for x in range(num_trials)]
for i in range(num_trials):
# this effectively means that p=q - rejecting is a type I error
mmd.set_simulate_h0(True)
typeIerrors[i]=mmd.perform_test()>alpha
mmd.set_simulate_h0(False)
typeIIerrors[i]=mmd.perform_test()>alpha
#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)
return kernel,typeIerrors,typeIIerrors
if __name__=='__main__':
print('MMDKernelSelection')
statistics_mmd_kernel_selection_single(*parameter_list[0])
#show()
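# The "max" selection strategy above has a one-line numpy analogue
# (illustrative only): estimate the MMD for every candidate kernel width and
# keep the maximizer. Any estimator with the signature below can be plugged
# in, e.g. a linear-time MMD estimate.
import numpy as np

def select_width_max(X, Y, taus, mmd_estimator):
    # mmd_estimator(X, Y, tau) -> scalar MMD estimate
    stats = [mmd_estimator(X, Y, tau) for tau in taus]
    return taus[int(np.argmax(stats))]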
# In this example, the quadratic time MMD statistic for kernel-based two-sample
# testing is illustrated. It is a statistic for smaller amounts of data where
# one is interested in computing the best possible test. The used dataset is a
# bunch of standard Gaussian vectors where the first dimension differs in both
# distributions p and q. The test statistic is computed and available methods
# for computing a threshold of the null distribution are used. In addition,
# p-values for the test are computed. Note that these methods require more
# iterations/samples than used here. A Gaussian kernel with a fixed size is
# used. There are more clever kernel selection methods available.
# See tutorial and Class documentation for more details.
#!/usr/bin/env python
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (C) 2012-2013 Heiko Strathmann
#
import numpy as np
parameter_list = [[30,2,0.5]]
def statistics_quadratic_time_mmd (m,dim,difference):
from modshogun import RealFeatures
from modshogun import MeanShiftDataGenerator
from modshogun import GaussianKernel, CustomKernel
from modshogun import QuadraticTimeMMD
from modshogun import PERMUTATION, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, BIASED_DEPRECATED
from modshogun import Statistics, IntVector, RealVector, Math
# for reproducible results (the numpy one might not be reproducible across
# different OS/Python distributions)
Math.init_random(1)
np.random.seed(1)
# number of examples kept low in order to make things fast
# streaming data generator for mean shift distributions
gen_p=MeanShiftDataGenerator(0, dim);
#gen_p.parallel.set_num_threads(1)
gen_q=MeanShiftDataGenerator(difference, dim);
# stream some data from generator
feat_p=gen_p.get_streamed_features(m);
feat_q=gen_q.get_streamed_features(m);
# set kernel a priori. Usually one would do some kernel selection; see
# other examples for this.
width=10;
kernel=GaussianKernel(10, width);
# create quadratic time mmd instance. Note that this constructor
# copies p and q and does not reference them
mmd=QuadraticTimeMMD(kernel, feat_p, feat_q);
# perform test: compute p-value and test if null-hypothesis is rejected for
# a test level of 0.05
alpha=0.05;
# using permutation (slow and not the most reliable way; consider pre-
# computing the kernel when using it, see below).
# Also, in practice, use at least 250 iterations
mmd.set_null_approximation_method(PERMUTATION);
mmd.set_num_null_samples(3);
p_value_null=mmd.perform_test();
# reject if p-value is smaller than test level
#print "bootstrap: p!=q: ", p_value_null<alpha
# using spectrum method. Use at least 250 samples from null.
# This is consistent but sometimes breaks, always monitor type I error.
# See tutorial for the number of eigenvalues to use.
mmd.set_statistic_type(BIASED);
mmd.set_null_approximation_method(MMD2_SPECTRUM);
mmd.set_num_eigenvalues_spectrum(3);
mmd.set_num_samples_spectrum(250);
p_value_spectrum=mmd.perform_test();
# reject if p-value is smaller than test level
#print "spectrum: p!=q: ", p_value_spectrum<alpha
# using gamma method. This is a quick hack, which works most of the time
# but is NOT guaranteed to. See tutorial for details.
# Only works with BIASED_DEPRECATED statistic
mmd.set_statistic_type(BIASED_DEPRECATED);
mmd.set_null_approximation_method(MMD2_GAMMA);
p_value_gamma=mmd.perform_test();
# reject if p-value is smaller than test level
#print "gamma: p!=q: ", p_value_gamma<alpha
# compute type I and II error (use many more trials in practice).
# Type I error is not necessary if one uses permutation. We do it here
# anyway, but note that this is an efficient way of computing it.
# Also note that testing has to happen on
# different data than kernel selection, but the linear time mmd does this
# implicitly and we used a fixed kernel here.
mmd.set_statistic_type(BIASED);
mmd.set_null_approximation_method(PERMUTATION);
mmd.set_num_null_samples(5);
num_trials=5;
type_I_errors=np.zeros(num_trials)
type_II_errors=np.zeros(num_trials)
inds=np.array([x for x in range(2*m)], dtype=np.int32)
p_and_q=mmd.get_p_and_q();
# use a precomputed kernel to be faster
kernel.init(p_and_q, p_and_q);
precomputed=CustomKernel(kernel);
mmd.set_kernel(precomputed);
for i in range(num_trials):
# this effectively means that p=q - rejecting is a type I error
inds=np.random.permutation(inds) # numpy permutation
precomputed.add_row_subset(inds);
precomputed.add_col_subset(inds);
type_I_errors[i]=mmd.perform_test()>alpha;
precomputed.remove_row_subset();
precomputed.remove_col_subset();
# on normal data, this gives type II error
type_II_errors[i]=mmd.perform_test()>alpha;
return type_I_errors,type_II_errors,p_value_null,p_value_spectrum,p_value_gamma
if __name__=='__main__':
print('QuadraticTimeMMD')
statistics_quadratic_time_mmd(*parameter_list[0])
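# A minimal numpy sketch (illustrative addition) of the biased quadratic-time
# MMD^2 statistic and the permutation null approximation used above: permute
# the joint sample, recompute the statistic, and report the fraction of null
# statistics at least as large as the observed one.
import numpy as np

def mmd2_biased(K, m):
    # K: kernel matrix of the stacked sample [p q], first m columns from p
    Kxx, Kyy, Kxy = K[:m, :m], K[m:, m:], K[:m, m:]
    return Kxx.mean() + Kyy.mean() - 2*Kxy.mean()

def permutation_p_value(K, m, num_samples=250):
    stat = mmd2_biased(K, m)
    null = np.zeros(num_samples)
    for i in range(num_samples):
        perm = np.random.permutation(K.shape[0])
        null[i] = mmd2_biased(K[np.ix_(perm, perm)], m)
    return np.mean(null >= stat)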
#!/usr/bin/env python
import numpy as np
traindat = '../../../data/uci/housing/fm_housing.dat'
label_traindat = '../../../data/uci/housing/housing_label.dat'
# mark each input attribute as nominal (True) or continuous (False)
feat_types=np.array([False,False,False,True,False,False,False,False,False,False,False,False,False])
parameter_list = [[traindat,label_traindat,feat_types]]
def stochasticgbmachine_modular(train=traindat,train_labels=label_traindat,ft=feat_types):
try:
from modshogun import RealFeatures, RegressionLabels, CSVFile, CARTree, StochasticGBMachine, SquaredLoss
except ImportError:
print("Could not import Shogun modules")
return
# wrap features and labels into Shogun objects
feats=RealFeatures(CSVFile(train))
labels=RegressionLabels(CSVFile(train_labels))
# divide into training (90%) and test dataset (10%)
p=np.random.permutation(labels.get_num_labels())
num=int(labels.get_num_labels()*0.9)
cart=CARTree()
cart.set_feature_types(ft)
cart.set_max_depth(1)
loss=SquaredLoss()
s=StochasticGBMachine(cart,loss,500,0.01,0.6)
# train
feats.add_subset(np.int32(p[0:num]))
labels.add_subset(np.int32(p[0:num]))
s.set_labels(labels)
s.train(feats)
feats.remove_subset()
labels.remove_subset()
# apply
feats.add_subset(np.int32(p[num:len(p)]))
labels.add_subset(np.int32(p[num:len(p)]))
output=s.apply_regression(feats)
feats.remove_subset()
labels.remove_subset()
return s,output
if __name__=='__main__':
print('StochasticGBMachine')
stochasticgbmachine_modular(*parameter_list[0])
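# StochasticGBMachine above fits 500 depth-1 CART trees on squared loss with
# learning rate 0.01 and a 0.6 subsample fraction. A toy numpy sketch of that
# boosting loop on a single feature (an illustrative assumption, not shogun
# code): each round fits a stump to the current residuals on a random subset.
import numpy as np

def fit_stump(x, r):
    # best single-feature threshold split minimizing squared error
    best = (np.inf, x.min(), r.mean(), r.mean())
    for t in np.unique(x)[:-1]:
        left, right = r[x <= t], r[x > t]
        sse = ((left - left.mean())**2).sum() + ((right - right.mean())**2).sum()
        if sse < best[0]:
            best = (sse, t, left.mean(), right.mean())
    return best[1:]

def gbm_fit(x, y, num_iter=500, rate=0.01, subset_frac=0.6):
    f0 = y.mean()
    f = np.full(len(y), f0)
    stumps = []
    for _ in range(num_iter):
        sub = np.random.rand(len(y)) < subset_frac  # stochastic subsample
        if sub.sum() < 2:
            continue
        t, cl, cr = fit_stump(x[sub], y[sub] - f[sub])  # fit the residuals
        f += rate*np.where(x <= t, cl, cr)
        stumps.append((t, cl, cr))
    return f0, stumps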
#!/usr/bin/env python
from modshogun import StreamingVwFile
from modshogun import StreamingVwCacheFile
from modshogun import T_SVMLIGHT
from modshogun import StreamingVwFeatures
from modshogun import VowpalWabbit
parameter_list=[['../data/fm_train_sparsereal.dat']]
def streaming_vw_createcache_modular (fname):
# First create a binary cache from an ASCII data file,
# then train using the StreamingVwCacheFile as input
# Open the input file as a StreamingVwFile
input_file = StreamingVwFile(fname)
# Default file name will be vw_cache.dat.cache
input_file.set_write_to_cache(True)
# Tell VW that the file is in SVMLight format
# Supported types are T_DENSE, T_SVMLIGHT and T_VW
input_file.set_parser_type(T_SVMLIGHT)
## Create a StreamingVwFeatures object, `True' indicating the examples are labelled
#features = StreamingVwFeatures(input_file, True, 1024)
## Create a VW object from the features
#vw = VowpalWabbit(features)
#vw.set_no_training(True)
## Train (in this case does nothing but run over all examples)
#vw.train()
##Finally Train using the generated cache file
## Open the input cache file as a StreamingVwCacheFile
#input_file = StreamingVwCacheFile("vw_cache.dat.cache");
## The rest is exactly as for normal input
#features = StreamingVwFeatures(input_file, True, 1024);
#vw = VowpalWabbit(features)
#vw.train()
##return vw
if __name__ == "__main__":
streaming_vw_createcache_modular(*parameter_list[0])
#!/usr/bin/env python
from modshogun import StreamingVwFile
from modshogun import T_SVMLIGHT
from modshogun import StreamingVwFeatures
from modshogun import VowpalWabbit
parameter_list=[[None]]
def streaming_vw_modular (dummy):
"""Runs the VW algorithm on a toy dataset in SVMLight format."""
# Open the input file as a StreamingVwFile
input_file = StreamingVwFile("../data/fm_train_sparsereal.dat")
# Tell VW that the file is in SVMLight format
# Supported types are T_DENSE, T_SVMLIGHT and T_VW
input_file.set_parser_type(T_SVMLIGHT)
## Create a StreamingVwFeatures object, `True' indicating the examples are labelled
#features = StreamingVwFeatures(input_file, True, 1024)
## Create a VW object from the features
#vw = VowpalWabbit(features)
## Train
#vw.train()
##return vw
if __name__ == "__main__":
streaming_vw_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
import scipy
from scipy import io
data_dict = scipy.io.loadmat('../data/hmsvm_data_large_integer.mat', struct_as_record=False)
parameter_list=[[data_dict]]
def structure_discrete_hmsvm_bmrm (m_data_dict=data_dict):
from modshogun import RealMatrixFeatures, SequenceLabels, HMSVMModel, Sequence, TwoStateModel
from modshogun import StructuredAccuracy, SMT_TWO_STATE
try:
from modshogun import DualLibQPBMSOSVM
except ImportError:
print("DualLibQPBMSOSVM not available")
exit(0)
labels_array = m_data_dict['label'][0]
idxs = numpy.nonzero(labels_array == -1)
labels_array[idxs] = 0
labels = SequenceLabels(labels_array, 250, 500, 2)
features = RealMatrixFeatures(m_data_dict['signal'].astype(float), 250, 500)
num_obs = 4 # given by the data file used
model = HMSVMModel(features, labels, SMT_TWO_STATE, num_obs)
sosvm = DualLibQPBMSOSVM(model, labels, 5000.0)
sosvm.train()
#print sosvm.get_w()
predicted = sosvm.apply(features)
evaluator = StructuredAccuracy()
acc = evaluator.evaluate(predicted, labels)
#print('Accuracy = %.4f' % acc)
if __name__ == '__main__':
print("Discrete HMSVM BMRM")
structure_discrete_hmsvm_bmrm(*parameter_list[0])
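# The HMSVM model above scores label sequences with a two-state Markov model;
# prediction amounts to Viterbi decoding. A generic numpy Viterbi sketch
# over assumed per-position state scores and transition scores (an
# illustrative addition, not the shogun implementation):
import numpy as np

def viterbi(emission, transition):
    # emission: T x S matrix of per-position state scores
    # transition: S x S matrix of transition scores
    T, S = emission.shape
    score = np.zeros((T, S))
    back = np.zeros((T, S), dtype=int)
    score[0] = emission[0]
    for t in range(1, T):
        cand = score[t-1][:, None] + transition  # cand[i, j]: from state i to j
        back[t] = cand.argmax(axis=0)
        score[t] = cand.max(axis=0) + emission[t]
    path = [int(score[-1].argmax())]
    for t in range(T - 1, 0, -1):
        path.append(int(back[t][path[-1]]))
    return list(reversed(path))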
#!/usr/bin/env python
import numpy
import scipy
from scipy import io
data_dict = scipy.io.loadmat('../data/hmsvm_data_large_integer.mat', struct_as_record=False)
parameter_list=[[data_dict]]
def structure_discrete_hmsvm_mosek (m_data_dict=data_dict):
from modshogun import RealMatrixFeatures, SequenceLabels, HMSVMModel, Sequence, TwoStateModel
from modshogun import StructuredAccuracy, SMT_TWO_STATE
try:
from modshogun import PrimalMosekSOSVM
except ImportError:
print("Mosek not available")
return
labels_array = m_data_dict['label'][0]
idxs = numpy.nonzero(labels_array == -1)
labels_array[idxs] = 0
labels = SequenceLabels(labels_array, 250, 500, 2)
features = RealMatrixFeatures(m_data_dict['signal'].astype(float), 250, 500)
num_obs = 4 # given by the data file used
model = HMSVMModel(features, labels, SMT_TWO_STATE, num_obs)
sosvm = PrimalMosekSOSVM(model, labels)
sosvm.train()
#print(sosvm.get_w())
predicted = sosvm.apply()
evaluator = StructuredAccuracy()
acc = evaluator.evaluate(predicted, labels)
#print('Accuracy = %.4f' % acc)
if __name__ == '__main__':
print("Discrete HMSVM Mosek")
structure_discrete_hmsvm_mosek(*parameter_list[0])
# In this example we use the dynamic programming implementation with a
# gene-finding-specific model. The model and the training parameters
# are stored in a file and are used to create a gene prediction on
# an example sequence.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
parameter_list=[['../data/DynProg_example_py.pickle.gz']]
from modshogun import *
import numpy
from numpy import array,Inf,float64,matrix,frompyfunc,zeros
#from IPython.Shell import IPShellEmbed
#ipshell = IPShellEmbed()
import gzip
import scipy
from scipy.io import loadmat
import pickle
try:
from StringIO import StringIO
except ImportError:
from io import BytesIO as StringIO
def get_ver(ver_str):
scipy_ver=[int(i) for i in ver_str.split('.')]
v=0
for i in range(len(scipy_ver)):
v+=10**(len(scipy_ver)-i)*scipy_ver[i]
return v
if get_ver(scipy.__version__) >= get_ver('0.7.0'):
renametable = {
'scipy.io.mio5': 'scipy.io.matlab.mio5',
'scipy.sparse.sparse' : 'scipy.sparse',
}
else:
renametable = {}
def mapname(name):
if name in renametable:
return renametable[name]
return name
# scipy compatibility class
class mat_struct(object):
pass
def mapped_load_global(self):
module = mapname(self.readline()[:-1])
name = mapname(self.readline()[:-1])
if name=='mat_struct':
klass=mat_struct
else:
klass = self.find_class(module, name)
self.append(klass)
def loads(str):
file = StringIO(str)
unpickler = pickle.Unpickler(file)
unpickler.dispatch[pickle.GLOBAL] = mapped_load_global
return unpickler.load()
def structure_dynprog_modular (fname):
import sys
# pickled data is not compatible between Python 2 and 3
if sys.version_info[0]>2:
return
data_dict = loads(gzip.GzipFile(fname).read())
#data_dict = loadmat('../data/DynProg_example_py.dat.mat', appendmat=False, struct_as_record=False)
#print(data_dict)
#print(len(data_dict['penalty_array'][0][0][0][0].limits[0]))
num_plifs,num_limits = len(data_dict['penalty_array']),len(data_dict['penalty_array'][0].limits)
pm = PlifMatrix()
pm.create_plifs(num_plifs,num_limits)
ids = numpy.array(list(range(num_plifs)),dtype=numpy.int32)
min_values = numpy.array(list(range(num_plifs)),dtype=numpy.float64)
max_values = numpy.array(list(range(num_plifs)),dtype=numpy.float64)
all_use_cache = numpy.array(list(range(num_plifs)),dtype=numpy.bool)
all_use_svm = numpy.array(list(range(num_plifs)),dtype=numpy.int32)
all_limits = zeros((num_plifs,num_limits))
all_penalties = zeros((num_plifs,num_limits))
all_names = ['']*num_plifs
all_transforms = ['']*num_plifs
for plif_idx in range(num_plifs):
ids[plif_idx] = data_dict['penalty_array'][plif_idx].id-1
min_values[plif_idx] = data_dict['penalty_array'][plif_idx].min_value
max_values[plif_idx] = data_dict['penalty_array'][plif_idx].max_value
all_use_cache[plif_idx] = data_dict['penalty_array'][plif_idx].use_cache
all_use_svm[plif_idx] = data_dict['penalty_array'][plif_idx].use_svm
all_limits[plif_idx] = data_dict['penalty_array'][plif_idx].limits
all_penalties[plif_idx] = data_dict['penalty_array'][plif_idx].penalties
all_names[plif_idx] = str(data_dict['penalty_array'][plif_idx].name)
all_transforms[plif_idx] = str(data_dict['penalty_array'][plif_idx].transform)
if all_transforms[plif_idx] == '[]':
all_transforms[plif_idx] = 'linear'
pm.set_plif_ids(ids)
pm.set_plif_min_values(min_values)
pm.set_plif_max_values(max_values)
pm.set_plif_use_cache(all_use_cache)
pm.set_plif_use_svm(all_use_svm)
pm.set_plif_limits(all_limits)
pm.set_plif_penalties(all_penalties)
#pm.set_plif_names(all_names)
#pm.set_plif_transform_type(all_transforms)
transition_ptrs = data_dict['model'].transition_pointers
transition_ptrs = transition_ptrs[:,:,0:2]
transition_ptrs = transition_ptrs.astype(numpy.float64)
pm.compute_plif_matrix(transition_ptrs)
# init_dyn_prog
num_svms = 8
dyn = DynProg(num_svms)
orf_info = data_dict['model'].orf_info
orf_info = orf_info.astype(numpy.int32)
num_states = orf_info.shape[0]
dyn.set_num_states(num_states)
block = data_dict['block']
seq_len = len(block.seq)
seq = str(block.seq)
gene_string = array([elem for elem in seq])
# precompute_content_svms
pos = block.all_pos-1
pos = pos.astype(numpy.int32)
snd_pos = pos
dyn.set_pos(pos)
dyn.set_gene_string(gene_string)
dyn.create_word_string()
dyn.precompute_stop_codons()
dyn.init_content_svm_value_array(num_svms)
dict_weights = data_dict['content_weights']
dict_weights = dict_weights.reshape(8,1).astype(numpy.float64)
dict_weights = zeros((8,5440))
dyn.set_dict_weights(dict_weights.T)
dyn.precompute_content_values()
dyn.init_mod_words_array(data_dict['model'].mod_words.astype(numpy.int32))
pm.compute_signal_plifs(data_dict['state_signals'].astype(numpy.int32))
dyn.set_orf_info(orf_info)
#
p = data_dict['model'].p
q = data_dict['model'].q
dyn.set_p_vector(p)
dyn.set_q_vector(q)
a_trans = data_dict['a_trans']
a_trans = a_trans.astype(float64)
dyn.set_a_trans_matrix(a_trans)
dyn.check_svm_arrays()
features = data_dict['block'].features
dyn.set_observation_matrix(features)
dyn.set_content_type_array(data_dict['seg_path'].astype(numpy.float64))
dyn.best_path_set_segment_loss(data_dict['loss'].astype(numpy.float64))
use_orf = True
feat_dims = [25,201,2]
dyn.set_plif_matrices(pm);
#dyn.compute_nbest_paths(features.shape[2], use_orf, 1,True,False)
## fetch results
#states = dyn.get_states()
##print(states)
#scores = dyn.get_scores()
##print(scores)
#positions = dyn.get_positions()
##print(positions)
#return states, scores, positions
if __name__ == '__main__':
print("Structure")
structure_dynprog_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy as np
from modshogun import TableFactorType
# create the factor type with GT parameters
tid = 0
cards = np.array([2,2], np.int32)
w_gt = np.array([0.3,0.5,1.0,0.2,0.05,0.6,-0.2,0.75])
fac_type = TableFactorType(tid, cards, w_gt)
tid_u = 1
cards_u = np.array([2], np.int32)
w_gt_u = np.array([0.5,0.8,1.0,-0.3])
fac_type_u = TableFactorType(tid_u, cards_u, w_gt_u)
tid_b = 2
cards_b = np.array([2], np.int32)
w_gt_b = np.array([0.8, -0.8])
fac_type_b = TableFactorType(tid_b, cards_b, w_gt_b)
def gen_data(ftype, num_samples, show_data = False):
from modshogun import Math
from modshogun import FactorType, Factor, TableFactorType, FactorGraph
from modshogun import FactorGraphObservation, FactorGraphLabels, FactorGraphFeatures
from modshogun import MAPInference, TREE_MAX_PROD
Math.init_random(17)
samples = FactorGraphFeatures(num_samples)
labels = FactorGraphLabels(num_samples)
for i in range(num_samples):
vc = np.array([2,2,2], np.int32)
fg = FactorGraph(vc)
data1 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)])
vind1 = np.array([0,1], np.int32)
fac1 = Factor(ftype[0], vind1, data1)
fg.add_factor(fac1)
data2 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)])
vind2 = np.array([1,2], np.int32)
fac2 = Factor(ftype[0], vind2, data2)
fg.add_factor(fac2)
data3 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)])
vind3 = np.array([0], np.int32)
fac3 = Factor(ftype[1], vind3, data3)
fg.add_factor(fac3)
data4 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)])
vind4 = np.array([1], np.int32)
fac4 = Factor(ftype[1], vind4, data4)
fg.add_factor(fac4)
data5 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)])
vind5 = np.array([2], np.int32)
fac5 = Factor(ftype[1], vind5, data5)
fg.add_factor(fac5)
data6 = np.array([1.0])
vind6 = np.array([0], np.int32)
fac6 = Factor(ftype[2], vind6, data6)
fg.add_factor(fac6)
data7 = np.array([1.0])
vind7 = np.array([2], np.int32)
fac7 = Factor(ftype[2], vind7, data7)
fg.add_factor(fac7)
samples.add_sample(fg)
fg.connect_components()
fg.compute_energies()
infer_met = MAPInference(fg, TREE_MAX_PROD)
infer_met.inference()
fg_obs = infer_met.get_structured_outputs()
labels.add_label(fg_obs)
if show_data:
state = fg_obs.get_data()
print(state)
return samples, labels
w_all = [w_gt,w_gt_u,w_gt_b]
ftype_all = [fac_type,fac_type_u,fac_type_b]
num_samples = 10
samples, labels = gen_data(ftype_all, num_samples)
parameter_list = [[samples,labels,w_all,ftype_all]]
def structure_factor_graph_model(tr_samples = samples, tr_labels = labels, w = w_all, ftype = ftype_all):
from modshogun import SOSVMHelper, LabelsFactory
from modshogun import FactorGraphModel, MAPInference, TREE_MAX_PROD
from modshogun import StochasticSOSVM, FWSOSVM
try:
from modshogun import DualLibQPBMSOSVM
except ImportError:
print("DualLibQPBMSOSVM not available")
exit(0)
# create model
model = FactorGraphModel(tr_samples, tr_labels, TREE_MAX_PROD, False)
w_truth = [w[0].copy(), w[1].copy(), w[2].copy()]
w[0] = np.zeros(8)
w[1] = np.zeros(4)
w[2] = np.zeros(2)
ftype[0].set_w(w[0])
ftype[1].set_w(w[1])
ftype[2].set_w(w[2])
model.add_factor_type(ftype[0])
model.add_factor_type(ftype[1])
model.add_factor_type(ftype[2])
# --- training with BMRM ---
bmrm = DualLibQPBMSOSVM(model, tr_labels, 0.01)
#bmrm.set_verbose(True)
bmrm.train()
#print 'learned weights:'
#print bmrm.get_w()
#print 'ground truth weights:'
#print w_truth
# evaluation
lbs_bmrm = bmrm.apply()
acc_loss = 0.0
ave_loss = 0.0
for i in range(num_samples):
y_pred = lbs_bmrm.get_label(i)
y_truth = tr_labels.get_label(i)
acc_loss = acc_loss + model.delta_loss(y_truth, y_pred)
ave_loss = acc_loss / num_samples
#print('BMRM: Average training error is %.4f' % ave_loss)
# show primal objs and dual objs
#hbm = bmrm.get_helper()
#print hbm.get_primal_values()
#print hbm.get_eff_passes()
#print hbm.get_train_errors()
# --- training with SGD ---
sgd = StochasticSOSVM(model, tr_labels)
#sgd.set_verbose(True)
sgd.set_lambda(0.01)
sgd.train()
# evaluation
#print('SGD: Average training error is %.4f' % SOSVMHelper.average_loss(sgd.get_w(), model))
#hp = sgd.get_helper()
#print hp.get_primal_values()
#print hp.get_eff_passes()
#print hp.get_train_errors()
# --- training with FW ---
fw = FWSOSVM(model, tr_labels)
#fw.set_verbose(True)
fw.set_lambda(0.01)
fw.set_gap_threshold(0.01)
fw.train()
# evaluation
#print('FW: Average training error is %.4f' % SOSVMHelper.average_loss(fw.get_w(), model))
#hp = fw.get_helper()
#print hp.get_primal_values()
#print hp.get_dual_values()
#print hp.get_eff_passes()
#print hp.get_train_errors()
if __name__ == '__main__':
print("Factor Graph Model")
structure_factor_graph_model(*parameter_list[0])
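# MAPInference with TREE_MAX_PROD finds the minimum-energy assignment of the
# chain graphs above. For graphs this small, brute-force enumeration yields
# the same answer; a short sketch (illustrative, with hypothetical energy
# callables standing in for shogun's table factors):
import itertools

def map_brute_force(cards, factors):
    # cards: cardinality per variable
    # factors: list of (variable_indices, energy_fn) pairs
    best_state, best_energy = None, float('inf')
    for state in itertools.product(*[range(c) for c in cards]):
        energy = sum(fn(tuple(state[i] for i in inds)) for inds, fn in factors)
        if energy < best_energy:
            best_state, best_energy = state, energy
    return best_state, best_energy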
#!/usr/bin/env python
import numpy as np
import itertools
from modshogun import Factor, TableFactorType, FactorGraph
from modshogun import FactorGraphObservation, FactorGraphLabels, FactorGraphFeatures
from modshogun import FactorGraphModel, GRAPH_CUT
from modshogun import GraphCut
from modshogun import StochasticSOSVM
def generate_data(num_train_samples, len_label, len_feat):
""" Generate synthetic dataset
Generate random data following [1]:
Each example has exactly one label on.
Each label has 40 related binary features.
For an example, if label i is on, 4i randomly chosen features are set to 1
[1] Finley, Thomas, and Thorsten Joachims.
"Training structural SVMs when exact inference is intractable."
Proceedings of the 25th international conference on Machine learning. ACM, 2008.
Args:
num_train_samples: number of samples
len_label: label length (10)
len_feat: feature length (40)
Returns:
feats: generated feature matrix
labels: generated label matrix
"""
labels = np.zeros((num_train_samples, len_label), np.int32)
feats = np.zeros((num_train_samples, len_feat), np.int32)
for k in range(num_train_samples):
i = k % len_label
labels[k, i] = 1
inds_one = np.random.permutation(range(len_feat))
inds_one = inds_one[:4*(i+1)]
for j in inds_one:
feats[k, j] = 1
return (labels, feats)
def define_factor_types(num_vars, len_feat, edge_table):
""" Define factor types
Args:
num_vars: number of variables in factor graph
len_feat: length of the feature vector
edge_table: edge table defining pair-wise node indices
Returns:
v_factor_types: list of all unary and pair-wise factor types
"""
n_stats = 2 # for binary states
v_factor_types = {}
n_edges = edge_table.shape[0]
# unary factors
cards_u = np.array([n_stats], np.int32)
w_u = np.zeros(n_stats*len_feat)
for i in range(num_vars):
v_factor_types[i] = TableFactorType(i, cards_u, w_u)
# pair-wise factors
cards_pw = np.array([n_stats, n_stats], np.int32)
w_pw = np.zeros(n_stats*n_stats)
for j in range(n_edges):
v_factor_types[j + num_vars] = TableFactorType(j + num_vars, cards_pw, w_pw)
return v_factor_types
def build_factor_graph_model(labels, feats, factor_types, edge_table, infer_alg = GRAPH_CUT):
""" Build factor graph model
Args:
labels: matrix of labels [num_train_samples*len_label]
feats: matrix of features [num_train_samples*len_feat]
factor_types: vector of all factor types
edge_table: matrix of pairwise edges, each row is a pair of node indices
infer_alg: inference algorithm (GRAPH_CUT)
Returns:
labels_fg: matrix of labels in factor graph format
feats_fg: matrix of features in factor graph format
"""
labels = labels.astype(np.int32)
num_train_samples = labels.shape[0]
num_vars = labels.shape[1]
num_edges = edge_table.shape[0]
n_stats = 2
feats_fg = FactorGraphFeatures(num_train_samples)
labels_fg = FactorGraphLabels(num_train_samples)
for i in range(num_train_samples):
cardinalities = np.array([n_stats]*num_vars, np.int32)
fg = FactorGraph(cardinalities)
# add unary factors
for u in range(num_vars):
data_u = np.array(feats[i,:], np.float64)
inds_u = np.array([u], np.int32)
factor_u = Factor(factor_types[u], inds_u, data_u)
fg.add_factor(factor_u)
# add pairwise factors
for v in range(num_edges):
data_p = np.array([1.0])
inds_p = np.array(edge_table[v, :], np.int32)
factor_p = Factor(factor_types[v + num_vars], inds_p, data_p)
fg.add_factor(factor_p)
# add factor graph
feats_fg.add_sample(fg)
# add corresponding label
loss_weights = np.array([1.0/num_vars]*num_vars)
fg_obs = FactorGraphObservation(labels[i,:], loss_weights)
labels_fg.add_label(fg_obs)
return (labels_fg, feats_fg)
def evaluation(labels_pr, labels_gt, model):
""" Evaluation
Args:
labels_pr: predicted label
labels_gt: ground truth label
model: factor graph model
Returns:
ave_loss: average loss
"""
num_train_samples = labels_pr.get_num_labels()
acc_loss = 0.0
ave_loss = 0.0
for i in range(num_train_samples):
y_pred = labels_pr.get_label(i)
y_truth = labels_gt.get_label(i)
acc_loss = acc_loss + model.delta_loss(y_truth, y_pred)
ave_loss = acc_loss / num_train_samples
return ave_loss
def graphcuts_sosvm(num_train_samples = 10, len_label = 5, len_feat = 20, num_test_samples = 5):
""" Graph cuts as approximate inference in structured output SVM framework.
Args:
num_train_samples: number of training samples
len_label: number of classes, i.e., size of label space
len_feat: the dimension of the feature vector
num_test_samples: number of testing samples
"""
import time
# generate synthetic dataset
(labels_train, feats_train) = generate_data(num_train_samples, len_label, len_feat)
# compute fully-connected edge table
full = np.vstack([x for x in itertools.combinations(range(len_label), 2)])
# define factor types
factor_types = define_factor_types(len_label, len_feat, full)
# create features and labels for factor graph model
(labels_fg, feats_fg) = build_factor_graph_model(labels_train, feats_train, factor_types, full, GRAPH_CUT)
# create model and register factor types
model = FactorGraphModel(feats_fg, labels_fg, GRAPH_CUT)
for i in range(len(factor_types)):
model.add_factor_type(factor_types[i])
# Training
# the 3rd parameter is do_weighted_averaging; turning this on
# may achieve a faster convergence rate.
# the 4th parameter controls the output of verbose training information
sgd = StochasticSOSVM(model, labels_fg, True, True)
sgd.set_num_iter(150)
sgd.set_lambda(0.0001)
# train
t0 = time.time()
sgd.train()
t1 = time.time()
w_sgd = sgd.get_w()
#print "SGD took", t1 - t0, "seconds."
# training error
labels_pr = sgd.apply()
ave_loss = evaluation(labels_pr, labels_fg, model)
#print('SGD: Average training error is %.4f' % ave_loss)
# testing error
# generate synthetic testing dataset
(labels_test, feats_test) = generate_data(num_test_samples, len_label, len_feat)
# create features and labels for factor graph model
(labels_fg_test, feats_fg_test) = build_factor_graph_model(labels_test, feats_test, factor_types, full, GRAPH_CUT)
# set features and labels to sgd
sgd.set_features(feats_fg_test)
sgd.set_labels(labels_fg_test)
# test
labels_pr = sgd.apply()
ave_loss = evaluation(labels_pr, labels_fg_test, model)
#print('SGD: Average testing error is %.4f' % ave_loss)
def graphcuts_general():
""" Graph cuts for general s-t graph optimization.
"""
num_nodes = 5
num_edges = 6
g = GraphCut(num_nodes, num_edges)
# add terminal-connected edges
# i.e., SOURCE->node_i and node_i->SINK
g.add_tweights(0, 4, 0)
g.add_tweights(1, 2, 0)
g.add_tweights(2, 8, 0)
g.add_tweights(2, 0, 4)
g.add_tweights(3, 0, 7)
g.add_tweights(4, 0, 5)
# add node to node edges
g.add_edge(0, 2, 5, 0)
g.add_edge(0, 3, 2, 0)
g.add_edge(1, 2, 6, 0)
g.add_edge(1, 4, 9, 0)
g.add_edge(2, 3, 1, 0)
g.add_edge(2, 4, 3, 0)
# initialize max-flow algorithm
g.init_maxflow()
# compute max flow
flow = g.compute_maxflow()
#print("Flow = %f" % flow)
# print assignment
#for i in xrange(num_nodes):
# print("\nNode %d = %d" % (i, g.get_assignment(i)))
test_general = True
test_sosvm = True
parameter_list = [[test_general, test_sosvm]]
def structure_graphcuts(test_general=True, test_sosvm=True):
""" Test graph cuts.
Args:
test_general: test graph cuts for general s-t graph optimization
test_sosvm: test graph cuts for structured output svm
"""
if test_general:
graphcuts_general()
if test_sosvm:
graphcuts_sosvm()
if __name__ == '__main__':
print("Graph cuts")
structure_graphcuts(*parameter_list[0])
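# GraphCut above solves an s-t max-flow/min-cut problem. A compact
# Edmonds-Karp (BFS augmenting paths) sketch on the same small graph, with
# explicit source/sink nodes appended as indices 5 and 6 (an illustrative
# addition; capacities mirror the add_tweights/add_edge calls above):
import numpy as np
from collections import deque

def max_flow(cap, s, t):
    cap = cap.astype(float).copy()
    n, flow = cap.shape[0], 0.0
    while True:
        parent = [-1]*n
        parent[s] = s
        q = deque([s])
        while q and parent[t] < 0:
            u = q.popleft()
            for v in range(n):
                if parent[v] < 0 and cap[u, v] > 0:
                    parent[v] = u
                    q.append(v)
        if parent[t] < 0:
            return flow
        v, bottleneck = t, np.inf
        while v != s:  # find the bottleneck along the augmenting path
            bottleneck = min(bottleneck, cap[parent[v], v])
            v = parent[v]
        v = t
        while v != s:  # push flow, updating residual capacities
            cap[parent[v], v] -= bottleneck
            cap[v, parent[v]] += bottleneck
            v = parent[v]
        flow += bottleneck

cap = np.zeros((7, 7))
s, t = 5, 6
cap[s, 0], cap[s, 1], cap[s, 2] = 4, 2, 8   # source-side terminal weights
cap[2, t], cap[3, t], cap[4, t] = 4, 7, 5   # sink-side terminal weights
cap[0, 2], cap[0, 3], cap[1, 2] = 5, 2, 6   # node-to-node edges
cap[1, 4], cap[2, 3], cap[2, 4] = 9, 1, 3
#print("Flow = %f" % max_flow(cap, s, t))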
#!/usr/bin/env python
"""
This example shows how to use HierarchicalMultilabelModel for hierarchical
multi-label classification. The data used:
[1] Image CLEF 2007 competition for annotation of X-Ray images.
http://kt.ijs.si/DragiKocev/PhD/resources/doku.php?id=hmc_classification#imageclef07d
"""
from modshogun import MultilabelSOLabels, HierarchicalMultilabelModel
from modshogun import RealFeatures
from modshogun import StochasticSOSVM
from modshogun import StructuredAccuracy, LabelsFactory
import numpy as np
import time
train_file_name = '../../../data/multilabel/image_clef_train.arff'
test_file_name = '../../../data/multilabel/image_clef_test.arff'
parameter_list = [[train_file_name, test_file_name]]
def get_taxonomy(labels):
"""
Converting the labels to shogun compatible format
(i.e. 0, 1, ... num_classes - 1) and getting taxonomy of the labels
"""
labels = labels.split(',')
num_labels = len(labels)
# taking the root label into consideration
num_labels += 1
shogun_labels = dict()
taxonomy = np.zeros(num_labels, dtype=np.int32)
# considering the root_label node index to be 0
taxonomy[0] = -1
for i, label in enumerate(labels):
shogun_labels[label] = i + 1
try:
parent_label = label[:-2]
parent_idx = labels.index(parent_label) + 1
taxonomy[i + 1] = parent_idx
except ValueError:
taxonomy[i + 1] = 0
return shogun_labels, taxonomy
def get_data_sample(data_sample, shogun_labels):
"""
Extracting features and labels from a single row of data
"""
data = data_sample.split(',')
features = np.array(data[:-1], dtype=np.float64)
labs = data[-1].split('@')
# adding the root label
labels = np.zeros(len(labs) + 1, dtype=np.int32)
labels[0] = 0
for i, label in enumerate(labs):
labels[i + 1] = shogun_labels[label]
labels.sort()
return features, labels
def get_data(data, shogun_labels):
"""
Creating features and labels from the data samples
"""
num_samples = len(data)
# considering the root label
num_classes = len(shogun_labels) + 1
labels = MultilabelSOLabels(num_samples, num_classes)
for i, data_sample in enumerate(data):
feats, labs = get_data_sample(data_sample, shogun_labels)
try:
features = np.c_[features, feats]
except NameError:
features = feats
labels.set_sparse_label(i, labs)
return RealFeatures(features), labels
def get_features_labels(input_file):
"""
Creating features and labels from the input file (train/test file)
"""
train_file_lines = list(map(lambda x: x.strip(), input_file.readlines()))
all_labels = list(filter(lambda x: 'hierarchical' in x.strip(),
train_file_lines))[0].split()[-1]
shogun_labels, taxonomy = get_taxonomy(all_labels)
data_index = train_file_lines.index('@DATA')
features, labels = get_data(train_file_lines[data_index + 1:],
shogun_labels)
return features, labels, taxonomy
def structure_hierarchical_multilabel_classification(train_file_name,
test_file_name):
train_file = open(train_file_name)
test_file = open(test_file_name)
train_features, train_labels, train_taxonomy = get_features_labels(
train_file)
model = HierarchicalMultilabelModel(train_features, train_labels,
train_taxonomy)
sgd = StochasticSOSVM(model, train_labels)
t1 = time.time()
sgd.train()
print('>>> Took %f time for training' % (time.time() - t1))
test_features, test_labels, test_taxonomy = get_features_labels(test_file)
assert (test_taxonomy == train_taxonomy).all()
evaluator = StructuredAccuracy()
outlabel = LabelsFactory.to_structured(sgd.apply(test_features))
print('>>> Accuracy of classification = %f' % evaluator.evaluate(
outlabel, test_labels))
if __name__ == '__main__':
print('Hierarchical Multilabel Classification')
structure_hierarchical_multilabel_classification(*parameter_list[0])
#!/usr/bin/env python
import numpy as np
def gen_data(num_classes,num_samples,dim):
np.random.seed(0)
covs = np.array([[[0., -1. ], [2.5, .7]],
[[3., -1.5], [1.2, .3]],
[[ 2, 0 ], [ .0, 1.5 ]]])
X = np.r_[np.dot(np.random.randn(num_samples, dim), covs[0]) + np.array([0, 10]),
np.dot(np.random.randn(num_samples, dim), covs[1]) + np.array([-10, -10]),
np.dot(np.random.randn(num_samples, dim), covs[2]) + np.array([10, -10])];
Y = np.hstack((np.zeros(num_samples), np.ones(num_samples), 2*np.ones(num_samples)))
return X, Y
# Number of classes
M = 3
# Number of samples of each class
N = 50
# Dimension of the data
dim = 2
traindat, label_traindat = gen_data(M,N,dim)
parameter_list = [[traindat,label_traindat]]
def structure_multiclass_bmrm(fm_train_real=traindat,label_train_multiclass=label_traindat):
from modshogun import MulticlassSOLabels, LabelsFactory
from modshogun import RealFeatures
from modshogun import SOSVMHelper
try:
from modshogun import BMRM, PPBMRM, P3BMRM, DualLibQPBMSOSVM
except ImportError:
print("At least one of BMRM, PPBMRM, P3BMRM, DualLibQPBMSOSVM not available")
exit(0)
from modshogun import MulticlassModel, RealNumber
labels = MulticlassSOLabels(label_train_multiclass)
features = RealFeatures(fm_train_real.T)
model = MulticlassModel(features, labels)
sosvm = DualLibQPBMSOSVM(model, labels, 1.0)
# BMRM
sosvm.set_solver(BMRM)
sosvm.set_verbose(True)
sosvm.train()
bmrm_out = LabelsFactory.to_multiclass_structured(sosvm.apply())
count = 0
for i in range(bmrm_out.get_num_labels()):
yi_pred = RealNumber.obtain_from_generic(bmrm_out.get_label(i))
if yi_pred.value == label_train_multiclass[i]:
count = count + 1
#print("BMRM: Correct classification rate: %0.2f" % ( 100.0*count/bmrm_out.get_num_labels() ))
#hp = sosvm.get_helper()
#print hp.get_primal_values()
#print hp.get_train_errors()
# PPBMRM
w = np.zeros(model.get_dim())
sosvm.set_w(w)
sosvm.set_solver(PPBMRM)
sosvm.set_verbose(True)
sosvm.train()
ppbmrm_out = LabelsFactory.to_multiclass_structured(sosvm.apply())
count = 0
for i in range(ppbmrm_out.get_num_labels()):
yi_pred = RealNumber.obtain_from_generic(ppbmrm_out.get_label(i))
if yi_pred.value == label_train_multiclass[i]:
count = count + 1
#print("PPBMRM: Correct classification rate: %0.2f" % ( 100.0*count/ppbmrm_out.get_num_labels() ))
# P3BMRM
w = np.zeros(model.get_dim())
sosvm.set_w(w)
sosvm.set_solver(P3BMRM)
sosvm.set_verbose(True)
sosvm.train()
p3bmrm_out = LabelsFactory.to_multiclass_structured(sosvm.apply())
count = 0
for i in range(p3bmrm_out.get_num_labels()):
yi_pred = RealNumber.obtain_from_generic(p3bmrm_out.get_label(i))
if yi_pred.value == label_train_multiclass[i]:
count = count + 1
#print("P3BMRM: Correct classification rate: %0.2f" % ( 100.0*count/p3bmrm_out.get_num_labels() ))
return bmrm_out, ppbmrm_out, p3bmrm_out
if __name__=='__main__':
print('SO multiclass model with bundle methods')
a,b,c=structure_multiclass_bmrm(*parameter_list[0])
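# The bundle-method solvers above repeatedly call a max oracle: for each
# example, find the most violating class under the current w. A numpy sketch
# for a multiclass model whose joint feature puts x into the block of class y
# (an illustrative assumption of the same construction):
import numpy as np

def multiclass_max_oracle(W, x, y_true):
    # W: num_classes x dim weight matrix (w reshaped), x: feature vector
    scores = W.dot(x)
    loss = np.ones(len(scores))
    loss[y_true] = 0.0                       # 0/1 label loss
    y_star = int(np.argmax(scores + loss))   # loss-augmented argmax
    violation = scores[y_star] + loss[y_star] - scores[y_true]
    return y_star, violation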
#!/usr/bin/env python
parameter_list=[[50, 125, 10, 2]]
def structure_plif_hmsvm_bmrm (num_examples, example_length, num_features, num_noise_features):
from modshogun import RealMatrixFeatures, TwoStateModel, StructuredAccuracy
try:
from modshogun import DualLibQPBMSOSVM
except ImportError:
print("DualLibQPBMSOSVM not available")
exit(0)
model = TwoStateModel.simulate_data(num_examples, example_length, num_features, num_noise_features)
sosvm = DualLibQPBMSOSVM(model, model.get_labels(), 5000.0)
sosvm.set_store_train_info(False)
sosvm.train()
#print sosvm.get_w()
predicted = sosvm.apply(model.get_features())
evaluator = StructuredAccuracy()
acc = evaluator.evaluate(predicted, model.get_labels())
#print('Accuracy = %.4f' % acc)
if __name__ == '__main__':
print("PLiF HMSVM BMRM")
structure_plif_hmsvm_bmrm(*parameter_list[0])
#!/usr/bin/env python
parameter_list=[[100, 250, 10, 2]]
def structure_plif_hmsvm_mosek (num_examples, example_length, num_features, num_noise_features):
from modshogun import RealMatrixFeatures, TwoStateModel, StructuredAccuracy
try:
from modshogun import PrimalMosekSOSVM
except ImportError:
print("Mosek not available")
return
model = TwoStateModel.simulate_data(num_examples, example_length, num_features, num_noise_features)
sosvm = PrimalMosekSOSVM(model, model.get_labels())
sosvm.train()
#print(sosvm.get_w())
predicted = sosvm.apply(model.get_features())
evaluator = StructuredAccuracy()
acc = evaluator.evaluate(predicted, model.get_labels())
#print('Accuracy = %.4f' % acc)
if __name__ == '__main__':
print("PLiF HMSVM Mosek")
structure_plif_hmsvm_mosek(*parameter_list[0])
#!/usr/bin/env python
parameter_list=[[10,7,0,False]]
def tests_check_commwordkernel_memleak_modular (num, order, gap, reverse):
import gc
from modshogun import Alphabet,StringCharFeatures,StringWordFeatures,DNA
from modshogun import SortWordString, MSG_DEBUG
from modshogun import CommWordStringKernel, IdentityKernelNormalizer
from numpy import mat
POS=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT']
NEG=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT',
num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT',
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT']
for i in range(10):
alpha=Alphabet(DNA)
traindat=StringCharFeatures(alpha)
traindat.set_features(POS+NEG)
trainudat=StringWordFeatures(traindat.get_alphabet());
trainudat.obtain_from_char(traindat, order-1, order, gap, reverse)
#trainudat.io.set_loglevel(MSG_DEBUG)
pre = SortWordString()
#pre.io.set_loglevel(MSG_DEBUG)
pre.init(trainudat)
trainudat.add_preprocessor(pre)
trainudat.apply_preprocessor()
spec = CommWordStringKernel(10, False)
spec.set_normalizer(IdentityKernelNormalizer())
spec.init(trainudat, trainudat)
K=spec.get_kernel_matrix()
del POS
del NEG
del order
del gap
del reverse
return K
if __name__=='__main__':
print('Leak Check Comm Word Kernel')
tests_check_commwordkernel_memleak_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array,hstack,sin,cos
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def transfer_multitask_clustered_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG
try:
from modshogun import MultitaskClusteredLogisticRegression
except ImportError:
print("MultitaskClusteredLogisticRegression not available")
exit()
features = RealFeatures(hstack((fm_train,sin(fm_train),cos(fm_train))))
labels = BinaryLabels(hstack((label_train,label_train,label_train)))
n_vectors = features.get_num_vectors()
task_one = Task(0,n_vectors//3)
task_two = Task(n_vectors//3,2*n_vectors//3)
task_three = Task(2*n_vectors//3,n_vectors)
task_group = TaskGroup()
task_group.append_task(task_one)
task_group.append_task(task_two)
task_group.append_task(task_three)
mtlr = MultitaskClusteredLogisticRegression(1.0,100.0,features,labels,task_group,2)
#mtlr.io.set_loglevel(MSG_DEBUG)
mtlr.set_tolerance(1e-3) # use 1e-3 tolerance
mtlr.set_max_iter(100)
mtlr.train()
mtlr.set_current_task(0)
#print mtlr.get_w()
out = mtlr.apply_regression().get_labels()
return out
if __name__=='__main__':
print('TransferMultitaskClusteredLogisticRegression')
transfer_multitask_clustered_logistic_regression(*parameter_list[0])
#!/usr/bin/env python
from numpy import array,hstack
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def transfer_multitask_l12_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup
try:
from modshogun import MultitaskL12LogisticRegression
except ImportError:
print("MultitaskL12LogisticRegression not available")
exit(0)
features = RealFeatures(hstack((fm_train,fm_train)))
labels = BinaryLabels(hstack((label_train,label_train)))
n_vectors = features.get_num_vectors()
task_one = Task(0,n_vectors//2)
task_two = Task(n_vectors//2,n_vectors)
task_group = TaskGroup()
task_group.append_task(task_one)
task_group.append_task(task_two)
mtlr = MultitaskL12LogisticRegression(0.1,0.1,features,labels,task_group)
mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
mtlr.set_max_iter(10)
mtlr.train()
mtlr.set_current_task(0)
out = mtlr.apply_regression().get_labels()
return out
if __name__=='__main__':
print('TransferMultitaskL12LogisticRegression')
transfer_multitask_l12_logistic_regression(*parameter_list[0])
#!/usr/bin/env python
from numpy import array
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def transfer_multitask_leastsquares_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup
try:
from modshogun import MultitaskLeastSquaresRegression
except ImportError:
print("MultitaskLeastSquaresRegression not available")
exit(0)
features = RealFeatures(fm_train)
labels = RegressionLabels(label_train)
n_vectors = features.get_num_vectors()
task_one = Task(0,n_vectors//2)
task_two = Task(n_vectors//2,n_vectors)
task_group = TaskGroup()
task_group.append_task(task_one)
task_group.append_task(task_two)
mtlsr = MultitaskLeastSquaresRegression(0.1,features,labels,task_group)
mtlsr.set_regularization(1) # use regularization ratio
mtlsr.set_tolerance(1e-2) # use 1e-2 tolerance
mtlsr.train()
mtlsr.set_current_task(0)
out = mtlsr.apply_regression().get_labels()
return out
if __name__=='__main__':
print('TransferMultitaskLeastSquaresRegression')
transfer_multitask_leastsquares_regression(*parameter_list[0])
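# The example above never uses fm_test. A minimal sketch (illustrative only)
# of applying a trained multitask machine to held-out data of the same
# dimensionality, passing the features to apply_regression directly, in the
# same way the variational example below passes them to apply_binary:
from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup
from modshogun import MultitaskLeastSquaresRegression
features = RealFeatures(traindat)
labels = RegressionLabels(label_traindat)
n_vectors = features.get_num_vectors()
task_group = TaskGroup()
task_group.append_task(Task(0, n_vectors//2))
task_group.append_task(Task(n_vectors//2, n_vectors))
mtlsr = MultitaskLeastSquaresRegression(0.1, features, labels, task_group)
mtlsr.set_regularization(1)
mtlsr.train()
mtlsr.set_current_task(0)
print(mtlsr.apply_regression(RealFeatures(testdat)).get_labels()[:5])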
#!/usr/bin/env python
from numpy import array,hstack
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def transfer_multitask_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup
    try:
        from modshogun import MultitaskLogisticRegression
    except ImportError:
        print("MultitaskLogisticRegression not available")
        exit(0)

    # duplicate the data to obtain two identical tasks
    features = RealFeatures(hstack((fm_train,fm_train)))
    labels = BinaryLabels(hstack((label_train,label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0,n_vectors//2)
    task_two = Task(n_vectors//2,n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskLogisticRegression(0.1,features,labels,task_group)
    mtlr.set_regularization(1) # use regularization ratio
    mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply().get_labels()

    return out

if __name__=='__main__':
    print('TransferMultitaskLogisticRegression')
    transfer_multitask_logistic_regression(*parameter_list[0])
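# Unlike its neighbours, this example calls apply() rather than
# apply_regression(), so the returned labels are already thresholded to +/-1.
# A quick training-error check (illustrative only, not part of the original
# example):
from numpy import hstack, mean
out = transfer_multitask_logistic_regression(*parameter_list[0])
truth = hstack((label_traindat, label_traindat))
print('training error: %.4f' % mean(out != truth))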
#!/usr/bin/env python
from numpy import array,hstack
from numpy.random import seed, rand
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')
parameter_list = [[traindat,testdat,label_traindat]]
def transfer_multitask_trace_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup
    try:
        from modshogun import MultitaskTraceLogisticRegression
    except ImportError:
        print("MultitaskTraceLogisticRegression not available")
        exit(0)

    # duplicate the data to obtain two identical tasks
    features = RealFeatures(hstack((fm_train,fm_train)))
    labels = BinaryLabels(hstack((label_train,label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0,n_vectors//2)
    task_two = Task(n_vectors//2,n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskTraceLogisticRegression(0.1,features,labels,task_group)
    mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
    mtlr.set_max_iter(10)
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply_regression().get_labels()

    return out

if __name__=='__main__':
    print('TransferMultitaskTraceLogisticRegression')
    transfer_multitask_trace_logistic_regression(*parameter_list[0])
#!/usr/bin/env python
#
# Copyright (c) The Shogun Machine Learning Toolbox
# Written (w) 2014 Wu Lin
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are those
# of the authors and should not be interpreted as representing official policies,
# either expressed or implied, of the Shogun Development Team.
#
#
path='../data'
traindat = '%s/fm_train_real.dat'%path
testdat = '%s/fm_test_real.dat'%path
label_binary_traindat = '%s/label_train_twoclass.dat'%path
try:
    from modshogun import GaussianProcessClassification
except ImportError:
    print("GaussianProcessClassification is not available")
    exit(0)

from modshogun import *

parameter_list=[
    [KLCholeskyInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0],
    [KLCovarianceInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0],
    [KLDiagonalInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0],
    [KLDualInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0],
    [SingleLaplaceInferenceMethod,traindat,testdat,label_binary_traindat,0,0],
]
def variational_classifier_modular(kl_inference,train_fname=traindat,test_fname=testdat,
        label_fname=label_binary_traindat,kernel_log_sigma=0,kernel_log_scale=0,noise_factor=1e-5,
        min_coeff_kernel=1e-2,max_attempt=0):
    from math import exp
    features_train=RealFeatures(CSVFile(train_fname))
    labels_train=BinaryLabels(CSVFile(label_fname))

    likelihood=LogitDVGLikelihood()
    error_eval=ErrorRateMeasure()
    mean_func=ConstMean()
    kernel_sigma=2*exp(2*kernel_log_sigma)
    kernel_func=GaussianKernel(10, kernel_sigma)

    inf=kl_inference(kernel_func, features_train, mean_func, labels_train, likelihood)
    try:
        # these settings exist only on the KL inference methods; the
        # SingleLaplaceInferenceMethod entry in parameter_list skips them
        inf.set_noise_factor(noise_factor)
        inf.set_min_coeff_kernel(min_coeff_kernel)
        inf.set_max_attempt(max_attempt)
    except:
        pass
    inf.set_scale(exp(kernel_log_scale))

    gp=GaussianProcessClassification(inf)
    gp.train()
    pred_labels_train=gp.apply_binary(features_train)
    error_train=error_eval.evaluate(pred_labels_train, labels_train)
    #print("\nInference name:%s"%inf.get_name())
    #print("marginal likelihood:%.10f"%inf.get_negative_log_marginal_likelihood())
    #print("Training error %.4f"%error_train)

    return pred_labels_train, gp, pred_labels_train.get_labels()

if __name__=="__main__":
    print("variational_classifier")
    for parameter in parameter_list:
        variational_classifier_modular(*parameter)
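# The parameter list above exercises four KL-based variational inference
# methods plus the (non-variational) SingleLaplaceInferenceMethod. A sketch of
# running a single method and printing the quantity the commented-out lines
# refer to; this assumes get_inference_method() is exposed on the GP machine,
# which may differ across Shogun versions:
pred, gp, raw = variational_classifier_modular(*parameter_list[0])
inf = gp.get_inference_method()
print('%s negative log marginal likelihood: %.10f'
      % (inf.get_name(), inf.get_negative_log_marginal_likelihood()))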