SHOGUN
4.2.0
This page lists ready-to-run Shogun examples for the Python Modular interface.
To run an example, issue:
python name_of_example.py
# In this example the Averaged Perceptron is used to classify toy data.
#!/usr/bin/env python
from numpy import *

parameter_list = [[100, 2, 5, 1., 1000, 1, 1], [100, 2, 5, 1., 1000, 1, 2]]

def classifier_averaged_perceptron_modular (n=100, dim=2, distance=5, learn_rate=1., max_iter=1000, num_threads=1, seed=1):
    from modshogun import RealFeatures, BinaryLabels
    from modshogun import AveragedPerceptron

    random.seed(seed)

    # produce some (probably) linearly separable training data by hand
    # Two Gaussians at a far enough distance
    X=array(random.randn(dim,n))+distance
    Y=array(random.randn(dim,n))-distance
    X_test=array(random.randn(dim,n))+distance
    Y_test=array(random.randn(dim,n))-distance
    label_train_twoclass=hstack((ones(n), -ones(n)))
    #plot(X[0,:], X[1,:], 'x', Y[0,:], Y[1,:], 'o')

    fm_train_real=hstack((X,Y))
    fm_test_real=hstack((X_test,Y_test))

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    labels=BinaryLabels(label_train_twoclass)

    perceptron=AveragedPerceptron(feats_train, labels)
    perceptron.set_learn_rate(learn_rate)
    perceptron.set_max_iter(max_iter)
    # only guaranteed to converge for separable data
    perceptron.train()

    perceptron.set_features(feats_test)
    out_labels = perceptron.apply().get_labels()

    return perceptron, out_labels

if __name__=='__main__':
    print('AveragedPerceptron')
    classifier_averaged_perceptron_modular(*parameter_list[0])
# This example shows how to use a custom defined kernel function for training a
# two class Support Vector Machine (SVM) classifier on randomly generated
# examples. The SVM regularization constant is set to C=1.
#!/usr/bin/env python

parameter_list = [[1,7],[2,8]]

def classifier_custom_kernel_modular (C=1,dim=7):
    from modshogun import RealFeatures, BinaryLabels, CustomKernel, LibSVM
    from numpy import diag,ones,sign
    from numpy.random import rand,seed

    seed((C,dim))

    lab=sign(2*rand(dim) - 1)
    data=rand(dim, dim)
    # symmetrize the random matrix and add a diagonal ridge so the custom
    # kernel matrix is symmetric (the original example built symdata but
    # then passed the raw, non-symmetric matrix)
    symdata=data*data.T + diag(ones(dim))

    kernel=CustomKernel()
    kernel.set_full_kernel_matrix_from_full(symdata)
    labels=BinaryLabels(lab)
    svm=LibSVM(C, kernel, labels)
    svm.train()
    predictions = svm.apply()
    out=svm.apply().get_labels()
    return svm,out

if __name__=='__main__':
    print('custom_kernel')
    classifier_custom_kernel_modular(*parameter_list[0])
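# A custom kernel matrix must be symmetric and, to be a valid kernel, positive
# semi-definite. Below is a minimal NumPy sketch, independent of Shogun, of
# building a Gram matrix that satisfies both properties by construction; all
# names here are illustrative only.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(5, 7)   # 7 examples in columns, 5 features each

# linear-kernel Gram matrix: K[i,j] = <x_i, x_j>
K = X.T.dot(X)

assert np.allclose(K, K.T)                       # symmetric
assert np.all(np.linalg.eigvalsh(K) >= -1e-10)   # eigenvalues non-negative up to rounding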
# In this example we demonstrate how to use SVMs in a domain adaptation
# scenario. Here, we assume that we have two problem domains, one with
# an abundance of training data (source domain) and one with only a few
# training examples (target domain). These domains are assumed to be
# different but related enough to transfer information between them.
# Thus, we first train an SVM on the source domain and then subsequently
# pass this previously trained SVM object to the DASVM, which we train
# on the target domain. The DASVM internally computes a custom linear term
# (for the underlying quadratic program of the dual formulation of the SVM)
# based on the support vectors of the source SVM and the training examples
# of the target SVM. Finally, it can be used for prediction just as any other
# SVM object.
#
#!/usr/bin/env python
import numpy
from modshogun import StringCharFeatures, BinaryLabels, DNA
from modshogun import WeightedDegreeStringKernel
from modshogun import MSG_DEBUG

try:
    from modshogun import DomainAdaptationSVM
except ImportError:
    print("DomainAdaptationSVM not available")
    exit(0)

try:
    from modshogun import SVMLight
except ImportError:
    print("SVMLight not available")
    exit(0)

traindna = ['CGCACGTACGTAGCTCGAT', 'CGACGTAGTCGTAGTCGTA', 'CGACGGGGGGGGGGTCGTA',
            'CGACCTAGTCGTAGTCGTA', 'CGACCACAGTTATATAGTA', 'CGACGTAGTCGTAGTCGTA',
            'CGACGTAGTTTTTTTCGTA', 'CGACGTAGTCGTAGCCCCA', 'CAAAAAAAAAAAAAAAATA',
            'CGACGGGGGGGGGGGCGTA']
label_traindna = numpy.array(5*[-1.0] + 5*[1.0])
testdna = ['AGCACGTACGTAGCTCGAT', 'AGACGTAGTCGTAGTCGTA', 'CAACGGGGGGGGGGTCGTA',
           'CGACCTAGTCGTAGTCGTA', 'CGAACACAGTTATATAGTA', 'CGACCTAGTCGTAGTCGTA',
           'CGACGTGGGGTTTTTCGTA', 'CGACGTAGTCCCAGCCCCA', 'CAAAAAAAAAAAACCAATA',
           'CGACGGCCGGGGGGGCGTA']
label_testdna = numpy.array(5*[-1.0] + 5*[1.0])

traindna2 = ['AGACAGTCAGTCGATAGCT', 'AGCAGTCGTAGTCGTAGTC', 'AGCAGGGGGGGGGGTAGTC',
             'AGCAATCGTAGTCGTAGTC', 'AGCAACACGTTCTCTCGTC', 'AGCAGTCGTAGTCGTAGTC',
             'AGCAGTCGTTTTTTTAGTC', 'AGCAGTCGTAGTCGAAAAC', 'ACCCCCCCCCCCCCCCCTC',
             'AGCAGGGGGGGGGGGAGTC']
label_traindna2 = numpy.array(5*[-1.0] + 5*[1.0])
testdna2 = ['CGACAGTCAGTCGATAGCT', 'CGCAGTCGTAGTCGTAGTC', 'ACCAGGGGGGGGGGTAGTC',
            'AGCAATCGTAGTCGTAGTC', 'AGCCACACGTTCTCTCGTC', 'AGCAATCGTAGTCGTAGTC',
            'AGCAGTGGGGTTTTTAGTC', 'AGCAGTCGTAAACGAAAAC', 'ACCCCCCCCCCCCAACCTC',
            'AGCAGGAAGGGGGGGAGTC']
label_testdna2 = numpy.array(5*[-1.0] + 5*[1.0])

parameter_list = [[traindna,testdna,label_traindna,label_testdna,traindna2,label_traindna2,
                   testdna2,label_testdna2,1,3],
                  [traindna,testdna,label_traindna,label_testdna,traindna2,label_traindna2,
                   testdna2,label_testdna2,2,5]]

def classifier_domainadaptationsvm_modular (fm_train_dna=traindna, fm_test_dna=testdna,
        label_train_dna=label_traindna, label_test_dna=label_testdna,
        fm_train_dna2=traindna2, fm_test_dna2=testdna2,
        label_train_dna2=label_traindna2, label_test_dna2=label_testdna2, C=1, degree=3):

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    #####################################
    #print("obtaining DA SVM from previously trained SVM")

    # the DASVM is trained on the target domain data (the original flattened
    # code reused the source domain features and labels here)
    feats_train2 = StringCharFeatures(fm_train_dna2, DNA)
    feats_test2 = StringCharFeatures(fm_test_dna2, DNA)
    kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
    labels2 = BinaryLabels(label_train_dna2)

    # we regularize against the previously obtained solution
    dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
    dasvm.train()

    out = dasvm.apply_binary(feats_test2)

    return out #,dasvm TODO

if __name__=='__main__':
    print('SVMLight')
    classifier_domainadaptationsvm_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import array,hstack
from numpy.random import seed, rand
from tools.load import LoadMatrix

lm=LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')

parameter_list = [[traindat,testdat,label_traindat]]

def classifier_featureblock_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup
    try:
        from modshogun import FeatureBlockLogisticRegression
    except ImportError:
        print("FeatureBlockLogisticRegression not available")
        exit(0)

    # use the function arguments (the original referenced the global traindat)
    features = RealFeatures(hstack((fm_train,fm_train)))
    labels = BinaryLabels(hstack((label_train,label_train)))

    n_features = features.get_num_features()
    block_one = IndexBlock(0,n_features//2)
    block_two = IndexBlock(n_features//2,n_features)
    block_group = IndexBlockGroup()
    block_group.add_block(block_one)
    block_group.add_block(block_two)

    mtlr = FeatureBlockLogisticRegression(0.1,features,labels,block_group)
    mtlr.set_regularization(1) # use regularization ratio
    mtlr.set_tolerance(1e-2)   # use 1e-2 tolerance
    mtlr.train()
    out = mtlr.apply().get_labels()

    return out

if __name__=='__main__':
    print('FeatureBlockLogisticRegression')
    classifier_featureblock_logistic_regression(*parameter_list[0])
# In this example a multi-class support vector machine is trained on a toy data
# set and the trained classifier is then used to predict labels of test
# examples. The training algorithm is based on the BSVM formulation (L2-soft
# margin and the bias added to the objective function), which is solved by the
# Improved Mitchell-Demyanov-Malozemov algorithm. The training algorithm uses
# the Gaussian kernel of width 2.1 and the regularization constant C=1. The
# solver stops if the relative duality gap falls below 1e-5.
#
# For more details on the used SVM solver see
# V. Franc: Optimization Algorithms for Kernel Methods. Research report.
# CTU-CMP-2005-22. CTU FEL Prague. 2005.
# ftp://cmp.felk.cvut.cz/pub/cmp/articles/franc/Franc-PhD.pdf .
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_multiclass.dat'

parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]

def classifier_gmnpsvm_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,width=2.1,C=1,epsilon=1e-5):
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel, GMNPSVM, CSVFile

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))
    labels=MulticlassLabels(CSVFile(label_fname))

    kernel=GaussianKernel(feats_train, feats_train, width)

    svm=GMNPSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train(feats_train)
    out=svm.apply(feats_test).get_labels()
    return out,kernel

if __name__=='__main__':
    print('GMNPSVM')
    classifier_gmnpsvm_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is then used to predict labels of test
# examples. As training algorithm the Gradient Projection Decomposition Technique
# (GPDT) is used with SVM regularization parameter C=1 and a Gaussian
# kernel of width 2.1. The solver returns an epsilon-precise (epsilon=1e-5) solution.
#
# For more details on the GPDT solver see http://dm.unife.it/gpdt .
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'

parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]

def classifier_gpbtsvm_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,width=2.1,C=1,epsilon=1e-5):
    from modshogun import RealFeatures, BinaryLabels
    from modshogun import GaussianKernel
    from modshogun import CSVFile
    try:
        from modshogun import GPBTSVM
    except ImportError:
        print("GPBTSVM not available")
        exit(0)

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))
    labels=BinaryLabels(CSVFile(label_fname))

    kernel=GaussianKernel(feats_train, feats_train, width)

    svm=GPBTSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()

if __name__=='__main__':
    print('GPBTSVM')
    classifier_gpbtsvm_modular(*parameter_list[0])
# In this example a multi-class support vector machine classifier is trained on a
# toy data set and the trained classifier is then used to predict labels of test
# examples. As training algorithm the LaRank algorithm is used with SVM
# regularization parameter C=1 and a Gaussian kernel of width 2.1 and a precision
# set to epsilon=1e-5.
#
# For more details on LaRank see
# Bordes, A. and Bottou, L. and Gallinari, P. and Weston, J.
# Solving MultiClass Support Vector Machines with LaRank. ICML 2007.
#
#!/usr/bin/env python
from numpy import *

parameter_list = [[10,3,15,0.9,1,2000,1],[20,4,15,0.9,1,5000,2]]

def classifier_larank_modular (num_vec,num_class,distance,C=0.9,num_threads=1,num_iter=5,seed=1):
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel
    from modshogun import LaRank
    from modshogun import Math_init_random

    # reproducible results
    Math_init_random(seed)
    random.seed(seed)

    # generate some training data where each class pair is linearly separable
    label_train=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
    label_test=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
    fm_train=array(random.randn(num_class,num_vec))
    fm_test=array(random.randn(num_class,num_vec))
    for i in range(len(label_train)):
        # labels are stored as floats but used as integer row indices here
        fm_train[int(label_train[i]),i]+=distance
        fm_test[int(label_test[i]),i]+=distance

    feats_train=RealFeatures(fm_train)
    feats_test=RealFeatures(fm_test)

    width=2.1
    kernel=GaussianKernel(feats_train, feats_train, width)

    epsilon=1e-5
    labels=MulticlassLabels(label_train)

    svm=LaRank(C, kernel, labels)
    #svm.set_tau(1e-3)
    svm.set_batch_mode(False)
    #svm.io.enable_progress()
    svm.set_epsilon(epsilon)
    svm.train()
    out=svm.apply(feats_test).get_labels()
    predictions = svm.apply()
    return predictions, svm, predictions.get_labels()

if __name__=='__main__':
    print('LaRank')
    [predictions, svm, labels] = classifier_larank_modular(*parameter_list[0])
# In this example a two-class linear classifier based on the Linear Discriminant
# Analysis (LDA) is trained on a toy data set and then the trained classifier is
# used to predict test examples. The regularization parameter, which corresponds
# to a weight of a unitary matrix added to the covariance matrix, is set to
# gamma=3.
#
# For more details on the LDA see e.g.
# http://en.wikipedia.org/wiki/Linear_discriminant_analysis
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'

parameter_list = [[traindat,testdat,label_traindat,3,1],[traindat,testdat,label_traindat,4,1]]

def classifier_lda_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,gamma=3,num_threads=1):
    from modshogun import RealFeatures, BinaryLabels, LDA, CSVFile

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))
    labels=BinaryLabels(CSVFile(label_fname))

    lda=LDA(gamma, feats_train, labels)
    lda.train()

    bias=lda.get_bias()
    w=lda.get_w()
    predictions = lda.apply(feats_test).get_labels()
    return lda,predictions

if __name__=='__main__':
    print('LDA')
    classifier_lda_modular(*parameter_list[0])
# In this example a one-class support vector machine classifier is trained on a
# toy data set. The training algorithm finds a hyperplane in the RKHS which
# separates the training data from the origin. The one-class classifier is
# typically used to estimate the support of a high-dimensional distribution.
# For more details see e.g.
# B. Schoelkopf et al. Estimating the support of a high-dimensional
# distribution. Neural Computation, 13, 2001, 1443-1471.
#
# In the example, the one-class SVM is trained by the LIBSVM solver with the
# regularization parameter C=1 and the Gaussian kernel of width 2.1 and the
# precision parameter epsilon=1e-5.
#
# For more details on the LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'

parameter_list = [[traindat,testdat,2.2,1,1e-7],[traindat,testdat,2.1,1,1e-5]]

def classifier_libsvmoneclass_modular (train_fname=traindat,test_fname=testdat,width=2.1,C=1,epsilon=1e-5):
    from modshogun import RealFeatures, GaussianKernel, LibSVMOneClass, CSVFile

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))

    kernel=GaussianKernel(feats_train, feats_train, width)

    svm=LibSVMOneClass(C, kernel)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()

if __name__=='__main__':
    print('LibSVMOneClass')
    classifier_libsvmoneclass_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Minimal Primal Dual SVM is used with SVM
# regularization parameter C=1 and a Gaussian kernel of width 1.2 and the
# precision parameter 1e-5.
#
# For more details on the MPD solver see
# Kienzle, W. and B. Schölkopf: Training Support Vector Machines with Multiple
# Equality Constraints. Machine Learning: ECML 2005, 182-193. (Eds.) Carbonell,
# J. G., J. Siekmann, Springer, Berlin, Germany (11 2005)
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'

parameter_list = [[traindat,testdat,label_traindat,1,1e-5],[traindat,testdat,label_traindat,0.9,1e-5]]

def classifier_mpdsvm_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=1,epsilon=1e-5):
    from modshogun import RealFeatures, BinaryLabels
    from modshogun import GaussianKernel
    from modshogun import MPDSVM, CSVFile

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))
    labels=BinaryLabels(CSVFile(label_fname))

    width=2.1
    kernel=GaussianKernel(feats_train, feats_train, width)

    svm=MPDSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()

if __name__=='__main__':
    print('MPDSVM')
    classifier_mpdsvm_modular(*parameter_list[0])
#!/usr/bin/env python
import re
import time

from tools.multiclass_shared import prepare_data

# run with toy data
[traindat, label_traindat, testdat, label_testdat] = prepare_data()
# run with opt-digits if available
#[traindat, label_traindat, testdat, label_testdat] = prepare_data(False)

parameter_list = [[traindat,testdat,label_traindat,label_testdat,2.1,1,1e-5]]

def classifier_multiclass_ecoc (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,lawidth=2.1,C=1,epsilon=1e-5):
    import modshogun
    from modshogun import ECOCStrategy, LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
    from modshogun import MulticlassAccuracy
    from modshogun import RealFeatures, MulticlassLabels

    def nonabstract_class(name):
        try:
            getattr(modshogun, name)()
        except TypeError:
            return False
        return True

    encoders = [x for x in dir(modshogun) if re.match(r'ECOC.+Encoder', x) and nonabstract_class(x)]
    decoders = [x for x in dir(modshogun) if re.match(r'ECOC.+Decoder', x) and nonabstract_class(x)]

    fea_train = RealFeatures(fm_train_real)
    fea_test = RealFeatures(fm_test_real)
    gnd_train = MulticlassLabels(label_train_multiclass)
    if label_test_multiclass is None:
        gnd_test = None
    else:
        gnd_test = MulticlassLabels(label_test_multiclass)

    base_classifier = LibLinear(L2R_L2LOSS_SVC)
    base_classifier.set_bias_enabled(True)

    #print('Testing with %d encoders and %d decoders' % (len(encoders), len(decoders)))
    #print('-' * 70)
    #format_str = '%%15s + %%-10s %%-10%s %%-10%s %%-10%s'
    #print((format_str % ('s', 's', 's')) % ('encoder', 'decoder', 'codelen', 'time', 'accuracy'))

    def run_ecoc(ier, idr):
        encoder = getattr(modshogun, encoders[ier])()
        decoder = getattr(modshogun, decoders[idr])()

        # whether encoder is data dependent
        if hasattr(encoder, 'set_labels'):
            encoder.set_labels(gnd_train)
            encoder.set_features(fea_train)

        strategy = ECOCStrategy(encoder, decoder)
        classifier = LinearMulticlassMachine(strategy, fea_train, base_classifier, gnd_train)
        classifier.train()
        label_pred = classifier.apply(fea_test)
        if gnd_test is not None:
            evaluator = MulticlassAccuracy()
            acc = evaluator.evaluate(label_pred, gnd_test)
        else:
            acc = None

        return (classifier.get_num_machines(), acc)

    for ier in range(len(encoders)):
        for idr in range(len(decoders)):
            t_begin = time.time()  # time.clock() was removed in Python 3.8
            (codelen, acc) = run_ecoc(ier, idr)
            if acc is None:
                acc_fmt = 's'
                acc = 'N/A'
            else:
                acc_fmt = '.4f'

            t_elapse = time.time() - t_begin
            #print((format_str % ('d', '.3f', acc_fmt)) %
            #      (encoders[ier][4:-7], decoders[idr][4:-7], codelen, t_elapse, acc))

if __name__=='__main__':
    print('MulticlassECOC')
    classifier_multiclass_ecoc(*parameter_list[0])
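# The ECOC idea itself is compact: every class gets a binary codeword, one
# binary classifier is trained per codeword bit, and a test example is assigned
# the class whose codeword is closest (e.g. in Hamming distance) to the vector
# of binary predictions. A minimal NumPy sketch of the decoding step, assuming
# a one-vs-all code matrix; this illustrates the principle, not Shogun's
# ECOCStrategy internals.
import numpy as np

# rows: 3 classes, columns: 3 binary problems; an entry of +1/-1 says how
# each class is labeled in each binary problem (here: one-vs-all coding)
code = np.array([[+1, -1, -1],
                 [-1, +1, -1],
                 [-1, -1, +1]])

# hypothetical outputs of the 3 trained binary classifiers for one test example
pred = np.array([-1, +1, -1])

# decode: pick the class whose codeword has the smallest Hamming distance
hamming = np.sum(code != pred, axis=1)
print('predicted class:', np.argmin(hamming))   # -> 1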
#!/usr/bin/env python
from tools.multiclass_shared import prepare_data

[traindat, label_traindat, testdat, label_testdat] = prepare_data(False)

parameter_list = [[traindat,testdat,label_traindat,label_testdat,2.1,1,1e-5],[traindat,testdat,label_traindat,label_testdat,2.2,1,1e-5]]

def classifier_multiclassliblinear_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,width=2.1,C=1,epsilon=1e-5):
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import MulticlassLibLinear

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    labels=MulticlassLabels(label_train_multiclass)

    classifier = MulticlassLibLinear(C,feats_train,labels)
    classifier.train()

    label_pred = classifier.apply(feats_test)
    out = label_pred.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc = evaluator.evaluate(label_pred, labels_test)
        print('Accuracy = %.4f' % acc)

    return out

if __name__=='__main__':
    print('MulticlassLibLinear')
    classifier_multiclassliblinear_modular(*parameter_list[0])
#!/usr/bin/env python
from tools.multiclass_shared import prepare_data

[traindat, label_traindat, testdat, label_testdat] = prepare_data()

parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]

def classifier_multiclassmachine_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel
    from modshogun import LibSVM, KernelMulticlassMachine, MulticlassOneVsRestStrategy

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    kernel=GaussianKernel(feats_train, feats_train, width)

    labels=MulticlassLabels(label_train_multiclass)

    classifier = LibSVM()
    classifier.set_epsilon(epsilon)
    #print labels.get_labels()
    mc_classifier = KernelMulticlassMachine(MulticlassOneVsRestStrategy(),kernel,classifier,labels)
    mc_classifier.train()

    kernel.init(feats_train, feats_test)
    out = mc_classifier.apply().get_labels()
    return out

if __name__=='__main__':
    print('MulticlassMachine')
    classifier_multiclassmachine_modular(*parameter_list[0])
#!/usr/bin/env python
from numpy import *

parameter_list = [[10,3,15,2.1,1,1e-5,1],[20,4,15,2.2,2,1e-5,2]]

def classifier_multiclassocas_modular (num_vec=10,num_class=3,distance=15,width=2.1,C=1,epsilon=1e-5,seed=1):
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import Math_init_random
    try:
        from modshogun import MulticlassOCAS
    except ImportError:
        print("MulticlassOCAS not available")
        return

    # reproducible results
    random.seed(seed)
    Math_init_random(seed)

    # generate some training data where each class pair is linearly separable
    label_train=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
    label_test=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
    fm_train=array(random.randn(num_class,num_vec))
    fm_test=array(random.randn(num_class,num_vec))
    for i in range(len(label_train)):
        # labels are stored as floats but used as integer row indices here
        fm_train[int(label_train[i]),i]+=distance
        fm_test[int(label_test[i]),i]+=distance

    feats_train=RealFeatures(fm_train)
    feats_test=RealFeatures(fm_test)

    labels=MulticlassLabels(label_train)

    classifier = MulticlassOCAS(C,feats_train,labels)
    classifier.train()

    out = classifier.apply(feats_test).get_labels()
    #print label_test
    #print out
    return out,classifier

if __name__=='__main__':
    print('MulticlassOCAS')
    classifier_multiclassocas_modular(*parameter_list[0])
#!/usr/bin/env python
from tools.multiclass_shared import prepare_data

[traindat, label_traindat, testdat, label_testdat] = prepare_data(False)

parameter_list = [[traindat,testdat,label_traindat,label_testdat,2.1,1,1e-5],[traindat,testdat,label_traindat,label_testdat,2.2,1,1e-5]]

def classifier_multilabeloutputliblinear_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,width=2.1,C=1,epsilon=1e-5):
    from modshogun import RealFeatures, MulticlassLabels, MultilabelLabels
    from modshogun import MulticlassLibLinear

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    labels=MulticlassLabels(label_train_multiclass)

    classifier = MulticlassLibLinear(C,feats_train,labels)
    classifier.train()

    label_pred = classifier.apply_multilabel_output(feats_test,2)
    out = label_pred.get_labels()
    #print out
    return out

if __name__=='__main__':
    print('MultilabelOutputLibLinear')
    classifier_multilabeloutputliblinear_modular(*parameter_list[0])
# This example shows usage of the Perceptron algorithm for training a two-class
# linear classifier, i.e. y = sign( <x,w>+b). The Perceptron algorithm works by
# iteratively passing through the training examples and applying the update rule
# on those examples which are misclassified by the current classifier. The
# Perceptron update rule reads
#
#   w(t+1) = w(t) + alpha * y_t * x_t
#   b(t+1) = b(t) + alpha * y_t
#
# where (x_t,y_t) is the feature vector and label (must be +1/-1) of the misclassified example,
# (w(t),b(t)) are the current parameters of the linear classifier,
# (w(t+1),b(t+1)) are the new parameters of the linear classifier, and
# alpha is the learning rate; in this example alpha=1.
#
# The Perceptron algorithm iterates until all training examples are correctly
# classified or the prescribed maximal number of iterations, in this example
# max_iter=1000, is reached.
#!/usr/bin/env python
from numpy import *

parameter_list = [[100, 2, 5, 1., 1000, 1, 1], [100, 2, 5, 1., 1000, 1, 2]]

def classifier_perceptron_modular (n=100, dim=2, distance=5, learn_rate=1., max_iter=1000, num_threads=1, seed=1):
    from modshogun import RealFeatures, BinaryLabels
    from modshogun import Perceptron

    random.seed(seed)

    # produce some (probably) linearly separable training data by hand
    # Two Gaussians at a far enough distance
    X=array(random.randn(dim,n))+distance
    Y=array(random.randn(dim,n))-distance
    X_test=array(random.randn(dim,n))+distance
    Y_test=array(random.randn(dim,n))-distance
    label_train_twoclass=hstack((ones(n), -ones(n)))
    #plot(X[0,:], X[1,:], 'x', Y[0,:], Y[1,:], 'o')

    fm_train_real=hstack((X,Y))
    fm_test_real=hstack((X_test,Y_test))

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    labels=BinaryLabels(label_train_twoclass)

    perceptron=Perceptron(feats_train, labels)
    perceptron.set_learn_rate(learn_rate)
    perceptron.set_max_iter(max_iter)
    # only guaranteed to converge for separable data
    perceptron.train()

    perceptron.set_features(feats_test)
    out_labels = perceptron.apply().get_labels()

    return perceptron, out_labels

if __name__=='__main__':
    print('Perceptron')
    classifier_perceptron_modular(*parameter_list[0])
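# For reference, the update rule quoted above fits in a few lines of plain
# NumPy. This is a sketch of the textbook algorithm, not Shogun's Perceptron
# implementation; X stores examples in columns, matching Shogun's RealFeatures
# layout.
import numpy as np

def perceptron_train(X, y, alpha=1.0, max_iter=1000):
    dim, n = X.shape
    w, b = np.zeros(dim), 0.0
    for _ in range(max_iter):
        errors = 0
        for t in range(n):
            if y[t] * (w.dot(X[:, t]) + b) <= 0:   # example t is misclassified
                w += alpha * y[t] * X[:, t]        # w(t+1) = w(t) + alpha * y_t * x_t
                b += alpha * y[t]                  # b(t+1) = b(t) + alpha * y_t
                errors += 1
        if errors == 0:   # all training examples correctly classified
            break
    return w, b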
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Written (W) 2014 Soumyajit De
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')

parameter_list = [[traindat,testdat,label_traindat,1,5,0.9]]

def classifier_ssk_modular (fm_train_dna=traindat,fm_test_dna=testdat, label_train_dna=label_traindat,C=1,maxlen=1,decay=1):
    from modshogun import StringCharFeatures, BinaryLabels
    from modshogun import LibSVM, SubsequenceStringKernel, DNA
    from modshogun import ErrorRateMeasure

    feats_train=StringCharFeatures(fm_train_dna, DNA)
    feats_test=StringCharFeatures(fm_test_dna, DNA)
    labels=BinaryLabels(label_train_dna)

    kernel=SubsequenceStringKernel(feats_train, feats_train, maxlen, decay)

    svm=LibSVM(C, kernel, labels)
    svm.train()

    out=svm.apply(feats_train)
    evaluator = ErrorRateMeasure()
    trainerr = evaluator.evaluate(out,labels)
    # print(trainerr)

    kernel.init(feats_train, feats_test)
    predicted_labels=svm.apply(feats_test).get_labels()
    # print(predicted_labels)

    return predicted_labels

if __name__=='__main__':
    print('SubsequenceStringKernel classification DNA')
    classifier_ssk_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=1 and the Weighted Degree kernel of degree 20 and
# a precision parameter epsilon=1e-5. The LINADD trick is used to speed up
# training.
#
# For more details on SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel and the LINADD trick see
# Sonnenburg, S. and Rätsch, G. and Rieck, K. Large Scale Learning with String
# Kernels. In Bottou, Leon and Chapelle, Olivier and DeCoste, Dennis and Weston,
# Jason, editor, In Large Scale Kernel Machines, pages 73-103, MIT Press,
# Cambridge, MA. 2007.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
train_dna=lm.load_dna('../data/fm_train_dna.dat')
test_dna=lm.load_dna('../data/fm_test_dna.dat')
label=lm.load_labels('../data/label_train_dna.dat')

parameter_list=[[train_dna, test_dna, label, 20, 0.9, 1e-7, 1], [train_dna, test_dna, label, 20, 2.3, 1e-7, 4]]

def classifier_svmlight_batch_linadd_modular (fm_train_dna, fm_test_dna, label_train_dna, degree, C, epsilon, num_threads):
    from modshogun import StringCharFeatures, BinaryLabels, DNA
    from modshogun import WeightedDegreeStringKernel, MSG_DEBUG
    try:
        from modshogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    feats_train=StringCharFeatures(DNA)
    #feats_train.io.set_loglevel(MSG_DEBUG)
    feats_train.set_features(fm_train_dna)
    feats_test=StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    # use the degree argument (the original re-assigned degree=20 here)
    kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels=BinaryLabels(label_train_dna)

    svm=SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    #print('SVMLight Objective: %f num_sv: %d' % \
    #    (svm.get_objective(), svm.get_num_support_vectors()))
    svm.set_batch_computation_enabled(False)
    svm.set_linadd_enabled(False)
    svm.apply().get_labels()

    svm.set_batch_computation_enabled(True)
    labels = svm.apply().get_labels()

    return labels, svm

if __name__=='__main__':
    print('SVMlight batch')
    classifier_svmlight_batch_linadd_modular(*parameter_list[0])
# This example demonstrates how to train an SVMLight classifier
# using a custom linear term. This is used in the class DASVM, which
# pre-computes this linear term using a previously trained SVM.
#
#!/usr/bin/env python
import numpy

traindna=['CGCACGTACGTAGCTCGAT', 'CGACGTAGTCGTAGTCGTA', 'CGACGGGGGGGGGGTCGTA',
          'CGACCTAGTCGTAGTCGTA', 'CGACCACAGTTATATAGTA', 'CGACGTAGTCGTAGTCGTA',
          'CGACGTAGTTTTTTTCGTA', 'CGACGTAGTCGTAGCCCCA', 'CAAAAAAAAAAAAAAAATA',
          'CGACGGGGGGGGGGGCGTA']
label_traindna=numpy.array(5*[-1.0] + 5*[1.0])
testdna=['AGCACGTACGTAGCTCGAT', 'AGACGTAGTCGTAGTCGTA', 'CAACGGGGGGGGGGTCGTA',
         'CGACCTAGTCGTAGTCGTA', 'CGAACACAGTTATATAGTA', 'CGACCTAGTCGTAGTCGTA',
         'CGACGTGGGGTTTTTCGTA', 'CGACGTAGTCCCAGCCCCA', 'CAAAAAAAAAAAACCAATA',
         'CGACGGCCGGGGGGGCGTA']
label_test_dna=numpy.array(5*[-1.0] + 5*[1.0])

parameter_list = [[traindna,testdna,label_traindna,3,10,1e-5,1],[traindna,testdna,label_traindna,3,10,1e-5,1]]

def classifier_svmlight_linear_term_modular (fm_train_dna=traindna,fm_test_dna=testdna,
        label_train_dna=label_traindna,degree=3,
        C=10,epsilon=1e-5,num_threads=1):

    from modshogun import StringCharFeatures, BinaryLabels, DNA
    from modshogun import WeightedDegreeStringKernel
    try:
        from modshogun import SVMLight
    except ImportError:
        print("SVMLight is not available")
        exit(0)

    feats_train=StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test=StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels=BinaryLabels(label_train_dna)

    svm=SVMLight(C, kernel, labels)
    svm.set_qpsize(3)
    svm.set_linear_term(-numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double))
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)
    out = svm.apply().get_labels()

    return out,kernel

if __name__=='__main__':
    print('SVMLight')
    classifier_svmlight_linear_term_modular(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=1.2 and the Weighted Degree kernel of degree 20 and
# the precision parameter epsilon=1e-5.
#
# For more details on SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel see
# G. Raetsch, S. Sonnenburg, and B. Schoelkopf. RASE: recognition of alternatively
# spliced exons in C. elegans. Bioinformatics, 21:369-377, June 2005.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')

parameter_list = [[traindat,testdat,label_traindat,1.1,1e-5,1],[traindat,testdat,label_traindat,1.2,1e-5,1]]

def classifier_svmlight_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,C=1.2,epsilon=1e-5,num_threads=1):
    from modshogun import StringCharFeatures, BinaryLabels, DNA
    from modshogun import WeightedDegreeStringKernel
    try:
        from modshogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    feats_train=StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test=StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    degree=20
    kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels=BinaryLabels(label_train_dna)

    svm=SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.apply().get_labels()
    return kernel

if __name__=='__main__':
    print('SVMLight')
    classifier_svmlight_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier (SVM) is
# trained on a toy data set and the trained classifier is used to predict labels
# of test examples. As training algorithm the SVMLIN solver is used with the SVM
# regularization parameter C=0.9 and the bias in the classification rule switched
# on and the precision parameter epsilon=1e-5. The example also shows how to
# retrieve the parameters (vector w and bias b) of the trained linear classifier.
#
# For more details on the SVMLIN solver see
# V. Sindhwani, S.S. Keerthi. Newton Methods for Fast Solution of Semi-supervised
# Linear SVMs. Large Scale Kernel Machines MIT Press (Book Chapter), 2007
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'

parameter_list = [[traindat,testdat,label_traindat,0.9,1e-5,1],[traindat,testdat,label_traindat,0.8,1e-5,1]]

def classifier_svmlin_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=0.9,epsilon=1e-5,num_threads=1):
    from modshogun import RealFeatures, SparseRealFeatures, BinaryLabels
    from modshogun import SVMLin, CSVFile

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))
    labels=BinaryLabels(CSVFile(label_fname))

    svm=SVMLin(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(True)
    svm.train()

    bias=svm.get_bias()
    w=svm.get_w()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()

if __name__=='__main__':
    print('SVMLin')
    classifier_svmlin_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the OCAS solver is used with the SVM
# regularization parameter C=0.9 and the bias term in the classification rule
# switched off and the precision parameter epsilon=1e-5 (duality gap).
#
# For more details on the OCAS solver see
# V. Franc, S. Sonnenburg. Optimized Cutting Plane Algorithm for Large-Scale Risk
# Minimization. The Journal of Machine Learning Research, vol. 10,
# pp. 2157--2192. October 2009.
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'

parameter_list = [[traindat,testdat,label_traindat,0.9,1e-5,1],[traindat,testdat,label_traindat,0.8,1e-5,1]]

def classifier_svmocas_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=0.9,epsilon=1e-5,num_threads=1):
    from modshogun import RealFeatures, BinaryLabels
    from modshogun import CSVFile
    try:
        from modshogun import SVMOcas
    except ImportError:
        print("SVMOcas not available")
        return

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))
    labels=BinaryLabels(CSVFile(label_fname))

    svm=SVMOcas(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(False)
    svm.train()

    bias=svm.get_bias()
    w=svm.get_w()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()

if __name__=='__main__':
    print('SVMOcas')
    classifier_svmocas_modular(*parameter_list[0])
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Stochastic Gradient Descent (SGD) solver is
# used with the SVM regularization parameter C=0.9. The number of iterations, i.e.
# passes through all training examples, is set to num_iter=5.
#
# For more details on the SGD solver see
# L. Bottou, O. Bousquet. The tradeoff of large scale learning. In NIPS 20. MIT
# Press. 2008.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'
label_traindat = '../data/label_train_twoclass.dat'

parameter_list = [[traindat,testdat,label_traindat,0.9,1,6],[traindat,testdat,label_traindat,0.8,1,5]]

def classifier_svmsgd_modular (train_fname=traindat,test_fname=testdat,label_fname=label_traindat,C=0.9,num_threads=1,num_iter=5):
    from modshogun import RealFeatures, SparseRealFeatures, BinaryLabels
    from modshogun import SVMSGD, CSVFile

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))
    labels=BinaryLabels(CSVFile(label_fname))

    svm=SVMSGD(C, feats_train, labels)
    svm.set_epochs(num_iter)
    #svm.io.set_loglevel(0)
    svm.train()

    bias=svm.get_bias()
    w=svm.get_w()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()

if __name__=='__main__':
    print('SVMSGD')
    classifier_svmsgd_modular(*parameter_list[0])
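# For reference, one epoch of plain SGD on the L2-regularized hinge loss can be
# sketched as below. This illustrates the kind of update SVMSGD performs; the
# actual solver's learning-rate schedule and scaling tricks differ.
import numpy as np

def svmsgd_epoch(X, y, w, lam=0.01, eta=0.1):
    # X stores examples in columns, y holds +1/-1 labels
    for t in range(X.shape[1]):
        x_t, y_t = X[:, t], y[t]
        if y_t * w.dot(x_t) < 1:                  # margin violated: hinge loss active
            w = (1 - eta * lam) * w + eta * y_t * x_t
        else:                                     # only the regularizer contributes
            w = (1 - eta * lam) * w
    return w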
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data,10],[data,20]]

def converter_diffusionmaps_modular (data_fname,t):
    try:
        from modshogun import RealFeatures, DiffusionMaps, GaussianKernel, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = DiffusionMaps()
        converter.set_target_dim(1)
        converter.set_kernel(GaussianKernel(10,10.0))
        converter.set_t(t)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

if __name__=='__main__':
    print('DiffusionMaps')
    converter_diffusionmaps_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data]]

def converter_factoranalysis_modular(data_fname):
    try:
        import numpy
        from modshogun import RealFeatures, FactorAnalysis, EuclideanDistance, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = FactorAnalysis()
        converter.set_target_dim(2)
        embedding = converter.apply(features)

        X = embedding.get_feature_matrix()
        covdet = numpy.linalg.det(numpy.dot(X,X.T))

        return covdet > 0
    except ImportError:
        print('No Eigen3 available')

if __name__=='__main__':
    print('Factor Analysis')
    converter_factoranalysis_modular(*parameter_list[0])
#!/usr/bin/env python
strings=['example document 1','example document 2','example document 3','example document 4']

parameter_list=[[strings]]

def converter_hasheddoc_modular(strings):
    from modshogun import SparseRealFeatures, RAWBYTE, StringCharFeatures, Features, HashedDocDotFeatures
    from modshogun import NGramTokenizer
    from modshogun import HashedDocConverter
    from numpy import array

    #create string features
    f=StringCharFeatures(strings, RAWBYTE)

    #set the number of bits of the target dimension
    #means a dim of size 2^5=32
    num_bits=5

    #create the ngram tokenizer of size 8 to parse the strings
    tokenizer=NGramTokenizer(8)

    #normalize results
    normalize=True

    #create converter
    converter=HashedDocConverter(tokenizer, num_bits, normalize)

    converted_feats=converter.apply(f)

    #should expect 32
    #print('Converted features\' space dimensionality is', converted_feats.get_dim_feature_space())
    #print('Self dot product of string 0 with converted feats:', converted_feats.dot(0, converted_feats, 0))

    hashed_feats=HashedDocDotFeatures(num_bits, f, tokenizer, normalize)

    #print('Hashed features\' space dimensionality is', hashed_feats.get_dim_feature_space())
    #print('Self dot product of string 0 with hashed feats:', hashed_feats.dot(0, hashed_feats, 0))

    return converted_feats

if __name__=='__main__':
    print('HashedDocConverter')
    converter_hasheddoc_modular(*parameter_list[0])
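# The hashing trick behind HashedDocConverter is simple: every n-gram of the
# document is hashed into one of 2^num_bits buckets, and the bucket counts form
# a fixed-size feature vector. A pure-Python sketch of the idea, using the
# built-in hash() for illustration only (Shogun uses its own hash function, and
# Python's str hash is randomized per process):
num_bits = 5
dim = 2 ** num_bits   # 32 buckets, matching the example above

def hashed_ngram_features(text, n=8):
    vec = [0] * dim
    for i in range(len(text) - n + 1):
        vec[hash(text[i:i + n]) % dim] += 1   # hash each n-gram into a bucket
    return vec

print(hashed_ngram_features('example document 1'))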
# In this example toy data is being preprocessed using the Hessian Locally Linear Embedding algorithm
# as described in
#
# Donoho, D., & Grimes, C. (2003).
# Hessian eigenmaps: new tools for nonlinear dimensionality reduction.
# Proceedings of National Academy of Science (Vol. 100, pp. 5591-5596).
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data,20],[data,30]]

def converter_hessianlocallylinearembedding_modular (data_fname,k):
    try:
        from modshogun import RealFeatures, CSVFile
        try:
            from modshogun import HessianLocallyLinearEmbedding
        except ImportError:
            print("HessianLocallyLinearEmbedding not available")
            exit(0)

        # use the data_fname argument (the original referenced the global 'data')
        features = RealFeatures(CSVFile(data_fname))

        converter = HessianLocallyLinearEmbedding()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

if __name__=='__main__':
    print('HessianLocallyLinearEmbedding')
    converter_hessianlocallylinearembedding_modular(*parameter_list[0])
# In this example toy data is being processed using the Isomap algorithm
# as described in
#
# Silva, V. D., & Tenenbaum, J. B. (2003).
# Global versus local methods in nonlinear dimensionality reduction.
# Advances in Neural Information Processing Systems 15, 15(Figure 2), 721-728. MIT Press.
# Retrieved from http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.3407&rep=rep1&type=pdf
#
# Before applying to the data, the landmark approximation is enabled with a
# specified number of landmarks. The landmark approximation is described in
#
# Sparse multidimensional scaling using landmark points
# V De Silva, J B Tenenbaum (2004) Technology, p. 1-4
#
# After enabling the landmark approximation, the k parameter -- the number
# of neighbors in the k nearest neighbor graph -- is initialized.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data]]

def converter_isomap_modular (data_fname):
    from modshogun import RealFeatures, CSVFile
    from modshogun import Isomap

    # use the data_fname argument (the original referenced the global 'data')
    features = RealFeatures(CSVFile(data_fname))

    converter = Isomap()
    converter.set_k(20)
    converter.set_target_dim(1)
    converter.apply(features)

    return features

if __name__=='__main__':
    print('Isomap')
    #converter_isomap_modular(*parameter_list[0])
# In this example toy data is being processed using the kernel extension
# of the Locally Linear Embedding (LLE) algorithm as described in
#
# Kayo, O. (2006). Locally linear embedding algorithm. Extensions and applications. October.
# Retrieved from: http://herkules.oulu.fi/isbn9514280415/isbn9514280415.pd
#
# A linear kernel is used as the kernel of the extension.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data,20],[data,30]]

def converter_kernellocallylinearembedding_modular (data_fname,k):
    try:
        from modshogun import RealFeatures, LinearKernel, CSVFile
        try:
            from modshogun import KernelLocallyLinearEmbedding
        except ImportError:
            print("KernelLocallyLinearEmbedding not available")
            exit(0)

        features = RealFeatures(CSVFile(data_fname))

        kernel = LinearKernel()

        converter = KernelLocallyLinearEmbedding(kernel)
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

if __name__=='__main__':
    print('KernelLocallyLinearEmbedding')
    converter_kernellocallylinearembedding_modular(*parameter_list[0])
# In this example toy data is being processed using the Laplacian Eigenmaps
# algorithm as described in
#
# Belkin, M., & Niyogi, P. (2002).
# Laplacian Eigenmaps and Spectral Techniques for Embedding and Clustering.
# Science, 14, 585-591. MIT Press.
# Retrieved from http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.19.9400&rep=rep1&type=pdf
#
# The number of neighbors for the kNN graph and the heat distribution
# coefficient are set before processing the data.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data,20],[data,30]]

def converter_laplacianeigenmaps_modular (data_fname,k):
    try:
        from modshogun import RealFeatures, CSVFile
        try:
            from modshogun import LaplacianEigenmaps
        except ImportError:
            print("LaplacianEigenmaps not available")
            exit(0)

        features = RealFeatures(CSVFile(data_fname))

        converter = LaplacianEigenmaps()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.set_tau(20.0)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

if __name__=='__main__':
    print('LaplacianEigenmaps')
    converter_laplacianeigenmaps_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data,20],[data,30]]

def converter_linearlocaltangentspacealignment_modular (data_fname,k):
    try:
        from modshogun import RealFeatures, CSVFile
        try:
            from modshogun import LinearLocalTangentSpaceAlignment
        except ImportError:
            print("LinearLocalTangentSpaceAlignment not available")
            exit(0)

        features = RealFeatures(CSVFile(data_fname))

        converter = LinearLocalTangentSpaceAlignment()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

if __name__=='__main__':
    print('LinearLocalTangentSpaceAlignment')
    converter_linearlocaltangentspacealignment_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data,20],[data,30]]

def converter_localitypreservingprojections_modular (data_fname,k):
    from modshogun import RealFeatures, CSVFile
    from modshogun import LocalityPreservingProjections

    features = RealFeatures(CSVFile(data_fname))

    converter = LocalityPreservingProjections()
    converter.set_target_dim(1)
    converter.set_k(k)
    converter.set_tau(2.0)
    converter.apply(features)

    return features

if __name__=='__main__':
    print('LocalityPreservingProjections')
    #converter_localitypreservingprojections_modular(*parameter_list[0])
# In this example toy data is being preprocessed using the Locally Linear Embedding (LLE)
# algorithm as described in
#
# Saul, L. K., & Roweis, S. T. (2001).
# An Introduction to Locally Linear Embedding.
# Retrieved from: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7319&rep=rep1&type=pdf
#
# The number of neighbors used during the linear reconstruction step of the algorithm is set
# before processing of the data.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data,20],[data,30]]

def converter_locallylinearembedding_modular (data_fname,k):
    try:
        from modshogun import RealFeatures, CSVFile
        try:
            from modshogun import LocallyLinearEmbedding
        except ImportError:
            print("LocallyLinearEmbedding not available")
            exit(0)

        features = RealFeatures(CSVFile(data_fname))

        converter = LocallyLinearEmbedding()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

if __name__=='__main__':
    print('LocallyLinearEmbedding')
    converter_locallylinearembedding_modular(*parameter_list[0])
# In this example toy data is being processed using the Local Tangent Space
# Alignment (LTSA) algorithm as described in
#
# Zhang, Z., & Zha, H. (2002). Principal Manifolds
# and Nonlinear Dimension Reduction via Local Tangent Space Alignment.
# Journal of Shanghai University English Edition, 8(4), 406-424. SIAM.
# Retrieved from http://arxiv.org/abs/cs/0212008
#
# Before processing, the number of neighbors for computing the local tangent
# space is set.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data,20],[data,30]]

def converter_localtangentspacealignment_modular (data_fname,k):
    try:
        from modshogun import RealFeatures, CSVFile
        try:
            from modshogun import LocalTangentSpaceAlignment
        except ImportError:
            print("LocalTangentSpaceAlignment not available")
            exit(0)

        features = RealFeatures(CSVFile(data_fname))

        converter = LocalTangentSpaceAlignment()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

if __name__=='__main__':
    print('LocalTangentSpaceAlignment')
    converter_localtangentspacealignment_modular(*parameter_list[0])
# In this example toy data is being processed using multidimensional
# scaling as described on p.261 (Section 12.1) of
#
# Borg, I., & Groenen, P. J. F. (2005).
# Modern multidimensional scaling: Theory and applications. Springer.
#
# Before processing, the landmark approximation is disabled.
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data]]

def converter_multidimensionalscaling_modular (data_fname):
    try:
        import numpy
        from modshogun import RealFeatures, MultidimensionalScaling, EuclideanDistance, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        distance_before = EuclideanDistance()
        distance_before.init(features,features)

        converter = MultidimensionalScaling()
        converter.set_target_dim(2)
        converter.set_landmark(False)
        embedding = converter.apply(features)

        distance_after = EuclideanDistance()
        distance_after.init(embedding,embedding)

        distance_matrix_after = distance_after.get_distance_matrix()
        distance_matrix_before = distance_before.get_distance_matrix()

        # the embedding should (approximately) preserve pairwise distances
        return numpy.linalg.norm(distance_matrix_after-distance_matrix_before)/numpy.linalg.norm(distance_matrix_before) < 1e-6
    except ImportError:
        print('No Eigen3 available')

if __name__=='__main__':
    print('MultidimensionalScaling')
    converter_multidimensionalscaling_modular(*parameter_list[0])
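# Classical (Torgerson) MDS itself can be sketched in a few lines of NumPy:
# double-center the squared distance matrix and embed along its top
# eigenvectors. This illustrates the method; it is not Shogun's implementation.
import numpy as np

def classical_mds(D, target_dim=2):
    # D is an (n, n) matrix of pairwise Euclidean distances
    n = D.shape[0]
    J = np.eye(n) - np.ones((n, n)) / n        # centering matrix
    B = -0.5 * J.dot(D ** 2).dot(J)            # double-centered squared distances
    vals, vecs = np.linalg.eigh(B)             # eigenvalues in ascending order
    idx = np.argsort(vals)[::-1][:target_dim]  # keep the largest eigenvalues
    return vecs[:, idx] * np.sqrt(np.maximum(vals[idx], 0))

# if the points truly live in target_dim dimensions, the pairwise distances of
# the embedding reproduce D up to numerical error, which is exactly the check
# performed in the example above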
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data, 20]]

def converter_stochasticproximityembedding_modular (data_fname, k):
    try:
        from modshogun import RealFeatures, StochasticProximityEmbedding, SPE_GLOBAL, SPE_LOCAL, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = StochasticProximityEmbedding()
        converter.set_target_dim(1)
        converter.set_nupdates(40)

        # Embed with local strategy
        converter.set_k(k)
        converter.set_strategy(SPE_LOCAL)
        converter.embed(features)

        # Embed with global strategy
        converter.set_strategy(SPE_GLOBAL)
        converter.embed(features)

        return features
    except ImportError:
        print('No Eigen3 available')

if __name__=='__main__':
    print('StochasticProximityEmbedding')
    converter_stochasticproximityembedding_modular(*parameter_list[0])
#!/usr/bin/env python
data = '../data/fm_train_real.dat'

parameter_list = [[data]]

def converter_tdistributedstochasticneighborembedding_modular(data_fname, seed=1):
    try:
        from modshogun import RealFeatures, TDistributedStochasticNeighborEmbedding
        from modshogun import Math_init_random, CSVFile

        # reproducible results
        Math_init_random(seed)

        features = RealFeatures(CSVFile(data_fname))

        converter = TDistributedStochasticNeighborEmbedding()
        converter.set_target_dim(2)

        embedding = converter.apply(features)

        return embedding
    except ImportError:
        print('No Eigen3 available')

if __name__=='__main__':
    print('TDistributedStochasticNeighborEmbedding')
    converter_tdistributedstochasticneighborembedding_modular(*parameter_list[0])
# The approach applied below, which processes input data loaded from a file, is
# a crucial building block for writing your own sample applications. It is just
# one example of what can be done using the distance functions provided by
# Shogun.
#
# First, you need to determine what type your data will be, because this
# determines which distance function you can use.
#
# This example loads two matrices of real values from different files and wraps
# them in 'RealFeatures'. Each column of a matrix corresponds to one data point.
#
# A distance object initialized with two data sets (here the training set,
# twice) controls the processing of the given data points; the pairwise
# distance matrix is computed and returned by 'get_distance_matrix'.
#
# The method call 'init' (*) binds new data sets, after which
# 'get_distance_matrix' computes the pairwise distance matrix between these two
# data sets.
#
# (*) Note that the previously computed distance matrix can no longer be
# accessed through 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CBrayCurtisDistance.html.
#
# Obviously, using the Bray-Curtis distance is not limited to this showcase
# example.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'

parameter_list = [[traindat,testdat],[traindat,testdat]]

def distance_braycurtis_modular (train_fname=traindat,test_fname=testdat):
    from modshogun import RealFeatures, BrayCurtisDistance, CSVFile

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))

    distance=BrayCurtisDistance(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()
    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

    return distance,dm_train,dm_test

if __name__=='__main__':
    print('BrayCurtisDistance')
    distance_braycurtis_modular(*parameter_list[0])
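# For reference, the Bray-Curtis distance between two vectors x and y is
# sum_i |x_i - y_i| / sum_i |x_i + y_i|. A NumPy sketch of the train/test
# pairwise matrix the example computes; the data here is random and purely
# illustrative.
import numpy as np

def braycurtis(x, y):
    return np.abs(x - y).sum() / np.abs(x + y).sum()

train = np.random.rand(3, 5)   # columns are data points, as in RealFeatures
test = np.random.rand(3, 4)

dm_test = np.array([[braycurtis(train[:, i], test[:, j])
                     for j in range(test.shape[1])]
                    for i in range(train.shape[1])])
print(dm_test.shape)   # (5, 4): train points x test points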
# The approach applied below, which processes input data loaded from a file, is
# a crucial building block for writing your own sample applications. It is just
# one example of what can be done using the distance functions provided by
# Shogun.
#
# First, you need to determine what type your data will be, because this
# determines which distance function you can use.
#
# This example loads two matrices of real values from different files and wraps
# them in 'RealFeatures'. Each column of a matrix corresponds to one data point.
#
# A distance object initialized with two data sets (here the training set,
# twice) controls the processing of the given data points; the pairwise
# distance (dissimilarity ratio) matrix is computed and returned by
# 'get_distance_matrix'.
#
# The method call 'init' (*) binds new data sets, after which
# 'get_distance_matrix' computes the pairwise distance matrix between these two
# data sets.
#
# (*) Note that the previously computed distance matrix can no longer be
# accessed through 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CCanberraMetric.html.
#
# Obviously, using the Canberra distance is not limited to this showcase
# example.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'

parameter_list = [[traindat,testdat],[traindat,testdat]]

def distance_canberra_modular (train_fname=traindat,test_fname=testdat):
    from modshogun import RealFeatures, CanberraMetric, CSVFile

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))

    distance=CanberraMetric(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()
    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

    return distance,dm_train,dm_test

if __name__=='__main__':
    print('CanberraMetric')
    distance_canberra_modular(*parameter_list[0])
# This example shows how to compute the Canberra Word Distance.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')

parameter_list = [[traindna,testdna,3,0,False],[traindna,testdna,3,0,False]]

def distance_canberraword_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False):
    from modshogun import StringCharFeatures, StringWordFeatures, DNA
    from modshogun import SortWordString
    from modshogun import CanberraWordDistance

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train=StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc=SortWordString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test=StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    distance=CanberraWordDistance(feats_train, feats_train)

    dm_train=distance.get_distance_matrix()
    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

    return distance,dm_train,dm_test

if __name__=='__main__':
    print('CanberraWordDistance')
    distance_canberraword_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded # from a file, is a crucial building block for writing your own sample applications. # This approach is just one example of what can be done using the distance # functions provided by shogun. # # First, you need to determine what type your data will be, because this # will determine the distance function you can use. # # This example loads two stored matrices of real values from different # files and initializes the matrices to 'RealFeatures'. # Each column of the matrices corresponds to one data point. # # The distance object, initialized with two data sets (the same data set twice, # as shown in the first call), controls the processing of the given data points; # a pairwise distance (maximum of absolute feature dimension differences) matrix is # computed by 'get_distance_matrix'. # # The method call 'init'* then binds a new pair of data sets, and a pairwise distance # (maximum of absolute feature dimension differences) matrix between these # two data sets is computed by 'get_distance_matrix'. # # The resulting distance matrix can be reaccessed by 'get_distance_matrix'. # # *Note that the previously computed distance matrix can no longer be # reaccessed by 'get_distance_matrix'. # # For more details see doc/classshogun_1_1CChebyshewMetric.html. # # Obviously, using the Chebyshew distance is not limited to this showcase # example. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat],[traindat,testdat]] def distance_chebyshew_modular (train_fname=traindat,test_fname=testdat): from modshogun import RealFeatures, ChebyshewMetric, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=ChebyshewMetric(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('ChebyshewMetric') distance_chebyshew_modular(*parameter_list[0])
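As the description says, the Chebyshew (Chebyshev) metric is simply the maximum of the absolute per-dimension differences. A one-line NumPy sketch of that definition:

import numpy

def chebyshev(x, y):
    # maximum absolute coordinate-wise difference
    return numpy.max(numpy.abs(x - y))

print(chebyshev(numpy.array([1.0, 5.0]), numpy.array([4.0, 7.0])))  # max(3, 2) = 3.0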
# The approach applied below, which shows how to process input data loaded # from a file, is a crucial building block for writing your own sample applications. # This approach is just one example of what can be done using the distance # functions provided by shogun. # # First, you need to determine what type your data will be, because this # will determine the distance function you can use. # # This example loads two stored matrices of real values from different # files and initializes the matrices to 'RealFeatures'. # Each column of the matrices corresponds to one data point. # # The distance object, initialized with two data sets (the same data set twice, # as shown in the first call), controls the processing of the given data points; # a pairwise distance matrix is computed by 'get_distance_matrix'. # # The method call 'init'* then binds a new pair of data sets, and a pairwise distance # matrix between these two data sets is computed by 'get_distance_matrix'. # # The resulting distance matrix can be reaccessed by 'get_distance_matrix'. # # *Note that the previously computed distance matrix can no longer be # reaccessed by 'get_distance_matrix'. # # For more details see doc/classshogun_1_1CChiSquareDistance.html. # # Obviously, using the ChiSquare distance is not limited to this showcase # example. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat],[traindat,testdat]] def distance_chisquare_modular (train_fname=traindat,test_fname=testdat): from modshogun import RealFeatures, ChiSquareDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=ChiSquareDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('ChiSquareDistance') distance_chisquare_modular(*parameter_list[0])
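The chi-square distance compares two (typically non-negative, histogram-like) vectors dimension by dimension. A common textbook form is sum_i (x_i - y_i)^2 / (x_i + y_i); shogun's exact normalization may differ by a constant factor, so treat this NumPy sketch as illustrative only:

import numpy

def chisquare(x, y):
    num = (x - y) ** 2
    den = x + y
    mask = den != 0  # 0/0 terms are counted as 0
    return numpy.sum(num[mask] / den[mask])

print(chisquare(numpy.array([1.0, 2.0]), numpy.array([3.0, 2.0])))  # 4/4 + 0 = 1.0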
# The approach applied below, which shows how to process input data loaded # from a file, is a crucial building block for writing your own sample applications. # This approach is just one example of what can be done using the distance # functions provided by shogun. # # First, you need to determine what type your data will be, because this # will determine the distance function you can use. # # This example loads two stored matrices of real values from different # files and initializes the matrices to 'RealFeatures'. # Each column of the matrices corresponds to one data point. # # The distance object, initialized with two data sets (the same data set twice, # as shown in the first call), controls the processing of the given data points; # a pairwise distance matrix is computed by 'get_distance_matrix'. # # The method call 'init'* then binds a new pair of data sets, and a pairwise distance # matrix between these two data sets is computed by 'get_distance_matrix'. # # The resulting distance matrix can be reaccessed by 'get_distance_matrix'. # # *Note that the previously computed distance matrix can no longer be # reaccessed by 'get_distance_matrix'. # # For more details see doc/classshogun_1_1CCosineDistance.html. # # Obviously, using the Cosine distance is not limited to this showcase # example. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat],[traindat,testdat]] def distance_cosine_modular (train_fname=traindat,test_fname=testdat): from modshogun import RealFeatures, CosineDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=CosineDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('CosineDistance') distance_cosine_modular(*parameter_list[0])
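Cosine distance is derived from the angle between two vectors: similarity is x.y / (||x|| ||y||), and the distance is one minus that. A minimal NumPy sketch of this standard definition:

import numpy

def cosine_distance(x, y):
    # 1 - cosine similarity
    return 1.0 - numpy.dot(x, y) / (numpy.linalg.norm(x) * numpy.linalg.norm(y))

print(cosine_distance(numpy.array([1.0, 0.0]), numpy.array([0.0, 1.0])))  # orthogonal vectors -> 1.0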
#!/usr/bin/env python import numpy from modshogun import RealFeatures, MSG_DEBUG numpy.random.seed(17) traindat = numpy.random.random_sample((10,10)) testdat = numpy.random.random_sample((10,10)) parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.4]] def distance_director_euclidean_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.2): try: from modshogun import DirectorDistance except ImportError: print("recompile shogun with --enable-swig-directors") return class DirectorEuclideanDistance(DirectorDistance): def __init__(self): DirectorDistance.__init__(self, True) def distance_function(self, idx_a, idx_b): seq1 = self.get_lhs().get_feature_vector(idx_a) seq2 = self.get_rhs().get_feature_vector(idx_b) return numpy.linalg.norm(seq1-seq2) from modshogun import EuclideanDistance from modshogun import Time feats_train=RealFeatures(fm_train_real) #feats_train.io.set_loglevel(MSG_DEBUG) feats_train.parallel.set_num_threads(1) feats_test=RealFeatures(fm_test_real) distance=EuclideanDistance() distance.init(feats_train, feats_test) ddistance=DirectorEuclideanDistance() ddistance.init(feats_train, feats_test) #print "dm_train" t=Time() dm_train=distance.get_distance_matrix() #t1=t.cur_time_diff(True) #print "ddm_train" t=Time() ddm_train=ddistance.get_distance_matrix() #t2=t.cur_time_diff(True) #print "dm_train", dm_train #print "ddm_train", ddm_train return dm_train, ddm_train if __name__=='__main__': print('DirectorEuclideanDistance') distance_director_euclidean_modular(*parameter_list[0])
#!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,traindat],[traindat,testdat]] def distance_euclidean_modular(train_fname=traindat,test_fname=testdat): from modshogun import RealFeatures, EuclideanDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=EuclideanDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('EuclideanDistance') distance_euclidean_modular(*parameter_list[0])
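The pattern above is worth spelling out: the first get_distance_matrix() call returns the train x train matrix, and after distance.init(feats_train, feats_test) the same call returns the train x test matrix, which is why dm_train is saved before re-initializing. The matrix entries themselves are the usual Euclidean norm; a NumPy sketch of the pairwise computation:

import numpy

X = numpy.random.rand(3, 4)  # columns are data points, as in the shogun examples
Y = numpy.random.rand(3, 5)

# pairwise Euclidean distances between columns of X and columns of Y
dm = numpy.array([[numpy.linalg.norm(X[:, i] - Y[:, j])
                   for j in range(Y.shape[1])]
                  for i in range(X.shape[1])])
print(dm.shape)  # (4, 5)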
# The approach applied below, which shows how to process input data loaded # from a file, is a crucial building block for writing your own sample applications. # This approach is just one example of what can be done using the distance # functions provided by shogun. # # First, you need to determine what type your data will be, because this # will determine the distance function you can use. # # This example loads two stored matrices of real values from different # files and initializes the matrices to 'RealFeatures'. # Each column of the matrices corresponds to one data point. # # The distance object, initialized with two data sets (the same data set twice, # as shown in the first call), controls the processing of the given data points; # a pairwise distance (shortest path on a sphere) matrix is computed # by 'get_distance_matrix'. # # The method call 'init'* then binds a new pair of data sets, and a pairwise distance # (shortest path on a sphere) matrix between these two data sets is # computed by 'get_distance_matrix'. # # The resulting distance matrix can be reaccessed by 'get_distance_matrix'. # # *Note that the previously computed distance matrix can no longer be # reaccessed by 'get_distance_matrix'. # # For more details see doc/classshogun_1_1CGeodesicMetric.html. # # Obviously, using the Geodesic distance is not limited to this showcase # example. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat],[traindat,testdat]] def distance_geodesic_modular (train_fname=traindat,test_fname=testdat): from modshogun import RealFeatures, GeodesicMetric, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=GeodesicMetric(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('GeodesicMetric') distance_geodesic_modular(*parameter_list[0])
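'Shortest path on a sphere' means the distance is the angle between the two vectors, i.e. the arc length on a unit sphere. Assuming the standard definition (shogun's normalization is not verified here), a NumPy sketch would be:

import numpy

def geodesic(x, y):
    # angle between x and y, i.e. arc length on the unit sphere
    c = numpy.dot(x, y) / (numpy.linalg.norm(x) * numpy.linalg.norm(y))
    return numpy.arccos(numpy.clip(c, -1.0, 1.0))  # clip guards against rounding error

print(geodesic(numpy.array([1.0, 0.0]), numpy.array([0.0, 1.0])))  # pi/2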
# This example shows how to compute the Hamming Word Distance for string features. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindna = lm.load_dna('../data/fm_train_dna.dat') testdna = lm.load_dna('../data/fm_test_dna.dat') testdat = lm.load_labels('../data/fm_test_real.dat') parameter_list = [[traindna,testdna,testdat,4,0,False,False], [traindna,testdna,testdat,3,0,False,False]] def distance_hammingword_modular (fm_train_dna=traindna,fm_test_dna=testdna, fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString from modshogun import HammingWordDistance charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=HammingWordDistance(feats_train, feats_train, use_sign) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('HammingWordDistance') distance_hammingword_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded # from a file, is a crucial building block for writing your own sample applications. # This approach is just one example of what can be done using the distance # functions provided by shogun. # # First, you need to determine what type your data will be, because this # will determine the distance function you can use. # # This example loads two stored matrices of real values from different # files and initializes the matrices to 'RealFeatures'. # Each column of the matrices corresponds to one data point. # # The distance object, initialized with two data sets (the same data set twice, # as shown in the first call), controls the processing of the given data points; # a pairwise distance (divergence measure based on the Kullback-Leibler divergence) matrix # is computed by 'get_distance_matrix'. # # The method call 'init'* then binds a new pair of data sets, and a pairwise distance # (divergence measure based on the Kullback-Leibler divergence) matrix between # these two data sets is computed by 'get_distance_matrix'. # # The resulting distance matrix can be reaccessed by 'get_distance_matrix'. # # *Note that the previously computed distance matrix can no longer be # reaccessed by 'get_distance_matrix'. # # For more details see doc/classshogun_1_1CJensenMetric.html. # # Obviously, using the Jensen-Shannon distance/divergence is not limited to # this showcase example. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat],[traindat,testdat]] def distance_jensen_modular (train_fname=traindat,test_fname=testdat): from modshogun import RealFeatures, JensenMetric, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=JensenMetric(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('JensenMetric') distance_jensen_modular(*parameter_list[0])
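The 'divergence measure based on the Kullback-Leibler divergence' is the Jensen-Shannon construction: both vectors are compared against their midpoint m = (x + y)/2. A hedged NumPy sketch of that textbook formula (shogun's JensenMetric may scale the result or take a square root differently):

import numpy

def jensen_shannon(x, y):
    # JSD(x, y) = 0.5 * KL(x || m) + 0.5 * KL(y || m), with m the midpoint
    m = 0.5 * (x + y)
    kl = lambda p, q: numpy.sum(p * numpy.log(p / q))
    return 0.5 * kl(x, m) + 0.5 * kl(y, m)

p = numpy.array([0.5, 0.5])
q = numpy.array([0.9, 0.1])
print(jensen_shannon(p, q))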
#!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat, testdat]] def distance_mahalanobis_modular (train_fname = traindat, test_fname = testdat): from modshogun import RealFeatures, CSVFile from modshogun import MahalanobisDistance feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance = MahalanobisDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('MahalanobisDistance') distance_mahalanobis_modular(*parameter_list[0])
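The Mahalanobis distance rescales coordinates by the inverse covariance of the data, d(x, y) = sqrt((x - y)^T S^-1 (x - y)). A small NumPy sketch of this definition (it assumes the covariance S is estimated from the training features, which appears to be what the example above relies on):

import numpy

X = numpy.random.rand(3, 50)            # columns are data points
S_inv = numpy.linalg.inv(numpy.cov(X))  # inverse sample covariance

def mahalanobis(x, y):
    d = x - y
    return numpy.sqrt(numpy.dot(d, numpy.dot(S_inv, d)))

print(mahalanobis(X[:, 0], X[:, 1]))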
# This example shows how to compute the Manhattan Distance. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat],[traindat,testdat]] def distance_manhatten_modular (train_fname=traindat,test_fname=testdat): from modshogun import RealFeatures, ManhattanMetric, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=ManhattanMetric(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('ManhattanMetric') distance_manhatten_modular(*parameter_list[0])
# This example shows how to compute the Manhattan Distance for string features. #!/usr/bin/env python traindna = '../data/fm_train_dna.dat' testdna = '../data/fm_test_dna.dat' parameter_list = [[traindna,testdna,3,0,False],[traindna,testdna,4,0,False]] def distance_manhattenword_modular (train_fname=traindna,test_fname=testdna,order=3,gap=0,reverse=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString, ManhattanWordDistance, CSVFile charfeat=StringCharFeatures(CSVFile(train_fname), DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(CSVFile(test_fname), DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=ManhattanWordDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return dm_train,dm_test if __name__=='__main__': print('ManhattanWordDistance') distance_manhattenword_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded # from a file, is a crucial building block for writing your own sample applications. # This approach is just one example of what can be done using the distance # functions provided by shogun. # # First, you need to determine what type your data will be, because this # will determine the distance function you can use. # # This example loads two stored matrices of real values from different # files and initializes the matrices to 'RealFeatures'. # Each column of the matrices corresponds to one data point. # # The distance object, initialized with two data sets (the same data set twice, # as shown in the first call) and the norm 'k', controls the processing of the given # data points; a pairwise distance matrix is computed by 'get_distance_matrix'. # # The method call 'init'* then binds a new pair of data sets, and a pairwise distance # matrix between these two data sets is computed by 'get_distance_matrix'. # # The resulting distance matrix can be reaccessed by 'get_distance_matrix'. # # *Note that the previously computed distance matrix can no longer be # reaccessed by 'get_distance_matrix'. # # For more details see doc/classshogun_1_1CMinkowskiMetric.html. # # Obviously, using the Minkowski metric is not limited to this showcase # example. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat,3],[traindat,testdat,4]] def distance_minkowski_modular (train_fname=traindat,test_fname=testdat,k=3): from modshogun import RealFeatures, MinkowskiMetric, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=MinkowskiMetric(feats_train, feats_train, k) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('MinkowskiMetric') distance_minkowski_modular(*parameter_list[0])
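The norm parameter 'k' above generalizes the familiar distances: d(x, y) = (sum_i |x_i - y_i|^k)^(1/k), so k=1 gives the Manhattan distance, k=2 the Euclidean distance, and k -> infinity approaches the Chebyshev metric. A NumPy sketch:

import numpy

def minkowski(x, y, k):
    # (sum |x_i - y_i|^k)^(1/k)
    return numpy.sum(numpy.abs(x - y) ** k) ** (1.0 / k)

x = numpy.array([0.0, 0.0])
y = numpy.array([3.0, 4.0])
print(minkowski(x, y, 1))  # 7.0 (Manhattan)
print(minkowski(x, y, 2))  # 5.0 (Euclidean)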
# In this example a squared Euclidean distance is computed for toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat],[traindat,testdat]] def distance_normsquared_modular (train_fname=traindat,test_fname=testdat): from modshogun import RealFeatures, EuclideanDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=EuclideanDistance(feats_train, feats_train) distance.set_disable_sqrt(True) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('EuclideanDistance - NormSquared') distance_normsquared_modular(*parameter_list[0])
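set_disable_sqrt(True) simply skips the final square root, so each matrix entry becomes sum_i (x_i - y_i)^2. Equivalently, in NumPy:

import numpy

x = numpy.array([1.0, 2.0])
y = numpy.array([4.0, 6.0])
print(numpy.sum((x - y) ** 2))        # 25.0, the squared Euclidean distance
print(numpy.linalg.norm(x - y) ** 2)  # same value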
# In this example a sparse Euclidean distance is computed for sparse toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat],[traindat,testdat]] def distance_sparseeuclidean_modular (train_fname=traindat,test_fname=testdat): from modshogun import RealFeatures, SparseRealFeatures, SparseEuclideanDistance, CSVFile realfeat=RealFeatures(CSVFile(train_fname)) feats_train=SparseRealFeatures() feats_train.obtain_from_simple(realfeat) realfeat=RealFeatures(CSVFile(test_fname)) feats_test=SparseRealFeatures() feats_test.obtain_from_simple(realfeat) distance=SparseEuclideanDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('SparseEuclideanDistance') distance_sparseeuclidean_modular(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded # from a file, is a crucial building block for writing your own sample applications. # This approach is just one example of what can be done using the distance # functions provided by shogun. # # First, you need to determine what type your data will be, because this # will determine the distance function you can use. # # This example loads two stored matrices of real values from different # files and initializes the matrices to 'RealFeatures'. # Each column of the matrices corresponds to one data point. # # The distance object, initialized with two data sets (the same data set twice, # as shown in the first call), controls the processing of the given data points; # a pairwise distance (extended Jaccard coefficient) matrix is computed by 'get_distance_matrix'. # # The method call 'init'* then binds a new pair of data sets, and a pairwise distance # (extended Jaccard coefficient) matrix between these two data sets is computed # by 'get_distance_matrix'. # # The resulting distance matrix can be reaccessed by 'get_distance_matrix'. # # *Note that the previously computed distance matrix can no longer be # reaccessed by 'get_distance_matrix'. # # For more details see doc/classshogun_1_1CTanimotoDistance.html. # # Obviously, using the Tanimoto distance/coefficient is not limited to # this showcase example. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat],[traindat,testdat]] def distance_tanimoto_modular (train_fname=traindat,test_fname=testdat): from modshogun import RealFeatures, TanimotoDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=TanimotoDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test if __name__=='__main__': print('TanimotoDistance') distance_tanimoto_modular(*parameter_list[0])
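The extended Jaccard (Tanimoto) coefficient for real vectors is x.y / (||x||^2 + ||y||^2 - x.y); the corresponding distance is usually taken as one minus this coefficient. A hedged NumPy sketch (shogun's exact convention may differ):

import numpy

def tanimoto_distance(x, y):
    xy = numpy.dot(x, y)
    sim = xy / (numpy.dot(x, x) + numpy.dot(y, y) - xy)
    return 1.0 - sim

print(tanimoto_distance(numpy.array([1.0, 1.0]), numpy.array([1.0, 1.0])))  # identical vectors -> 0.0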
# In this example the Histogram algorithm object computes a histogram over all # 16-bit unsigned integers in the features. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindna = lm.load_dna('../data/fm_train_dna.dat') parameter_list = [[traindna,3,0,False],[traindna,4,0,False]] def distribution_histogram_modular (fm_dna=traindna,order=3,gap=0,reverse=False): from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import Histogram charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) histo=Histogram(feats) histo.train() histo.get_histogram() num_examples=feats.get_num_vectors() num_param=histo.get_num_model_parameters() #for i in xrange(num_examples): # for j in xrange(num_param): # histo.get_log_derivative(j, i) out_likelihood = histo.get_log_likelihood() out_sample = histo.get_log_likelihood_sample() return histo,out_sample,out_likelihood ########################################################################### # call functions ########################################################################### if __name__=='__main__': print('Histogram') distribution_histogram_modular(*parameter_list[0])
# In this example a hidden Markov model with 3 states and 6 transitions is trained # on a string data set. After calling the constructor of the HMM class, specifying # the number of states and transitions, the model is trained. Via the Baum-Welch # algorithm the optimal transition and emission probabilities are estimated. The # best path, i.e. the path with the highest probability given the model, can then be # calculated using get_best_path_state. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() data=lm.load_cubes('../data/fm_train_cube.dat') parameter_list=[[data, 1, 64, 1e-5, 2, 0, False, 5], [data, 3, 6, 1e-1, 1, 0, False, 2]] def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples): from modshogun import StringWordFeatures, StringCharFeatures, CUBE from modshogun import HMM, BW_NORMAL charfeat=StringCharFeatures(CUBE) charfeat.set_features(fm_cube) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) hmm=HMM(feats, N, M, pseudo) hmm.train() hmm.baum_welch_viterbi_train(BW_NORMAL) num_examples=feats.get_num_vectors() num_param=hmm.get_num_model_parameters() for i in range(num_examples): for j in range(num_param): hmm.get_log_derivative(j, i) best_path=0 best_path_state=0 for i in range(num_examples): best_path+=hmm.best_path(i) for j in range(N): best_path_state+=hmm.get_best_path_state(i, j) lik_example = hmm.get_log_likelihood() lik_sample = hmm.get_log_likelihood_sample() return lik_example, lik_sample, hmm ########################################################################### # call functions ########################################################################### if __name__=='__main__': print('HMM') distribution_hmm_modular(*parameter_list[0])
# Trains an inhomogeneous Markov chain of order 3 on a DNA string data set. Due to # the structure of the Markov chain it is very similar to an HMM with just one # chain of connected hidden states - that is why we termed this a linear HMM. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindna = lm.load_dna('../data/fm_train_dna.dat') parameter_list = [[traindna,3,0,False],[traindna,4,0,False]] def distribution_linearhmm_modular (fm_dna=traindna,order=3,gap=0,reverse=False): from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import LinearHMM charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) hmm=LinearHMM(feats) hmm.train() hmm.get_transition_probs() num_examples=feats.get_num_vectors() num_param=hmm.get_num_model_parameters() for i in range(num_examples): for j in range(num_param): hmm.get_log_derivative(j, i) out_likelihood = hmm.get_log_likelihood() out_sample = hmm.get_log_likelihood_sample() return hmm,out_likelihood,out_sample ########################################################################### # call functions ########################################################################### if __name__=='__main__': distribution_linearhmm_modular(*parameter_list[0]) print('LinearHMM')
# In this example the usage of the Positional PWM distribution is shown. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindna = lm.load_dna('../data/fm_train_dna.dat') parameter_list = [[traindna,3],[traindna,4]] def distribution_ppwm_modular (fm_dna=traindna, order=3): from modshogun import StringByteFeatures, StringCharFeatures, DNA from modshogun import PositionalPWM from numpy import array,e,log,exp charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats=StringByteFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, 0, False) L=20 k=3 sigma = 1; mu = 4 ppwm=PositionalPWM() ppwm.set_sigma(sigma) ppwm.set_mean(mu) pwm=array([[0.0, 0.5, 0.1, 1.0], [0.0, 0.5, 0.5, 0.0], [1.0, 0.0, 0.4, 0.0], [0.0, 0.0, 0.0, 0.0]]); pwm=array([[0.01,0.09,0.1],[0.09,0.01,0.1],[0.85,0.4,0.1],[0.05,0.5,0.7]]) ppwm.set_pwm(log(pwm)) #print(ppwm.get_pwm()) ppwm.compute_w(L) w=ppwm.get_w() #print(w) #from pylab import * #figure(1) #pcolor(exp(w)) #pcolor(w) #colorbar() #figure(2) ppwm.compute_scoring(1) u=ppwm.get_scoring(0) #pcolor(exp(u)) #show() #ppwm=PositionalPWM(feats) #ppwm.train() #out_likelihood = histo.get_log_likelihood() #out_sample = histo.get_log_likelihood_sample() return w,u ########################################################################### # call functions ########################################################################### if __name__=='__main__': print('PositionalPWM') distribution_ppwm_modular(*parameter_list[0])
# Example of how to evaluate clustering performance (given ground truth) #!/usr/bin/env python def get_dataset(): from os.path import exists filename = "../../../data/uci/optdigits/optdigits.tes" if exists(filename): return open(filename) else: # print("Retrieving data...") try: from urllib2 import urlopen except ImportError: from urllib.request import urlopen return urlopen("http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes") def prepare_data(): from numpy import loadtxt stream = get_dataset() # print("Loading data...") data = loadtxt(stream, delimiter=',') fea = data[:, :-1] gnd = data[:, -1] return (fea.T, gnd) (fea, gnd_raw) = prepare_data() parameter_list = [[fea, gnd_raw, 10]] def run_clustering(data, k): from modshogun import KMeans from modshogun import EuclideanDistance from modshogun import RealFeatures fea = RealFeatures(data) distance = EuclideanDistance(fea, fea) kmeans=KMeans(k, distance) # print("Running clustering...") kmeans.train() return kmeans.get_cluster_centers() def assign_labels(data, centroids, ncenters): from modshogun import EuclideanDistance from modshogun import RealFeatures, MulticlassLabels from modshogun import KNN from numpy import arange labels = MulticlassLabels(arange(0.,ncenters)) fea = RealFeatures(data) fea_centroids = RealFeatures(centroids) distance = EuclideanDistance(fea_centroids, fea_centroids) knn = KNN(1, distance, labels) knn.train() return knn.apply(fea) def evaluation_clustering (features=fea, ground_truth=gnd_raw, ncenters=10): from modshogun import ClusteringAccuracy, ClusteringMutualInformation from modshogun import MulticlassLabels from modshogun import Math # reproducible results Math.init_random(1) centroids = run_clustering(features, ncenters) gnd_hat = assign_labels(features, centroids, ncenters) gnd = MulticlassLabels(ground_truth) AccuracyEval = ClusteringAccuracy() AccuracyEval.best_map(gnd_hat, gnd) accuracy = AccuracyEval.evaluate(gnd_hat, gnd) #print(('Clustering accuracy = %.4f' % accuracy)) MIEval = ClusteringMutualInformation() mutual_info = MIEval.evaluate(gnd_hat, gnd) #print(('Clustering mutual information = %.4f' % mutual_info)) # TODO mutual information does not work with serialization #return gnd, gnd_hat, accuracy, MIEval, mutual_info return gnd, gnd_hat, accuracy if __name__ == '__main__': print('Evaluation Clustering') evaluation_clustering(*parameter_list[0])
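The key step above is AccuracyEval.best_map(gnd_hat, gnd): cluster indices are arbitrary, so they must be relabeled to best match the ground truth before accuracy makes sense. As a simplified illustration only (shogun may solve an optimal assignment; this hypothetical helper just uses a greedy majority vote per cluster):

import numpy

def majority_map(pred, truth):
    # relabel each cluster with the most frequent ground-truth label among its members
    mapped = numpy.empty_like(pred)
    for c in numpy.unique(pred):
        members = (pred == c)
        labels, counts = numpy.unique(truth[members], return_counts=True)
        mapped[members] = labels[numpy.argmax(counts)]
    return mapped

pred  = numpy.array([0, 0, 1, 1, 1])
truth = numpy.array([7, 7, 3, 3, 7])
print(numpy.mean(majority_map(pred, truth) == truth))  # 0.8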
#!/usr/bin/env python parameter_list = [[1000,2,8],[1000,4,8]] from numpy import * #from pylab import * def run_clustering(data, k): from modshogun import KMeans from modshogun import Math_init_random from modshogun import EuclideanDistance from modshogun import RealFeatures fea = RealFeatures(data) distance = EuclideanDistance(fea, fea) kmeans=KMeans(k, distance) #print("Running clustering...") kmeans.train() return kmeans.get_cluster_centers() def assign_labels(data, centroids, ncenters): from modshogun import EuclideanDistance from modshogun import RealFeatures, MulticlassLabels from modshogun import KNN from numpy import arange labels = MulticlassLabels(arange(0.,ncenters)) fea = RealFeatures(data) fea_centroids = RealFeatures(centroids) distance = EuclideanDistance(fea_centroids, fea_centroids) knn = KNN(1, distance, labels) knn.train() return knn.apply(fea) def evaluation_clustering_simple (n_data=100, sqrt_num_blobs=4, distance=5): from modshogun import ClusteringAccuracy, ClusteringMutualInformation from modshogun import MulticlassLabels, GaussianBlobsDataGenerator from modshogun import Math # reproducible results Math.init_random(1) # produce some Gaussian blobs to cluster ncenters=sqrt_num_blobs**2 stretch=1 angle=1 gen=GaussianBlobsDataGenerator(sqrt_num_blobs, distance, stretch, angle) features=gen.get_streamed_features(n_data) X=features.get_feature_matrix() # compute approximate "ground truth" labels by taking the closest blob mean coords=array(range(0,sqrt_num_blobs*distance,distance)) idx_0=[abs(coords -x).argmin() for x in X[0]] idx_1=[abs(coords -x).argmin() for x in X[1]] ground_truth=array([idx_0[i]*sqrt_num_blobs + idx_1[i] for i in range(n_data)], dtype="float64") #for label in unique(ground_truth): # indices=ground_truth==label # plot(X[0][indices], X[1][indices], 'o') #show() centroids = run_clustering(features, ncenters) gnd_hat = assign_labels(features, centroids, ncenters) gnd = MulticlassLabels(ground_truth) AccuracyEval = ClusteringAccuracy() AccuracyEval.best_map(gnd_hat, gnd) accuracy = AccuracyEval.evaluate(gnd_hat, gnd) # in this case we know that the clustering has to be very good #print(('Clustering accuracy = %.4f' % accuracy)) assert(accuracy>0.8) MIEval = ClusteringMutualInformation() mutual_info = MIEval.evaluate(gnd_hat, gnd) #print(('Clustering mutual information = %.4f' % mutual_info)) return gnd, accuracy, mutual_info if __name__ == '__main__': print('Evaluation Clustering') evaluation_clustering_simple(*parameter_list[0])
# In this example various (accuracy, error rate, ..) measures are being computed # for the pair of ground truth toy data and random data. #!/usr/bin/env python from tools.load import LoadMatrix from numpy import random lm=LoadMatrix() ground_truth = lm.load_labels('../data/label_train_twoclass.dat') random.seed(17) predicted = random.randn(len(ground_truth)) parameter_list = [[ground_truth,predicted]] def evaluation_contingencytableevaluation_modular (ground_truth, predicted): from modshogun import BinaryLabels from modshogun import ContingencyTableEvaluation from modshogun import AccuracyMeasure,ErrorRateMeasure,BALMeasure from modshogun import WRACCMeasure,F1Measure,CrossCorrelationMeasure from modshogun import RecallMeasure,PrecisionMeasure,SpecificityMeasure ground_truth_labels = BinaryLabels(ground_truth) predicted_labels = BinaryLabels(predicted) base_evaluator = ContingencyTableEvaluation() base_evaluator.evaluate(predicted_labels,ground_truth_labels) evaluator = AccuracyMeasure() accuracy = evaluator.evaluate(predicted_labels,ground_truth_labels) evaluator = ErrorRateMeasure() errorrate = evaluator.evaluate(predicted_labels,ground_truth_labels) evaluator = BALMeasure() bal = evaluator.evaluate(predicted_labels,ground_truth_labels) evaluator = WRACCMeasure() wracc = evaluator.evaluate(predicted_labels,ground_truth_labels) evaluator = F1Measure() f1 = evaluator.evaluate(predicted_labels,ground_truth_labels) evaluator = CrossCorrelationMeasure() crosscorrelation = evaluator.evaluate(predicted_labels,ground_truth_labels) evaluator = RecallMeasure() recall = evaluator.evaluate(predicted_labels,ground_truth_labels) evaluator = PrecisionMeasure() precision = evaluator.evaluate(predicted_labels,ground_truth_labels) evaluator = SpecificityMeasure() specificity = evaluator.evaluate(predicted_labels,ground_truth_labels) return accuracy, errorrate, bal, wracc, f1, crosscorrelation, recall, precision, specificity if __name__=='__main__': print('EvaluationContingencyTableEvaluation') evaluation_contingencytableevaluation_modular(*parameter_list[0])
#!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (W) 2012 Heiko Strathmann # Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society # from numpy.random import randn from numpy import * # generate some overlapping training vectors num_vectors=100 vec_distance=1 traindat=concatenate((randn(2,num_vectors)-vec_distance, randn(2,num_vectors)+vec_distance), axis=1) label_traindat=concatenate((-ones(num_vectors), ones(num_vectors))); parameter_list = [[traindat,label_traindat]] def evaluation_cross_validation_classification (traindat=traindat, label_traindat=label_traindat): from modshogun import CrossValidation, CrossValidationResult from modshogun import ContingencyTableEvaluation, ACCURACY from modshogun import StratifiedCrossValidationSplitting from modshogun import BinaryLabels from modshogun import RealFeatures from modshogun import LibLinear, L2R_L2LOSS_SVC # training data features=RealFeatures(traindat) labels=BinaryLabels(label_traindat) # classifier classifier=LibLinear(L2R_L2LOSS_SVC) # splitting strategy for 5-fold cross-validation (for classification it's better # to use "StratifiedCrossValidationSplitting", but the standard # "CrossValidationSplitting" is also available) splitting_strategy=StratifiedCrossValidationSplitting(labels, 5) # evaluation method evaluation_criterium=ContingencyTableEvaluation(ACCURACY) # cross-validation instance cross_validation=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterium) cross_validation.set_autolock(False) # (optional) repeat x-val 10 times cross_validation.set_num_runs(10) # perform cross-validation and print results result=cross_validation.evaluate() #print("mean:", result.mean) if __name__=='__main__': print('Evaluation CrossValidationClassification') evaluation_cross_validation_classification(*parameter_list[0])
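Under the hood, 5-fold cross-validation just partitions the sample indices, trains on four folds and evaluates on the held-out fold, then averages over folds (and over the 10 repetitions requested via set_num_runs). A minimal NumPy sketch of that bookkeeping, independent of shogun's classes:

import numpy

def kfold_indices(n, k):
    # shuffle 0..n-1 and cut into k roughly equal folds
    idx = numpy.random.permutation(n)
    return numpy.array_split(idx, k)

n = 20
for test_idx in kfold_indices(n, 5):
    train_idx = numpy.setdiff1d(numpy.arange(n), test_idx)
    # train on train_idx, evaluate on test_idx, then average the fold scores
    print(len(train_idx), len(test_idx))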
#!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (W) 2012 Heiko Strathmann # Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society # from numpy.random import randn from numpy import * # generate some overlapping training vectors num_vectors=5 vec_distance=1 traindat=concatenate((randn(2,num_vectors)-vec_distance, randn(2,num_vectors)+vec_distance), axis=1) label_traindat=concatenate((-ones(num_vectors), ones(num_vectors))); parameter_list = [[traindat,label_traindat]] def evaluation_cross_validation_mkl_weight_storage(traindat=traindat, label_traindat=label_traindat): from modshogun import CrossValidation, CrossValidationResult from modshogun import CrossValidationPrintOutput from modshogun import CrossValidationMKLStorage from modshogun import ContingencyTableEvaluation, ACCURACY from modshogun import StratifiedCrossValidationSplitting from modshogun import BinaryLabels from modshogun import RealFeatures, CombinedFeatures from modshogun import GaussianKernel, CombinedKernel from modshogun import LibSVM, MKLClassification # training data, combined features all on same data features=RealFeatures(traindat) comb_features=CombinedFeatures() comb_features.append_feature_obj(features) comb_features.append_feature_obj(features) comb_features.append_feature_obj(features) labels=BinaryLabels(label_traindat) # kernel, different Gaussians combined kernel=CombinedKernel() kernel.append_kernel(GaussianKernel(10, 0.1)) kernel.append_kernel(GaussianKernel(10, 1)) kernel.append_kernel(GaussianKernel(10, 2)) # create MKL using LibSVM; due to a memory bug, interleaved optimization is not possible svm=MKLClassification(LibSVM()); svm.set_interleaved_optimization_enabled(False); svm.set_kernel(kernel); # splitting strategy for 5-fold cross-validation (for classification it's better # to use "StratifiedCrossValidationSplitting", but the standard # "CrossValidationSplitting" is also available) splitting_strategy=StratifiedCrossValidationSplitting(labels, 5) # evaluation method evaluation_criterium=ContingencyTableEvaluation(ACCURACY) # cross-validation instance cross_validation=CrossValidation(svm, comb_features, labels, splitting_strategy, evaluation_criterium) cross_validation.set_autolock(False) # append cross-validation output classes #cross_validation.add_cross_validation_output(CrossValidationPrintOutput()) mkl_storage=CrossValidationMKLStorage() cross_validation.add_cross_validation_output(mkl_storage) cross_validation.set_num_runs(3) # perform cross-validation result=cross_validation.evaluate() # print mkl weights weights=mkl_storage.get_mkl_weights() #print "mkl weights during cross-validation" #print weights if __name__=='__main__': print('Evaluation CrossValidationMKLWeightStorage') evaluation_cross_validation_mkl_weight_storage(*parameter_list[0])
#!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (W) 2012 Heiko Strathmann # Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society # from numpy.random import randn, seed from numpy import * # generate some overlapping training vectors seed(1) num_vectors=7 vec_distance=1 traindat=concatenate((randn(2,num_vectors)-vec_distance, randn(2,num_vectors)+vec_distance), axis=1) label_traindat=concatenate((zeros(num_vectors), ones(num_vectors))); parameter_list = [[traindat,label_traindat]] def evaluation_cross_validation_multiclass_storage (traindat=traindat, label_traindat=label_traindat): from modshogun import CrossValidation, CrossValidationResult from modshogun import CrossValidationPrintOutput from modshogun import CrossValidationMKLStorage, CrossValidationMulticlassStorage from modshogun import MulticlassAccuracy, F1Measure from modshogun import StratifiedCrossValidationSplitting from modshogun import MulticlassLabels from modshogun import RealFeatures, CombinedFeatures from modshogun import GaussianKernel, CombinedKernel from modshogun import MKLMulticlass from modshogun import Statistics, MSG_DEBUG, Math Math.init_random(1) # training data, combined features all on same data features=RealFeatures(traindat) comb_features=CombinedFeatures() comb_features.append_feature_obj(features) comb_features.append_feature_obj(features) comb_features.append_feature_obj(features) labels=MulticlassLabels(label_traindat) # kernel, different Gaussians combined kernel=CombinedKernel() kernel.append_kernel(GaussianKernel(10, 0.1)) kernel.append_kernel(GaussianKernel(10, 1)) kernel.append_kernel(GaussianKernel(10, 2)) # create multiclass MKL machine; due to a memory bug, interleaved optimization is not possible svm=MKLMulticlass(1.0,kernel,labels); svm.set_kernel(kernel); # splitting strategy for 3-fold cross-validation (for classification it's better # to use "StratifiedCrossValidationSplitting", but the standard # "CrossValidationSplitting" is also available) splitting_strategy=StratifiedCrossValidationSplitting(labels, 3) # evaluation method evaluation_criterium=MulticlassAccuracy() # cross-validation instance cross_validation=CrossValidation(svm, comb_features, labels, splitting_strategy, evaluation_criterium) cross_validation.set_autolock(False) # append cross-validation output classes #cross_validation.add_cross_validation_output(CrossValidationPrintOutput()) #mkl_storage=CrossValidationMKLStorage() #cross_validation.add_cross_validation_output(mkl_storage) multiclass_storage=CrossValidationMulticlassStorage() multiclass_storage.append_binary_evaluation(F1Measure()) cross_validation.add_cross_validation_output(multiclass_storage) cross_validation.set_num_runs(3) # perform cross-validation result=cross_validation.evaluate() roc_0_0_0 = multiclass_storage.get_fold_ROC(0,0,0) #print roc_0_0_0 auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0,0,0,0) #print auc_0_0_0 return roc_0_0_0, auc_0_0_0 if __name__=='__main__': print('Evaluation CrossValidationMulticlassStorage') evaluation_cross_validation_multiclass_storage(*parameter_list[0])
#!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (W) 2012 Heiko Strathmann # Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society # traindat = '../data/fm_train_real.dat' label_traindat = '../data/label_train_twoclass.dat' parameter_list = [[traindat,label_traindat,0.8,1e-6],[traindat,label_traindat,0.9,1e-7]] def evaluation_cross_validation_regression (train_fname=traindat,label_fname=label_traindat,width=0.8,tau=1e-6): from modshogun import CrossValidation, CrossValidationResult from modshogun import MeanSquaredError, CrossValidationSplitting from modshogun import RegressionLabels, RealFeatures from modshogun import GaussianKernel, KernelRidgeRegression, CSVFile # training data features=RealFeatures(CSVFile(train_fname)) labels=RegressionLabels(CSVFile(label_fname)) # kernel and predictor kernel=GaussianKernel() predictor=KernelRidgeRegression(tau, kernel, labels) # splitting strategy for 5-fold cross-validation (for classification it's better # to use "StratifiedCrossValidationSplitting", but here the standard x-val is used) splitting_strategy=CrossValidationSplitting(labels, 5) # evaluation method evaluation_criterium=MeanSquaredError() # cross-validation instance cross_validation=CrossValidation(predictor, features, labels, splitting_strategy, evaluation_criterium) # (optional) repeat x-val 10 times cross_validation.set_num_runs(10) # (optional) tell the machine to precompute the kernel matrix; speeds things up, but may not always work predictor.data_lock(labels, features) # perform cross-validation and print results result=cross_validation.evaluate() #print("mean:", result.mean) if __name__=='__main__': print('Evaluation CrossValidationRegression') evaluation_cross_validation_regression(*parameter_list[0])
#!/usr/bin/env python from tools.load import LoadMatrix from numpy import random lm=LoadMatrix() ground_truth = lm.load_labels('../data/label_train_twoclass.dat') random.seed(17) predicted = random.randn(len(ground_truth)) parameter_list = [[ground_truth,predicted]] def evaluation_director_contingencytableevaluation_modular (ground_truth, predicted): try: from modshogun import DirectorContingencyTableEvaluation, ED_MAXIMIZE except ImportError: print("recompile shogun with --enable-swig-directors") return class SimpleWeightedBinaryEvaluator(DirectorContingencyTableEvaluation): def __init__(self): DirectorContingencyTableEvaluation.__init__(self) def get_custom_direction(self): return ED_MAXIMIZE def get_custom_score(self): return self.get_WRACC()+self.get_BAL() from modshogun import BinaryLabels evaluator = SimpleWeightedBinaryEvaluator() r = evaluator.evaluate(BinaryLabels(ground_truth), BinaryLabels(predicted)) r2 = evaluator.get_custom_score() print(r,r2) return r,r2 if __name__=='__main__': print('EvaluationDirectorContingencyTableEvaluation') evaluation_director_contingencytableevaluation_modular(*parameter_list[0])
# In this example a mean squared error (MSE) is being computed # for the pair of random vectors of length N. #!/usr/bin/env python from tools.load import LoadMatrix from numpy import random lm=LoadMatrix() N = 100 random.seed(17) ground_truth = random.randn(N) predicted = random.randn(N) parameter_list = [[ground_truth,predicted]] def evaluation_meansquarederror_modular (ground_truth, predicted): from modshogun import RegressionLabels from modshogun import MeanSquaredError ground_truth_labels = RegressionLabels(ground_truth) predicted_labels = RegressionLabels(predicted) evaluator = MeanSquaredError() mse = evaluator.evaluate(predicted_labels,ground_truth_labels) return mse if __name__=='__main__': print('MeanSquaredError') evaluation_meansquarederror_modular(*parameter_list[0])
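MeanSquaredError computes exactly what the name says; the scalar it returns should agree with the direct NumPy expression over the same vectors:

import numpy

numpy.random.seed(17)
ground_truth = numpy.random.randn(100)
predicted = numpy.random.randn(100)
print(numpy.mean((predicted - ground_truth) ** 2))  # the MSE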
#!/usr/bin/env python from tools.load import LoadMatrix from numpy import random lm=LoadMatrix() N = 100 random.seed(17) ground_truth = abs(random.randn(N)) predicted = abs(random.randn(N)) parameter_list = [[ground_truth,predicted]] def evaluation_meansquaredlogerror_modular (ground_truth, predicted): from modshogun import RegressionLabels from modshogun import MeanSquaredLogError ground_truth_labels = RegressionLabels(ground_truth) predicted_labels = RegressionLabels(predicted) evaluator = MeanSquaredLogError() mse = evaluator.evaluate(predicted_labels,ground_truth_labels) return mse if __name__=='__main__': print('EvaluationMeanSquaredLogError') evaluation_meansquaredlogerror_modular(*parameter_list[0])
# In this example a multiclass accuracy is being computed for toy data labels # and toy data labels multiplied by two. #!/usr/bin/env python from tools.load import LoadMatrix from numpy import random lm=LoadMatrix() random.seed(17) ground_truth = lm.load_labels('../data/label_train_multiclass.dat') predicted = lm.load_labels('../data/label_train_multiclass.dat') * 2 parameter_list = [[ground_truth,predicted]] def evaluation_multiclassaccuracy_modular (ground_truth, predicted): from modshogun import MulticlassLabels from modshogun import MulticlassAccuracy ground_truth_labels = MulticlassLabels(ground_truth) predicted_labels = MulticlassLabels(predicted) evaluator = MulticlassAccuracy() accuracy = evaluator.evaluate(predicted_labels,ground_truth_labels) return accuracy if __name__=='__main__': print('MulticlassAccuracy') evaluation_multiclassaccuracy_modular(*parameter_list[0])
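Multiclass accuracy is just the fraction of positions where the predicted label equals the ground-truth label, so the evaluator's result should match a one-line NumPy check:

import numpy

ground_truth = numpy.array([0.0, 1.0, 2.0, 1.0])
predicted = numpy.array([0.0, 2.0, 2.0, 1.0])
print(numpy.mean(predicted == ground_truth))  # 0.75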
#!/usr/bin/env python traindat = '../data/fm_train_real.dat' label_traindat = '../data/label_train_multiclass.dat' parameter_list = [[traindat, label_traindat]] def evaluation_multiclassovrevaluation_modular(train_fname=traindat, label_fname=label_traindat): from modshogun import MulticlassOVREvaluation,ROCEvaluation from modshogun import MulticlassLibLinear,RealFeatures,ContingencyTableEvaluation,ACCURACY from modshogun import MulticlassLabels, Math, CSVFile Math.init_random(1) ground_truth_labels = MulticlassLabels(CSVFile(label_fname)) svm = MulticlassLibLinear(1.0,RealFeatures(CSVFile(train_fname)),ground_truth_labels) svm.parallel.set_num_threads(1) svm.train() predicted_labels = svm.apply() binary_evaluator = ROCEvaluation() evaluator = MulticlassOVREvaluation(binary_evaluator) mean_roc = evaluator.evaluate(predicted_labels,ground_truth_labels) #print mean_roc binary_evaluator = ContingencyTableEvaluation(ACCURACY) evaluator = MulticlassOVREvaluation(binary_evaluator) mean_accuracy = evaluator.evaluate(predicted_labels,ground_truth_labels) #print mean_accuracy return mean_roc, mean_accuracy, predicted_labels, svm if __name__=='__main__': print('MulticlassOVREvaluation') evaluation_multiclassovrevaluation_modular(*parameter_list[0])
# In this example the PRC (precision-recall curve) is computed # for a pair of ground-truth toy labels and random predictions. # The PRC curve (as a matrix) and the auPRC (area under the PRC) are returned. #!/usr/bin/env python from tools.load import LoadMatrix from numpy import random lm=LoadMatrix() ground_truth = lm.load_labels('../data/label_train_twoclass.dat') random.seed(17) predicted = random.randn(len(ground_truth)) parameter_list = [[ground_truth,predicted]] def evaluation_prcevaluation_modular (ground_truth, predicted): from modshogun import BinaryLabels from modshogun import PRCEvaluation ground_truth_labels = BinaryLabels(ground_truth) predicted_labels = BinaryLabels(predicted) evaluator = PRCEvaluation() evaluator.evaluate(predicted_labels,ground_truth_labels) return evaluator.get_PRC(), evaluator.get_auPRC() if __name__=='__main__': print('PRCEvaluation') evaluation_prcevaluation_modular(*parameter_list[0])
# In this example the ROC (Receiver Operating Characteristic) curve is computed # for a pair of ground-truth toy labels and random predictions. # The ROC curve (as a matrix) and the auROC (area under the ROC curve) are returned. #!/usr/bin/env python from tools.load import LoadMatrix from numpy import random lm=LoadMatrix() ground_truth = lm.load_labels('../data/label_train_twoclass.dat') random.seed(17) predicted = random.randn(len(ground_truth)) parameter_list = [[ground_truth,predicted]] def evaluation_rocevaluation_modular (ground_truth, predicted): from modshogun import BinaryLabels from modshogun import ROCEvaluation ground_truth_labels = BinaryLabels(ground_truth) predicted_labels = BinaryLabels(predicted) evaluator = ROCEvaluation() evaluator.evaluate(predicted_labels,ground_truth_labels) return evaluator.get_ROC(), evaluator.get_auROC() if __name__=='__main__': print('ROCEvaluation') evaluation_rocevaluation_modular(*parameter_list[0])
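Conceptually, the ROC curve is traced by sweeping a decision threshold over the real-valued outputs and recording the true-positive and false-positive rates at each step. A hedged NumPy sketch of that construction (illustrative only, not shogun's implementation):

import numpy

def roc_points(scores, labels):
    # labels in {-1, +1}; one (FPR, TPR) point per threshold, thresholds swept high to low
    order = numpy.argsort(-scores)
    sorted_labels = labels[order]
    tpr = numpy.cumsum(sorted_labels > 0, dtype=float) / numpy.sum(sorted_labels > 0)
    fpr = numpy.cumsum(sorted_labels < 0, dtype=float) / numpy.sum(sorted_labels < 0)
    return fpr, tpr

scores = numpy.array([0.9, 0.3, -0.2, -0.8])
labels = numpy.array([1.0, -1.0, 1.0, -1.0])
print(roc_points(scores, labels))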
#!/usr/bin/env python parameter_list = [[1000]] def evaluation_thresholds_modular (index): from modshogun import BinaryLabels, ROCEvaluation import numpy numpy.random.seed(17) output=numpy.arange(-1,1,0.001) output=(0.3*output+0.7*(numpy.random.rand(len(output))-0.5)) label=[-1.0]*(len(output)//2) label.extend([1.0]*(len(output)//2)) label=numpy.array(label) pred=BinaryLabels(output) truth=BinaryLabels(label) evaluator=ROCEvaluation() evaluator.evaluate(pred, truth) [fp,tp]=evaluator.get_ROC() thresh=evaluator.get_thresholds() b=thresh[index] #print("tpr", numpy.mean(output[label>0]>b), tp[index]) #print("fpr", numpy.mean(output[label<0]>b), fp[index]) return tp[index],fp[index],numpy.mean(output[label>0]>b),numpy.mean(output[label<0]>b) if __name__=='__main__': print('Evaluation with Thresholds') evaluation_thresholds_modular(*parameter_list[0])
#!/usr/bin/env python import numpy matrix=numpy.array([[-1.0,0,1],[2,3,4],[5,6,7]]) bins=numpy.array([[0.0, 0.0, 0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0,3.0],[4.0,4.0,4.0]]) parameter_list = [(matrix,bins)] def features_binned_dot_modular (matrix, bins): from modshogun import RealFeatures, BinnedDotFeatures rf=RealFeatures(matrix) #print(rf.get_feature_matrix()) bf=BinnedDotFeatures(rf, bins) filled=bf.get_computed_dot_feature_matrix() bf.set_fill(False) unfilled=bf.get_computed_dot_feature_matrix() bf.set_norm_one(True) unfilled_normed=bf.get_computed_dot_feature_matrix() bf.set_fill(True) filled_normed=bf.get_computed_dot_feature_matrix() return bf,filled,unfilled,unfilled_normed,filled_normed if __name__=='__main__': print('BinnedDotFeatures') features_binned_dot_modular(*parameter_list[0])
#!/usr/bin/env python import numpy # create dense matrix A A=numpy.array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=numpy.uint8) parameter_list=[[A]] def features_dense_byte_modular (A): from modshogun import ByteFeatures # create dense features a # ... of type Byte a=ByteFeatures(A) # print(some statistics about a) #print(a.get_num_vectors()) #print(a.get_num_features()) # get first feature vector and set it #print(a.get_feature_vector(0)) a.set_feature_vector(numpy.array([1,4,0,0,0,9], dtype=numpy.uint8), 0) # get matrix a_out = a.get_feature_matrix() #print(type(a_out), a_out.dtype) #print(a_out ) assert(numpy.all(a_out==A)) return a_out,a if __name__=='__main__': print('ByteFeatures') features_dense_byte_modular(*parameter_list[0])
#!/usr/bin/env python parameter_list=[[]] def features_dense_io_modular(): from modshogun import RealFeatures, CSVFile feats=RealFeatures() f=CSVFile("../data/fm_train_real.dat","r") f.set_delimiter(" ") feats.load(f) return feats if __name__=='__main__': print('Dense Real Features IO') features_dense_io_modular(*parameter_list[0])
#!/usr/bin/env python from modshogun import LongIntFeatures from numpy import array, int64, all # create dense matrix A matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64) parameter_list = [[matrix]] # ... of type LongInt def features_dense_longint_modular (A=matrix): a=LongIntFeatures(A) # get first feature vector and set it a.set_feature_vector(array([1,4,0,0,0,9], dtype=int64), 0) # get matrix a_out = a.get_feature_matrix() assert(all(a_out==A)) return a_out if __name__=='__main__': print('dense_longint') features_dense_longint_modular(*parameter_list[0])
#!/usr/bin/env python from modshogun import RealFeatures, LongIntFeatures, ByteFeatures from numpy import array, float64, int64, uint8, all # create dense matrices A,B,C matrixA=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64) matrixB=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64) matrixC=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8) # ... of type Real, LongInt and Byte parameter_list = [[matrixA,matrixB,matrixC]] def features_dense_modular (A=matrixA,B=matrixB,C=matrixC): a=RealFeatures(A) b=LongIntFeatures(B) c=ByteFeatures(C) # or 16bit wide ... #feat1 = f.ShortFeatures(N.zeros((10,5),N.short)) #feat2 = f.WordFeatures(N.zeros((10,5),N.uint16)) # print(some statistics about a) # get first feature vector and set it a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0) # get matrices a_out = a.get_feature_matrix() b_out = b.get_feature_matrix() c_out = c.get_feature_matrix() assert(all(a_out==A)) assert(all(b_out==B)) assert(all(c_out==C)) return a_out,b_out,c_out,a,b,c if __name__=='__main__': print('dense') features_dense_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
from modshogun import RealFeatures
from modshogun import LongIntFeatures
from numpy import array, float64, int64

# create dense matrix
data=[[1,2,3],[4,5,6],[7,8,9],[-1,-2,-3]]

parameter_list = [[data]]

def features_dense_protocols_modular (in_data=data):
    m_real=array(in_data, dtype=float64, order='F')
    f_real=RealFeatures(m_real)

    #print m_real
    #print f_real

    #print f_real[-1]
    #print f_real[1, 2]
    #print f_real[-1:3]
    #print f_real[2, 0:2]
    #print f_real[0:3, 1]
    #print f_real[0:3, 1:2]
    #print f_real[:,1]
    #print f_real[1,:]

    #print m_real[-2]
    f_real[-1]=m_real[-2]
    #print f_real[-1]

    #print m_real[0, 1]
    f_real[1,2]=m_real[0,1]
    #print f_real[1, 2]

    #print m_real[0:2]
    f_real[1:3]=m_real[0:2]
    #print f_real[1:3]

    #print m_real[0, 0:2]
    f_real[2, 0:2]=m_real[0,0:2]
    #print f_real[2, 0:2]

    #print m_real[0:3, 2]
    f_real[0:3,1]=m_real[0:3, 2]
    #print f_real[0:3, 1]

    #print m_real[0:3, 0:1]
    f_real[0:3,1:2]=m_real[0:3,0:1]
    #print f_real[0:3, 1:2]

    f_real[:,0]=0
    #print f_real.get_feature_matrix()

    # compare version components numerically; a plain string comparison
    # would mis-order e.g. '1.10' and '1.5'
    if tuple(map(int, numpy.__version__.split('.')[:2])) >= (1, 5):
        f_real+=m_real
        f_real*=m_real
        f_real-=m_real
    else:
        print("numpy version >= 1.5 is needed")
        return None

    f_real+=f_real
    f_real*=f_real
    f_real-=f_real
    #print f_real
    #print f_real.get_feature_matrix()

    try:
        mem_real=memoryview(f_real)
    except NameError:
        print("Python 2.7 or later is needed for the memoryview class")
        return

    ret_real=array(f_real)
    #print ret_real

    return f_real[:,0]

if __name__=='__main__':
    print('dense_protocols')
    features_dense_protocols_modular(*parameter_list[0])
#!/usr/bin/env python
from modshogun import RealFeatures
from numpy import array, float64, all

# create dense matrix A
matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)

parameter_list = [[matrix]]

def features_dense_real_modular (A=matrix):
    # create dense features a ... of type Real
    a=RealFeatures(A)

    # print some statistics about a
    #print(a.get_num_vectors())
    #print(a.get_num_features())

    # get first feature vector and set it
    #print(a.get_feature_vector(0))
    a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)

    # get matrix
    a_out = a.get_feature_matrix()

    assert(all(a_out==A))
    return a_out

if __name__=='__main__':
    print('dense_real')
    features_dense_real_modular(*parameter_list[0])
#!/usr/bin/env python
import numpy
from modshogun import RealFeatures
from numpy import array, float64, int64

# create dense matrix
data=[[1,2,3],[4,5,6],[7,8,9],[-1,-2,-3]]

parameter_list = [[data]]

def features_dense_zero_copy_modular (in_data=data):
    feats = None
    # compare version components numerically; a plain string comparison
    # would mis-order e.g. '1.10' and '1.5'
    if tuple(map(int, numpy.__version__.split('.')[:2])) >= (1, 5):
        feats=numpy.array(in_data, dtype=float64, order='F')

        a=RealFeatures()
        a.frombuffer(feats, False)

        b=numpy.array(a, copy=False)
        c=numpy.array(a, copy=True)

        d=RealFeatures()
        d.frombuffer(a, False)

        e=RealFeatures()
        e.frombuffer(a, True)

        a[:,0]=0
        #print a[0:4]
        #print b[0:4]
        #print c[0:4]
        #print d[0:4]
        #print e[0:4]
    else:
        print("numpy version >= 1.5 is needed")

    return feats

if __name__=='__main__':
    print('dense_zero_copy')
    features_dense_zero_copy_modular(*parameter_list[0])
#!/usr/bin/env python import numpy from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') label_traindat = lm.load_labels('../data/label_train_twoclass.dat') parameter_list = [[traindat,testdat,label_traindat,0.9,1e-3],[traindat,testdat,label_traindat,0.8,1e-2]] def features_director_dot_modular (fm_train_real, fm_test_real, label_train_twoclass, C, epsilon): try: from modshogun import DirectorDotFeatures from modshogun import RealVector except ImportError: print("recompile shogun with --enable-swig-directors") return class NumpyFeatures(DirectorDotFeatures): # variables data=numpy.empty((1,1)) # constructor def __init__(self, d): DirectorDotFeatures.__init__(self) self.data = d # overloaded methods def add_to_dense_sgvec(self, alpha, vec_idx1, vec2, abs): if abs: vec2+=alpha*numpy.abs(self.data[:,vec_idx1]) else: vec2+=alpha*self.data[:,vec_idx1] def dot(self, vec_idx1, df, vec_idx2): return numpy.dot(self.data[:,vec_idx1], df.get_computed_dot_feature_vector(vec_idx2)) def dense_dot_sgvec(self, vec_idx1, vec2): return numpy.dot(self.data[:,vec_idx1], vec2[0:vec2.vlen]) def get_num_vectors(self): return self.data.shape[1] def get_dim_feature_space(self): return self.data.shape[0] # operators # def __add__(self, other): # return NumpyFeatures(self.data+other.data) # def __sub__(self, other): # return NumpyFeatures(self.data-other.data) # def __iadd__(self, other): # return NumpyFeatures(self.data+other.data) # def __isub__(self, other): # return NumpyFeatures(self.data-other.data) #from modshogun import RealFeatures, SparseRealFeatures, BinaryLabels #from modshogun import LibLinear, L2R_L2LOSS_SVC_DUAL #from modshogun import Math_init_random #Math_init_random(17) #feats_train=RealFeatures(fm_train_real) #feats_test=RealFeatures(fm_test_real) #labels=BinaryLabels(label_train_twoclass) #dfeats_train=NumpyFeatures(fm_train_real) #dfeats_test=NumpyFeatures(fm_test_real) #dlabels=BinaryLabels(label_train_twoclass) #print feats_train.get_computed_dot_feature_matrix() #print dfeats_train.get_computed_dot_feature_matrix() #svm=LibLinear(C, feats_train, labels) #svm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL) #svm.set_epsilon(epsilon) #svm.set_bias_enabled(True) #svm.train() #svm.set_features(feats_test) #svm.apply().get_labels() #predictions = svm.apply() #dfeats_train.__disown__() #dfeats_train.parallel.set_num_threads(1) #dsvm=LibLinear(C, dfeats_train, dlabels) #dsvm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL) #dsvm.set_epsilon(epsilon) #dsvm.set_bias_enabled(True) #dsvm.train() #dfeats_test.__disown__() #dfeats_test.parallel.set_num_threads(1) #dsvm.set_features(dfeats_test) #dsvm.apply().get_labels() #dpredictions = dsvm.apply() #return predictions, svm, predictions.get_labels() if __name__=='__main__': print('DirectorLinear') features_director_dot_modular(*parameter_list[0])
#!/usr/bin/env python strings=['hey','guys','i','am','a','string'] parameter_list=[[strings]] def features_hasheddocdot_modular(strings): from modshogun import StringCharFeatures, RAWBYTE from modshogun import HashedDocDotFeatures from modshogun import NGramTokenizer from numpy import array #create string features f=StringCharFeatures(strings, RAWBYTE) #set the number of bits of the target dimension #means a dim of size 2^5=32 num_bits=5 #create the ngram tokenizer of size 8 to parse the strings tokenizer=NGramTokenizer(8) #normalize results normalize=True #create HashedDocDot features hddf=HashedDocDotFeatures(num_bits, f, tokenizer, normalize) #should expect 32 #print('Feature space dimensionality is', hddf.get_dim_feature_space()) #print('Self dot product of string 0', hddf.dot(0, hddf, 0)) return hddf if __name__=='__main__': print('HashedDocDotFeatures') features_hasheddocdot_modular(*parameter_list[0])
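# A minimal sketch of the hashing idea above, using Python's built-in hash()
# as a stand-in (Shogun uses its own hash function, and its tokenization and
# normalization details may differ): every 8-gram is hashed into one of
# 2^num_bits buckets and the bucket counts form the feature vector.
def hashed_ngram_vector(text, n=8, num_bits=5):
    dim=2**num_bits
    vec=[0]*dim
    for i in range(max(len(text)-n+1, 1)):
        vec[hash(text[i:i+n]) % dim]+=1
    return vec

v=hashed_ngram_vector('hey guys i am a string')
print(len(v))  # 32, matching get_dim_feature_space() above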
# This example shows how to read and write plain ascii files, binary files and # hdf5 datasets. # # For ascii files it shows how to obtain shogun's RealFeatures # (a simple feature matrix of doubles with 1 column == 1 example, nr_columns == # number of examples) and also sparse features in SVM light format. # # Binary files use some custom native format and datasets can be read/written # from/to hdf5 files with arbitrary group / path. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() data=lm.load_numbers('../data/fm_train_real.dat') label=lm.load_numbers('../data/label_train_twoclass.dat') parameter_list=[[data,label]] def features_io_modular (fm_train_real, label_train_twoclass): import numpy from modshogun import SparseRealFeatures, RealFeatures, MulticlassLabels from modshogun import GaussianKernel from modshogun import LibSVMFile, CSVFile, BinaryFile, HDF5File feats=SparseRealFeatures(fm_train_real) feats2=SparseRealFeatures() f=BinaryFile("tmp/fm_train_sparsereal.bin","w") feats.save(f) f=LibSVMFile("tmp/fm_train_sparsereal.ascii","w") feats.save(f) f=BinaryFile("tmp/fm_train_sparsereal.bin") feats2.load(f) f=LibSVMFile("tmp/fm_train_sparsereal.ascii") feats2.load(f) feats=RealFeatures(fm_train_real) feats2=RealFeatures() f=BinaryFile("tmp/fm_train_real.bin","w") feats.save(f) f=HDF5File("tmp/fm_train_real.h5","w", "/data/doubles") feats.save(f) f=CSVFile("tmp/fm_train_real.ascii","w") feats.save(f) f=BinaryFile("tmp/fm_train_real.bin") feats2.load(f) #print("diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))) f=CSVFile("tmp/fm_train_real.ascii") feats2.load(f) #print("diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))) lab=MulticlassLabels(numpy.array([0.0,1.0,2.0,3.0])) lab2=MulticlassLabels() f=CSVFile("tmp/label_train_twoclass.ascii","w") lab.save(f) f=BinaryFile("tmp/label_train_twoclass.bin","w") lab.save(f) f=HDF5File("tmp/label_train_real.h5","w", "/data/labels") lab.save(f) f=CSVFile("tmp/label_train_twoclass.ascii") lab2.load(f) f=BinaryFile("tmp/label_train_twoclass.bin") lab2.load(f) f=HDF5File("tmp/fm_train_real.h5","r", "/data/doubles") feats2.load(f) #print(feats2.get_feature_matrix()) f=HDF5File("tmp/label_train_real.h5","r", "/data/labels") lab2.load(f) #print(lab2.get_labels()) #clean up import os for f in ['tmp/fm_train_sparsereal.bin','tmp/fm_train_sparsereal.ascii', 'tmp/fm_train_real.bin','tmp/fm_train_real.h5','tmp/fm_train_real.ascii', 'tmp/label_train_real.h5', 'tmp/label_train_twoclass.ascii','tmp/label_train_twoclass.bin']: os.unlink(f) return feats, feats2, lab, lab2 if __name__=='__main__': print('Features IO') features_io_modular(*parameter_list[0])
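# Note: the IO example above writes into a tmp/ subdirectory and assumes that
# it already exists; a small helper to create it beforehand:
import os
if not os.path.exists('tmp'):
    os.makedirs('tmp')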
# This example demonstrates how to read and write data in the SVMLight format
# with Shogun.
#
#!/usr/bin/env python
parameter_list=[['../data/train_sparsereal.light']]

def features_read_svmlight_format_modular (fname):
    import os
    from modshogun import SparseRealFeatures
    from modshogun import LibSVMFile

    f=SparseRealFeatures()
    lab=f.load_with_labels(LibSVMFile(fname))
    f.save_with_labels(LibSVMFile('tmp/testwrite.light', 'w'), lab)
    os.unlink('tmp/testwrite.light')

if __name__=='__main__':
    print('Reading SVMLIGHT format')
    features_read_svmlight_format_modular(*parameter_list[0])
# Creates features similar to the feature space of the SNP kernel. Useful when # working with linear methods. #!/usr/bin/env python parameter_list=[['../data/snps.dat']] def features_snp_modular (fname): from modshogun import StringByteFeatures, SNPFeatures, SNP sf=StringByteFeatures(SNP) sf.load_ascii_file(fname, False, SNP, SNP) #print(sf.get_features()) snps=SNPFeatures(sf) #print(snps.get_feature_matrix()) #print(snps.get_minor_base_string()) #print(snps.get_major_base_string()) if __name__=='__main__': print('SNP Features') features_snp_modular(*parameter_list[0])
# This example demonstrates how to encode sparse (most entries zero),
# real-valued features in shogun using SparseRealFeatures.
#!/usr/bin/env python
import numpy

# create dense matrix A
A=numpy.array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=numpy.float64)

parameter_list=[[A]]

def features_sparse_modular (A):
    from scipy.sparse import csc_matrix
    from modshogun import SparseRealFeatures
    from numpy import array, float64, all

    # sparse representation X of dense matrix A
    # note, will work with types other than float64 too,
    # but requires recent scipy.sparse
    X=csc_matrix(A)
    #print(A)

    # create sparse shogun features from dense matrix A
    a=SparseRealFeatures(A)
    a_out=a.get_full_feature_matrix()
    #print(a_out)
    assert(all(a_out==A))

    # create sparse shogun features from sparse matrix X
    a.set_sparse_feature_matrix(X)
    a_out=a.get_full_feature_matrix()
    #print(a_out)
    assert(all(a_out==A))

    # create sparse shogun features from sparse matrix X
    a=SparseRealFeatures(X)
    a_out=a.get_full_feature_matrix()
    #print(a_out)
    assert(all(a_out==A))

    # obtain (data,row,indptr) csc arrays of sparse shogun features
    z=csc_matrix(a.get_sparse_feature_matrix())
    z_out=z.todense()
    #print(z_out)
    assert(all(z_out==A))

if __name__=='__main__':
    print('Sparse Features')
    features_sparse_modular(*parameter_list[0])
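# For reference, the raw CSC arrays mentioned at the end of the example can be
# inspected directly with scipy, independent of Shogun:
import numpy
from scipy.sparse import csc_matrix

A=numpy.array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=numpy.float64)
X=csc_matrix(A)
print(X.data)    # non-zero values, stored column by column
print(X.indices) # row index of each stored value
print(X.indptr)  # start offset of each column within data/indices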
# This example demonstrates how to use compressed strings with shogun.
# We currently support reading and writing compressed files using
# LZO, GZIP, BZIP2 and LZMA. Furthermore, we demonstrate how to extract
# compressed streams on-the-fly in order to fit data sets into
# memory that would otherwise be too large.
#
#!/usr/bin/env python
parameter_list = [['features_string_char_compressed_modular.py']]

def features_string_char_compressed_modular (fname):
    from modshogun import StringCharFeatures, StringFileCharFeatures, RAWBYTE
    from modshogun import UNCOMPRESSED,SNAPPY,LZO,GZIP,BZIP2,LZMA, MSG_DEBUG
    from modshogun import DecompressCharString

    f=StringFileCharFeatures(fname, RAWBYTE)
    #print("original strings", f.get_features())

    #uncompressed
    f.save_compressed("tmp/foo_uncompressed.str", UNCOMPRESSED, 1)
    f2=StringCharFeatures(RAWBYTE)
    f2.load_compressed("tmp/foo_uncompressed.str", True)
    #print("uncompressed strings", f2.get_features())

    # load compressed data and uncompress on load

    #snappy - not stable yet?!
    #f.save_compressed("tmp/foo_snappy.str", SNAPPY, 9)
    #f2=StringCharFeatures(RAWBYTE)
    #f2.load_compressed("tmp/foo_snappy.str", True)
    #print("snappy strings", f2.get_features())

    #lzo
    f.save_compressed("tmp/foo_lzo.str", LZO, 9)
    f2=StringCharFeatures(RAWBYTE)
    f2.load_compressed("tmp/foo_lzo.str", True)
    #print("lzo strings", f2.get_features())

    #gzip
    f.save_compressed("tmp/foo_gzip.str", GZIP, 9)
    f2=StringCharFeatures(RAWBYTE)
    f2.load_compressed("tmp/foo_gzip.str", True)
    #print("gzip strings", f2.get_features())

    #bzip2
    f.save_compressed("tmp/foo_bzip2.str", BZIP2, 9)
    f2=StringCharFeatures(RAWBYTE)
    f2.load_compressed("tmp/foo_bzip2.str", True)
    #print("bzip2 strings", f2.get_features())

    #lzma
    f.save_compressed("tmp/foo_lzma.str", LZMA, 9)
    f2=StringCharFeatures(RAWBYTE)
    f2.load_compressed("tmp/foo_lzma.str", True)
    #print("lzma strings", f2.get_features())

    # load compressed data and uncompress via preprocessor
    f2=StringCharFeatures(RAWBYTE)
    f2.load_compressed("tmp/foo_lzo.str", False)
    f2.add_preprocessor(DecompressCharString(LZO))
    f2.apply_preprocessor()
    #print("lzo strings", f2.get_features())

    # load compressed data and uncompress on-the-fly via preprocessor
    f2=StringCharFeatures(RAWBYTE)
    f2.load_compressed("tmp/foo_lzo.str", False)
    #f2.io.set_loglevel(MSG_DEBUG)
    f2.add_preprocessor(DecompressCharString(LZO))
    f2.enable_on_the_fly_preprocessing()
    #print("lzo strings", f2.get_features())

    #clean up
    import os
    for f in ['tmp/foo_uncompressed.str', 'tmp/foo_snappy.str', 'tmp/foo_lzo.str',
            'tmp/foo_gzip.str', 'tmp/foo_bzip2.str', 'tmp/foo_lzma.str']:
        if os.path.exists(f):
            os.unlink(f)

##########################################################################################
# some perfectly compressible stuff follows
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################

if __name__=='__main__':
    print('Compressing StringCharFileFeatures')
    features_string_char_compressed_modular(*parameter_list[0])
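# The effect of the different codecs above can be checked by comparing file
# sizes (a small sketch; it has to run before the example's clean-up loop
# removes the tmp/ files):
import os
for name in ['tmp/foo_uncompressed.str', 'tmp/foo_lzo.str',
        'tmp/foo_gzip.str', 'tmp/foo_bzip2.str', 'tmp/foo_lzma.str']:
    if os.path.exists(name):
        print(name, os.path.getsize(name), 'bytes')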
# This example demonstrates how to encode ASCII-strings (255 symbols) in shogun. #!/usr/bin/env python strings=['hey','guys','i','am','a','string'] parameter_list=[[strings]] def features_string_char_modular (strings): from modshogun import StringCharFeatures, RAWBYTE from numpy import array #create string features f=StringCharFeatures(strings, RAWBYTE) #and output several stats #print("max string length", f.get_max_vector_length()) #print("number of strings", f.get_num_vectors()) #print("length of first string", f.get_vector_length(0)) #print("string[5]", ''.join(f.get_feature_vector(5))) #print("strings", f.get_features()) #replace string 0 f.set_feature_vector(array(['t','e','s','t']), 0) #print("strings", f.get_features()) return f.get_features(), f if __name__=='__main__': print('StringCharFeatures') features_string_char_modular(*parameter_list[0])
# This example demonstrates how to load ASCII features from a file into shogun.
#!/usr/bin/env python
parameter_list = [['features_string_file_char_modular.py']]

def features_string_file_char_modular (fname):
    from modshogun import StringFileCharFeatures, RAWBYTE
    f = StringFileCharFeatures(fname, RAWBYTE)
    #print("strings", f.get_features())
    return f

if __name__=='__main__':
    print('StringFileCharFeatures')
    features_string_file_char_modular(*parameter_list[0])
# This example demonstrates how to load string features from files.
# We cover two cases: First, we show how to obtain StringCharFeatures
# from a directory of text files (particularly useful in computational biology)
# and second, we demonstrate how to load StringCharFeatures from one (multi-line) file.
#
#!/usr/bin/env python
parameter_list=[[".", "features_string_char_modular.py"]]

def features_string_file_modular (directory, fname):
    from modshogun import StringCharFeatures, RAWBYTE
    from modshogun import CSVFile

    # load features from directory
    f=StringCharFeatures(RAWBYTE)
    f.load_from_directory(directory)

    #and output several stats
    #print("max string length", f.get_max_vector_length())
    #print("number of strings", f.get_num_vectors())
    #print("length of first string", f.get_vector_length(0))
    #print("str[0,0:3]", f.get_feature(0,0), f.get_feature(0,1), f.get_feature(0,2))
    #print("len(str[0])", f.get_vector_length(0))
    #print("str[0]", f.get_feature_vector(0))

    #or load features from file (one string per line)
    fil=CSVFile(fname)
    f.load(fil)
    #print(f.get_features())

    #or load fasta file
    #f.load_fasta('fasta.fa')
    #print(f.get_features())
    return f.get_features(), f

if __name__=='__main__':
    print('StringCharFeatures')
    features_string_file_modular(*parameter_list[0])
# This creates a HashedWDFeatures object, i.e. an approximation to the Weighted # Degree kernel feature space via hashes. These features can be particularly fast # in linear SVM solvers. #!/usr/bin/env python from modshogun import LongIntFeatures from numpy import array, int64, all # create dense matrix A matrix=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64) parameter_list = [[matrix,3,1,2],[matrix,3,1,2]] # ... of type LongInt def features_string_hashed_wd_modular (A=matrix,order=3,start_order=1,hash_bits=2): a=LongIntFeatures(A) from numpy import array, uint8 from modshogun import HashedWDFeatures, StringByteFeatures, RAWDNA from modshogun import MSG_DEBUG x=[array([0,1,2,3,0,1,2,3,3,2,2,1,1],dtype=uint8)] from_order=order f=StringByteFeatures(RAWDNA) #f.io.set_loglevel(MSG_DEBUG) f.set_features(x) y=HashedWDFeatures(f,start_order,order,from_order,hash_bits) fm=y.get_computed_dot_feature_matrix() return fm if __name__=='__main__': print('string_hashed_wd') features_string_hashed_wd_modular(*parameter_list[0])
# In this example, we demonstrate how to obtain string features
# by using a sliding window in a memory-efficient way. Instead of copying
# the string for each position of the sliding window, we only store a reference
# with respect to the complete string. This is particularly useful when working
# with genomic data, where storing all explicitly copied strings in memory
# quickly becomes infeasible. In addition to a sliding window (of a particular
# length) over all positions, we also support defining a custom position
# list.
#!/usr/bin/env python

# create string features with a single string
s=10*'A' + 10*'C' + 10*'G' + 10*'T'

parameter_list=[[s]]

def features_string_sliding_window_modular (strings):
    from modshogun import StringCharFeatures, DNA
    from modshogun import DynamicIntArray

    f=StringCharFeatures([strings], DNA)

    # slide a window of length 5 over features
    # (memory efficient, does not copy strings)
    f.obtain_by_sliding_window(5,1)
    #print(f.get_num_vectors())
    #print(f.get_vector_length(0))
    #print(f.get_vector_length(1))
    #print(f.get_features())

    # slide a window of length 4 over features
    # (memory efficient, does not copy strings)
    f.obtain_by_sliding_window(4,1)
    #print(f.get_num_vectors())
    #print(f.get_vector_length(0))
    #print(f.get_vector_length(1))
    #print(f.get_features())

    # extract string-windows at positions 0,6,16,25 of window size 4
    # (memory efficient, does not copy strings)
    f.set_features([s])
    positions=DynamicIntArray()
    positions.append_element(0)
    positions.append_element(6)
    positions.append_element(16)
    positions.append_element(25)

    f.obtain_by_position_list(4,positions)
    #print(f.get_features())

    # now extract windows of size 8 from the same position list
    f.obtain_by_position_list(8,positions)
    #print(f.get_features())

    return f

if __name__=='__main__':
    print('Sliding Window')
    features_string_sliding_window_modular(*parameter_list[0])
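# What obtain_by_sliding_window conceptually yields can be mimicked in plain
# Python; the real implementation stores only offsets into the original
# string instead of copies:
s=10*'A' + 10*'C' + 10*'G' + 10*'T'
windows=[s[i:i+5] for i in range(len(s)-5+1)]  # window length 5, step 1
print(len(windows), windows[:3])  # 36 windows, starting with 'AAAAA'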
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C},
# using 1 char = 1 byte per symbol would be wasteful, as we
# can encode 4 symbols using only 2 bits.
# Here, this is done in chunks of 64 bits (ulong).
#!/usr/bin/env python
parameter_list = [[0,2,0,False],[0,3,0,False]]

def features_string_ulong_modular (start=0,order=2,gap=0,rev=False):
    from modshogun import StringCharFeatures, StringUlongFeatures, RAWBYTE
    from numpy import array, uint64

    #create string features
    cf=StringCharFeatures(['hey','guys','string'], RAWBYTE)
    uf=StringUlongFeatures(RAWBYTE)
    uf.obtain_from_char(cf, start,order,gap,rev)

    #replace string 0
    uf.set_feature_vector(array([1,2,3,4,5], dtype=uint64), 0)

    return uf.get_features(),uf.get_feature_vector(2), uf.get_num_vectors()

if __name__=='__main__':
    print('simple_longint')
    features_string_ulong_modular(*parameter_list[0])
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C},
# using 1 char = 1 byte per symbol would be wasteful, as we
# can encode 4 symbols using only 2 bits.
# Here, this is done in chunks of 16 bits (word).
#!/usr/bin/env python
strings=['hey','guys','string']

parameter_list=[[strings,0,2,0,False]]

def features_string_word_modular (strings, start, order, gap, rev):
    from modshogun import StringCharFeatures, StringWordFeatures, RAWBYTE
    from numpy import array, uint16

    #create string features
    cf=StringCharFeatures(strings, RAWBYTE)
    wf=StringWordFeatures(RAWBYTE)
    wf.obtain_from_char(cf, start, order, gap, rev)

    #and output several stats
    #print("max string length", wf.get_max_vector_length())
    #print("number of strings", wf.get_num_vectors())
    #print("length of first string", wf.get_vector_length(0))
    #print("string[2]", wf.get_feature_vector(2))
    #print("strings", wf.get_features())

    #replace string 0
    wf.set_feature_vector(array([1,2,3,4,5], dtype=uint16), 0)

    #print("strings", wf.get_features())
    return wf.get_features(), wf

if __name__=='__main__':
    print('StringWordFeatures')
    features_string_word_modular(*parameter_list[0])
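# The bit-packing idea behind the two string-encoding examples above, as a
# plain-Python sketch (illustrative only; Shogun's symbol numbering and
# packing layout may differ): with a 4-symbol alphabet each symbol needs
# 2 bits, so a 64-bit ulong holds 32 symbols and a 16-bit word holds 8.
code={'A':0, 'C':1, 'G':2, 'T':3}

def pack(seq):
    value=0
    for sym in seq:
        value=(value<<2)|code[sym]  # append 2 bits per symbol
    return value

print(pack('ACGT'))  # 0b00011011 == 27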
# In this example the ANOVA kernel is being computed for toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat,2,10], [traindat,testdat,5,10]] def kernel_anova_modular (train_fname=traindat,test_fname=testdat,cardinality=2, size_cache=10): from modshogun import ANOVAKernel,RealFeatures,CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) kernel=ANOVAKernel(feats_train, feats_train, cardinality, size_cache) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train, km_test, kernel if __name__=='__main__': print('ANOVA') kernel_anova_modular(*parameter_list[0])
# This example demonstrates the use of the AUC Kernel, which # can be used to maximize AUC instead of margin in SVMs. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' label_traindat = '../data/label_train_twoclass.dat' parameter_list = [[traindat,label_traindat,1.7], [traindat,label_traindat,1.6]] def kernel_auc_modular (train_fname=traindat,label_fname=label_traindat,width=1.7): from modshogun import GaussianKernel, AUCKernel, RealFeatures from modshogun import BinaryLabels, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) subkernel=GaussianKernel(feats_train, feats_train, width) kernel=AUCKernel(0, subkernel) kernel.setup_auc_maximization(BinaryLabels(CSVFile(label_fname))) km_train=kernel.get_kernel_matrix() return kernel if __name__=='__main__': print('AUC') kernel_auc_modular(*parameter_list[0])
# In this example the Cauchy kernel is being computed for toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 10.0]] def kernel_cauchy_modular (train_fname=traindat,test_fname=testdat, sigma=1.0): from modshogun import RealFeatures, CauchyKernel, CSVFile, EuclideanDistance feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=EuclideanDistance(feats_train, feats_train) kernel=CauchyKernel(feats_train, feats_train, sigma, distance) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Cauchy') kernel_cauchy_modular(*parameter_list[0])
# This is an example for the initialization of the chi2-kernel on real data, where # each column of the matrices corresponds to one training/test example. #!/usr/bin/env python traindat = '../data/fm_train_hist.dat' testdat = '../data/fm_test_hist.dat' parameter_list = [[traindat,testdat,1.4,10], [traindat,testdat,1.5,10]] def kernel_chi2_modular (train_fname=traindat,test_fname=testdat,width=1.4, size_cache=10): from modshogun import RealFeatures, Chi2Kernel, CSVFile, NormOne feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) kernel=Chi2Kernel(feats_train, feats_train, width, size_cache) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Chi2') kernel_chi2_modular(*parameter_list[0])
# In this example the circular kernel is being computed for toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]] def kernel_circular_modular(train_fname=traindat,test_fname=testdat, sigma=1.0): from modshogun import RealFeatures, CircularKernel, EuclideanDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=EuclideanDistance(feats_train, feats_train) kernel=CircularKernel(feats_train, feats_train, sigma, distance) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Circular') kernel_circular_modular(*parameter_list[0])
# In this example the combined kernel of custom kernel and poly kernel is being computed for toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' label_traindat = '../data/label_train_twoclass.dat' parameter_list= [[traindat,testdat,label_traindat],[traindat,testdat,label_traindat]] def kernel_combined_custom_poly_modular (train_fname = traindat,test_fname = testdat,train_label_fname=label_traindat): from modshogun import CombinedFeatures, RealFeatures, BinaryLabels from modshogun import CombinedKernel, PolyKernel, CustomKernel from modshogun import LibSVM, CSVFile kernel = CombinedKernel() feats_train = CombinedFeatures() tfeats = RealFeatures(CSVFile(train_fname)) tkernel = PolyKernel(10,3) tkernel.init(tfeats, tfeats) K = tkernel.get_kernel_matrix() kernel.append_kernel(CustomKernel(K)) subkfeats_train = RealFeatures(CSVFile(train_fname)) feats_train.append_feature_obj(subkfeats_train) subkernel = PolyKernel(10,2) kernel.append_kernel(subkernel) kernel.init(feats_train, feats_train) labels = BinaryLabels(CSVFile(train_label_fname)) svm = LibSVM(1.0, kernel, labels) svm.train() kernel = CombinedKernel() feats_pred = CombinedFeatures() pfeats = RealFeatures(CSVFile(test_fname)) tkernel = PolyKernel(10,3) tkernel.init(tfeats, pfeats) K = tkernel.get_kernel_matrix() kernel.append_kernel(CustomKernel(K)) subkfeats_test = RealFeatures(CSVFile(test_fname)) feats_pred.append_feature_obj(subkfeats_test) subkernel = PolyKernel(10, 2) kernel.append_kernel(subkernel) kernel.init(feats_train, feats_pred) svm.set_kernel(kernel) svm.apply() km_train=kernel.get_kernel_matrix() return km_train,kernel if __name__=='__main__': kernel_combined_custom_poly_modular(*parameter_list[0])
# This is an example for the initialization of a combined kernel, i.e. a
# weighted sum of (in this case) three kernels on real-valued data. The
# sub-kernel weights are all set to 1.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()
traindat = double(lm.load_numbers('../data/fm_train_real.dat'))
testdat = double(lm.load_numbers('../data/fm_test_real.dat'))
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')

parameter_list = [[traindat,testdat,traindna,testdna],[traindat,testdat,traindna,testdna]]

def kernel_combined_modular (fm_train_real=traindat,fm_test_real=testdat,fm_train_dna=traindna,fm_test_dna=testdna):
    from modshogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from modshogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    kernel=CombinedKernel()
    feats_train=CombinedFeatures()
    feats_test=CombinedFeatures()

    subkfeats_train=RealFeatures(fm_train_real)
    subkfeats_test=RealFeatures(fm_test_real)
    subkernel=GaussianKernel(10, 1.1)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
    subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
    degree=3
    subkernel=FixedDegreeStringKernel(10, degree)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
    subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
    subkernel=LocalAlignmentStringKernel(10)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel

if __name__=='__main__':
    print('Combined')
    kernel_combined_modular(*parameter_list[0])
# This is an example for the initialization of the CommUlongString kernel. This kernel
# sums over k-mer matches (k='order'). For efficient computation a preprocessor is used
# that extracts and sorts all k-mers. If 'use_sign' is set, each k-mer is counted
# only once.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')

parameter_list = [[traindat,testdat,3,0,False],[traindat,testdat,4,0,False]]

def kernel_comm_ulong_string_modular (fm_train_dna=traindat,fm_test_dna=testdat, order=3, gap=0, reverse=False):
    from modshogun import CommUlongStringKernel
    from modshogun import StringUlongFeatures, StringCharFeatures, DNA
    from modshogun import SortUlongString

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train=StringUlongFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc=SortUlongString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test=StringUlongFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    use_sign=False

    kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel

if __name__=='__main__':
    print('CommUlongString')
    kernel_comm_ulong_string_modular(*parameter_list[0])
# This is an example for the initialization of the CommWordString kernel (aka
# Spectrum or n-gram kernel; its name is derived from the unix command comm). This kernel
# sums over k-mer matches (k='order'). For efficient computation a preprocessor is used
# that extracts and sorts all k-mers. If 'use_sign' is set, each k-mer is counted
# only once.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')

parameter_list = [[traindat,testdat,4,0,False,False],[traindat,testdat,4,0,False,False]]

def kernel_comm_word_string_modular (fm_train_dna=traindat, fm_test_dna=testdat, order=3, gap=0, reverse=False, use_sign=False):
    from modshogun import CommWordStringKernel
    from modshogun import StringWordFeatures, StringCharFeatures, DNA
    from modshogun import SortWordString

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train=StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc=SortWordString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test=StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel

if __name__=='__main__':
    print('CommWordString')
    kernel_comm_word_string_modular(*parameter_list[0])
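# The spectrum kernel computed above boils down to a dot product of k-mer
# count vectors; a minimal pure-Python sketch of that idea (ignoring the
# sorting preprocessor and the use_sign option):
from collections import Counter

def spectrum_kernel(s1, s2, k=3):
    c1=Counter(s1[i:i+k] for i in range(len(s1)-k+1))
    c2=Counter(s2[i:i+k] for i in range(len(s2)-k+1))
    # sum over shared k-mers of the product of their counts
    return sum(c1[kmer]*c2[kmer] for kmer in c1)

print(spectrum_kernel('ACGTACGT', 'ACGTTTTT'))  # 4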
# The constant kernel gives a trivial kernel matrix with all entries set to the same value # defined by the argument 'c'. # #!/usr/bin/env python parameter_list =[[23],[24]] def kernel_const_modular (c=23): from modshogun import DummyFeatures from modshogun import ConstKernel feats_train=DummyFeatures(10) feats_test=DummyFeatures(17) kernel=ConstKernel(feats_train, feats_train, c) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Const') kernel_const_modular(*parameter_list[0])
# A user-defined custom kernel is assigned in this example, for which either
# only the lower triangle may be given (set_triangle_kernel_matrix_from_triangle),
# or a full matrix (set_full_kernel_matrix_from_full), or a full matrix which is
# then internally stored as a triangle (set_triangle_kernel_matrix_from_full).
# A subset of the kernel rows and columns is then selected via IndexFeatures.
#
#!/usr/bin/env python
from numpy.random import seed
seed(42)

parameter_list=[[7],[8]]

def kernel_custom_modular (dim=7):
    from numpy.random import rand, seed
    from numpy import array, float32, int32
    from modshogun import RealFeatures
    from modshogun import CustomKernel
    from modshogun import IndexFeatures

    seed(17)
    data=rand(dim, dim)
    feats=RealFeatures(data)
    symdata=data+data.T
    lowertriangle=array([symdata[(x,y)] for x in range(symdata.shape[1])
        for y in range(symdata.shape[0]) if y<=x])

    kernel=CustomKernel()

    # once with float64's
    kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
    km_triangletriangle=kernel.get_kernel_matrix()

    kernel.set_triangle_kernel_matrix_from_full(symdata)
    km_fulltriangle=kernel.get_kernel_matrix()

    kernel.set_full_kernel_matrix_from_full(symdata)
    km_fullfull=kernel.get_kernel_matrix()

    # get subset of kernel
    row_idx=array(range(3),dtype=int32)
    col_idx=array(range(2),dtype=int32)
    row_idx_feat=IndexFeatures(row_idx)
    col_idx_feat=IndexFeatures(col_idx)
    kernel.init(row_idx_feat, col_idx_feat)
    km_sub_kernel=kernel.get_kernel_matrix()
    # print('Subkernel(3x2):\n%s'%km_sub_kernel)
    kernel.remove_all_col_subsets()
    kernel.remove_all_row_subsets()

    # now once with float32's
    data=array(data,dtype=float32)

    kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
    km_triangletriangle=kernel.get_kernel_matrix()

    kernel.set_triangle_kernel_matrix_from_full(symdata)
    km_fulltriangle=kernel.get_kernel_matrix()

    kernel.set_full_kernel_matrix_from_full(symdata)
    km_fullfull=kernel.get_kernel_matrix()

    return km_fullfull,kernel,km_sub_kernel

if __name__=='__main__':
    print('Custom')
    kernel_custom_modular(*parameter_list[0])
# This is an example for the initialization of the diag-kernel. # The diag kernel has all kernel matrix entries but those on # the main diagonal set to zero. #!/usr/bin/env python parameter_list =[[23],[24]] def kernel_diag_modular (diag=23): from modshogun import DummyFeatures from modshogun import DiagKernel feats_train=DummyFeatures(10) feats_test=DummyFeatures(17) kernel=DiagKernel(feats_train, feats_train, diag) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Diag') kernel_diag_modular(*parameter_list[0])
#!/usr/bin/env python import numpy from modshogun import RealFeatures, MSG_DEBUG traindat = numpy.random.random_sample((10,10)) testdat = numpy.random.random_sample((10,10)) parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.4]] def kernel_director_linear_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.2): try: from modshogun import DirectorKernel except ImportError: print("recompile shogun with --enable-swig-directors") return class DirectorLinearKernel(DirectorKernel): def __init__(self): DirectorKernel.__init__(self, True) def kernel_function(self, idx_a, idx_b): seq1 = self.get_lhs().get_feature_vector(idx_a) seq2 = self.get_rhs().get_feature_vector(idx_b) return numpy.dot(seq1, seq2) from modshogun import LinearKernel, AvgDiagKernelNormalizer from modshogun import Time feats_train=RealFeatures(fm_train_real) #feats_train.io.set_loglevel(MSG_DEBUG) feats_train.parallel.set_num_threads(1) feats_test=RealFeatures(fm_test_real) kernel=LinearKernel() kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) dkernel=DirectorLinearKernel() dkernel.set_normalizer(AvgDiagKernelNormalizer(scale)) dkernel.init(feats_train, feats_train) #print "km_train" t=Time() km_train=kernel.get_kernel_matrix() #t1=t.cur_time_diff(True) #print "dkm_train" t=Time() dkm_train=dkernel.get_kernel_matrix() #t2=t.cur_time_diff(True) #print "km_train", km_train #print "dkm_train", dkm_train return km_train, dkm_train if __name__=='__main__': print('DirectorLinear') kernel_director_linear_modular(*parameter_list[0])
# With the distance kernel one can use any of the following distance metrics:
# BrayCurtisDistance()
# CanberraMetric()
# CanberraWordDistance()
# ChebyshewMetric()
# ChiSquareDistance()
# CosineDistance()
# Distance()
# EuclidianDistance()
# GeodesicMetric()
# HammingWordDistance()
# JensenMetric()
# ManhattanMetric()
# ManhattanWordDistance()
# MinkowskiMetric()
# RealDistance()
# SimpleDistance()
# SparseDistance()
# SparseEuclidianDistance()
# StringDistance()
# TanimotoDistance()
#
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'

parameter_list=[[traindat,testdat,1.7],[traindat,testdat,1.8]]

def kernel_distance_modular (train_fname=traindat,test_fname=testdat,width=1.7):
    from modshogun import RealFeatures, DistanceKernel, EuclideanDistance, CSVFile

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))

    distance=EuclideanDistance()

    kernel=DistanceKernel(feats_train, feats_train, width, distance)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel

if __name__=='__main__':
    print('Distance')
    kernel_distance_modular(*parameter_list[0])
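# The distance kernel is assumed to turn any of the metrics listed above into
# a similarity via k(x,x') = exp(-d(x,x')/width); a numpy sketch of that
# mapping (illustrative, not the Shogun implementation):
import numpy

def distance_kernel_entry(x, y, width=1.7):
    d=numpy.linalg.norm(x-y)  # EuclideanDistance as the example metric
    return numpy.exp(-d/width)

print(distance_kernel_entry(numpy.array([1.0, 2.0]), numpy.array([2.0, 4.0])))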
# In this example the distant segments kernel is being computed for toy data. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindat,testdat,5,5],[traindat,testdat,6,6]] def kernel_distantsegments_modular (fm_train_dna=traindat,fm_test_dna=testdat,delta=5, theta=5): from modshogun import StringCharFeatures, DNA from modshogun import DistantSegmentsKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=DistantSegmentsKernel(feats_train, feats_train, 10, delta, theta) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train, km_test, kernel if __name__=='__main__': print('DistantSegments') kernel_distantsegments_modular(*parameter_list[0])
# In this example the exponential kernel is being computed for toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]] def kernel_exponential_modular (train_fname=traindat,test_fname=testdat, tau_coef=1.0): from modshogun import RealFeatures, ExponentialKernel, EuclideanDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance = EuclideanDistance(feats_train, feats_train) kernel=ExponentialKernel(feats_train, feats_train, tau_coef, distance, 10) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Exponential') kernel_exponential_modular(*parameter_list[0])
# The class FKFeatures implements Fisher kernel features obtained from
# two Hidden Markov models.
#
# It was used in
#
# K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new
# discriminative kernel from probabilistic models. Neural Computation,
# 14:2397-2414, 2002.
#
# which also has the details.
#
# Note that FK-features are computed on the fly, so to be effective feature
# caching should be enabled.
#
# It inherits its functionality from CSimpleFeatures, which should be
# consulted for further reference.
#
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import where
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')

parameter_list = [[traindat,testdat,label_traindat,1,4,1e-1,1,0,False,[1,False,True]],
        [traindat,testdat,label_traindat,3,4,1e-1,1,0,False,[1,False,True]]]

fm_hmm_pos=[ traindat[i] for i in where([label_traindat==1])[1] ]
fm_hmm_neg=[ traindat[i] for i in where([label_traindat==-1])[1] ]

def kernel_fisher_modular (fm_train_dna=traindat, fm_test_dna=testdat,
        label_train_dna=label_traindat, N=1, M=4, pseudo=1e-1,
        order=1, gap=0, reverse=False, kargs=[1,False,True]):
    from modshogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from modshogun import PolyKernel
    from modshogun import HMM, BW_NORMAL#, MSG_DEBUG

    # train HMM for positive class
    charfeat=StringCharFeatures(fm_hmm_pos, DNA)
    #charfeat.io.set_loglevel(MSG_DEBUG)
    hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
    hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    pos=HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    charfeat=StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
    hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    neg=HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    charfeat=StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

    # Kernel testing data
    charfeat=StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

    # get kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train=FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1) #estimate prior
    kernel=PolyKernel(feats_train, feats_train, *kargs)
    km_train=kernel.get_kernel_matrix()

    # get kernel on testing data
    pos_clone=HMM(pos)
    neg_clone=HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test=FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a()) #use prior from training data
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel

if __name__=='__main__':
    print("Fisher Kernel")
    kernel_fisher_modular(*parameter_list[0])
# The FixedDegree String kernel takes as input two strings of the same size
# and counts the number of matches of length d.
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')

parameter_list=[[traindat,testdat,3],[traindat,testdat,4]]

def kernel_fixed_degree_string_modular (fm_train_dna=traindat, fm_test_dna=testdat, degree=3):
    from modshogun import StringCharFeatures, DNA
    from modshogun import FixedDegreeStringKernel

    feats_train=StringCharFeatures(fm_train_dna, DNA)
    feats_test=StringCharFeatures(fm_test_dna, DNA)

    kernel=FixedDegreeStringKernel(feats_train, feats_train, degree)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel

if __name__=='__main__':
    print('FixedDegreeString')
    kernel_fixed_degree_string_modular(*parameter_list[0])
# The well known Gaussian kernel (swiss army knife for SVMs) on dense real valued features. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list=[[traindat,testdat, 1.3],[traindat,testdat, 1.4]] def kernel_gaussian_modular (train_fname=traindat,test_fname=testdat, width=1.3): from modshogun import RealFeatures, GaussianKernel, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) kernel=GaussianKernel(feats_train, feats_train, width) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Gaussian') kernel_gaussian_modular(*parameter_list[0])
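# For reference, a single entry of the Gaussian kernel above is assumed to
# follow k(x,y) = exp(-||x-y||^2/width), with width playing the role of
# 2*sigma^2; a quick numpy check (illustrative sketch):
import numpy

def gaussian_kernel_entry(x, y, width=1.3):
    return numpy.exp(-numpy.sum((x-y)**2)/width)

print(gaussian_kernel_entry(numpy.array([0.0, 0.0]), numpy.array([1.0, 1.0])))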
# An experimental kernel inspired by the WeightedDegreePositionStringKernel and the Gaussian kernel.
# The idea is to shift the dimensions of the input vectors against each other. 'shift_step' is the
# step size of the shifts and 'max_shift' is the maximal shift.
#!/usr/bin/env python
traindat = '../data/fm_train_real.dat'
testdat = '../data/fm_test_real.dat'

parameter_list=[[traindat,testdat,1.8,2,1],[traindat,testdat,1.9,2,1]]

def kernel_gaussian_shift_modular (train_fname=traindat,test_fname=testdat,width=1.8,max_shift=2,shift_step=1):
    from modshogun import RealFeatures, GaussianShiftKernel, CSVFile

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))

    kernel=GaussianShiftKernel(feats_train, feats_train, width, max_shift, shift_step)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel

if __name__=='__main__':
    print('GaussianShift')
    kernel_gaussian_shift_modular(*parameter_list[0])
# The HistogramWordString computes the TOP kernel on inhomogeneous Markov Chains. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') label_traindat = lm.load_labels('../data/label_train_dna.dat') parameter_list=[[traindat,testdat,label_traindat,1,1e1, 1e0],[traindat,testdat,label_traindat,1,1e4,1e4]] def kernel_histogram_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,order=3,ppseudo_count=1,npseudo_count=1): from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels from modshogun import HistogramWordStringKernel, AvgDiagKernelNormalizer from modshogun import PluginEstimate#, MSG_DEBUG charfeat=StringCharFeatures(DNA) #charfeat.io.set_loglevel(MSG_DEBUG) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, 0, False) charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, 0, False) pie=PluginEstimate(ppseudo_count,npseudo_count) labels=BinaryLabels(label_train_dna) pie.set_labels(labels) pie.set_features(feats_train) pie.train() kernel=HistogramWordStringKernel(feats_train, feats_train, pie) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) pie.set_features(feats_test) pie.apply().get_labels() km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('PluginEstimate w/ HistogramWord') kernel_histogram_word_string_modular(*parameter_list[0])
# In this example the inverse multiquadic kernel is being computed for toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]] def kernel_inversemultiquadric_modular (train_fname=traindat,test_fname=testdat, shift_coef=1.0): from modshogun import RealFeatures, InverseMultiQuadricKernel, EuclideanDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=EuclideanDistance(feats_train, feats_train) kernel=InverseMultiQuadricKernel(feats_train, feats_train, shift_coef, distance) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('InverseMultiquadric') kernel_inversemultiquadric_modular(*parameter_list[0])
# example on saving a kernel to a file #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list=[[traindat,testdat,1.9],[traindat,testdat,1.7]] def kernel_io_modular (train_fname=traindat,test_fname=testdat,width=1.9): from modshogun import RealFeatures, GaussianKernel, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) kernel=GaussianKernel(feats_train, feats_train, width) km_train=kernel.get_kernel_matrix() f=CSVFile("tmp/gaussian_train.csv","w") kernel.save(f) del f kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() f=CSVFile("tmp/gaussian_test.csv","w") kernel.save(f) del f #clean up import os os.unlink("tmp/gaussian_test.csv") os.unlink("tmp/gaussian_train.csv") return km_train, km_test, kernel if __name__=='__main__': print('Gaussian') kernel_io_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on raw byte # data. #!/usr/bin/env python traindat = '../data/fm_train_byte.dat' testdat = '../data/fm_test_byte.dat' parameter_list=[[traindat,testdat],[traindat,testdat]] def kernel_linear_byte_modular (train_fname=traindat,test_fname=testdat): from modshogun import LinearKernel, ByteFeatures, CSVFile feats_train=ByteFeatures(CSVFile(train_fname)) feats_test=ByteFeatures(CSVFile(test_fname)) kernel=LinearKernel(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return kernel if __name__=='__main__': print('LinearByte') kernel_linear_byte_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on real valued # data using scaling factor 1.2. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.4]] def kernel_linear_modular (train_fname=traindat,test_fname=testdat,scale=1.2): from modshogun import RealFeatures, LinearKernel, AvgDiagKernelNormalizer, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) kernel=LinearKernel() kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Linear') kernel_linear_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on string data. The # strings are all of the same length and consist of the characters 'ACGT' corresponding # to the DNA-alphabet. Each column of the matrices of type char corresponds to # one training/test example. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list=[[traindat,testdat],[traindat,testdat]] def kernel_linear_string_modular (fm_train_dna=traindat,fm_test_dna=testdat): from modshogun import StringCharFeatures, DNA from modshogun import LinearStringKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=LinearStringKernel(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': from tools.load import LoadMatrix print('LinearString') kernel_linear_string_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on word (2byte) # data. #!/usr/bin/env python from tools.load import LoadMatrix from numpy import ushort lm=LoadMatrix() traindat = ushort(lm.load_numbers('../data/fm_train_word.dat')) testdat = ushort(lm.load_numbers('../data/fm_test_word.dat')) parameter_list=[[traindat,testdat,1.2],[traindat,testdat,1.2]] def kernel_linear_word_modular (fm_train_word=traindat,fm_test_word=testdat,scale=1.2): from modshogun import LinearKernel, AvgDiagKernelNormalizer from modshogun import WordFeatures feats_train=WordFeatures(fm_train_word) feats_test=WordFeatures(fm_test_word) kernel=LinearKernel(feats_train, feats_train) kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return kernel if __name__=='__main__': print('LinearWord') kernel_linear_word_modular(*parameter_list[0])
# This is an example for the initialization of the local alignment kernel on # DNA sequences, where each column of the matrices of type char corresponds to # one training/test example. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list=[[traindat,testdat],[traindat,testdat]] def kernel_local_alignment_string_modular (fm_train_dna=traindat,fm_test_dna=testdat): from modshogun import StringCharFeatures, DNA from modshogun import LocalAlignmentStringKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=LocalAlignmentStringKernel(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('LocalAlignmentString') kernel_local_alignment_string_modular(*parameter_list[0])
# The LocalityImprovedString kernel is inspired by the polynomial kernel. # By comparing neighboring characters it puts emphasis on local features. # # It can be defined as # K({\bf x},{\bf x'})=\left(\sum_{i=0}^{T-1}\left(\sum_{j=-l}^{+l}w_jI_{i+j}({\bf x},{\bf x'})\right)^{d_1}\right)^{d_2}, # where # I_i({\bf x},{\bf x'})=1 if x_i=x'_i and 0 otherwise. # #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list=[[traindat,testdat,5,5,7],[traindat,testdat,5,5,7]] def kernel_locality_improved_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,length=5,inner_degree=5,outer_degree=7): from modshogun import StringCharFeatures, DNA from modshogun import LocalityImprovedStringKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=LocalityImprovedStringKernel( feats_train, feats_train, length, inner_degree, outer_degree) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('LocalityImprovedString') kernel_locality_improved_string_modular(*parameter_list[0])
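To make the formula above concrete, here is a small pure-Python sketch of the kernel for two equal-length strings, assuming uniform window weights w_j = 1 and that window positions falling outside the sequence are dropped (both the weights and the boundary handling are assumptions of this sketch):

def li_kernel(x, y, l=1, d1=2, d2=2):
    # I[i] = 1 where the i-th characters match, 0 otherwise
    I = [1.0 if a == b else 0.0 for a, b in zip(x, y)]
    T = len(I)
    total = 0.0
    for i in range(T):
        # windowed match count around position i (w_j = 1 assumed)
        s = sum(I[i + j] for j in range(-l, l + 1) if 0 <= i + j < T)
        total += s ** d1
    return total ** d2

print(li_kernel('ACGT', 'ACGA'))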
# In this example the log kernel (logarithm of the distance powered by degree plus one) is being computed for toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list=[[traindat,testdat, 2.0],[traindat,testdat, 3.0]] def kernel_log_modular (train_fname=traindat,test_fname=testdat, degree=2.0): from modshogun import RealFeatures, LogKernel, EuclideanDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=EuclideanDistance(feats_train, feats_train) kernel=LogKernel(feats_train, feats_train, degree, distance) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Log') kernel_log_modular(*parameter_list[0])
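A single entry of this kernel matrix can be checked by hand. A NumPy sketch, assuming the definition k(x,x') = -log(||x-x'||^degree + 1) (the leading minus sign, which makes the kernel conditionally positive definite, is an assumption; the description above omits it):

import numpy
x = numpy.array([1.0, 2.0])
y = numpy.array([2.0, 0.0])
degree = 2.0
d = numpy.linalg.norm(x - y)       # Euclidean distance, as in the example
k = -numpy.log(d ** degree + 1.0)  # assumed closed form of LogKernel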
# In this example the match word string kernel is being computed for toy data #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindat,testdat, 3,1.4,10,3,0,False],[ traindat,testdat, 3,1.4,10,3,0,False]] def kernel_match_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat, degree=3,scale=1.4,size_cache=10,order=3,gap=0,reverse=False): from modshogun import MatchWordStringKernel, AvgDiagKernelNormalizer from modshogun import StringWordFeatures, StringCharFeatures, DNA charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) kernel=MatchWordStringKernel(size_cache, degree) kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('MatchWordString') kernel_match_word_string_modular(*parameter_list[0])
# In this example the multiquadric kernel is being computed for toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]] def kernel_multiquadric_modular (train_fname=traindat,test_fname=testdat, shift_coef=1.0): from modshogun import RealFeatures, MultiquadricKernel, EuclideanDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=EuclideanDistance(feats_train, feats_train) kernel=MultiquadricKernel(feats_train, feats_train, shift_coef, distance) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Multiquadric') kernel_multiquadric_modular(*parameter_list[0])
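The multiquadric kernel has the closed form k(x,x') = sqrt(||x-x'||^2 + c^2), where c is the shift coefficient. A minimal NumPy check of one entry under that assumed definition:

import numpy
x = numpy.array([1.0, 2.0])
y = numpy.array([2.0, 0.0])
shift_coef = 1.0
d = numpy.linalg.norm(x - y)                 # Euclidean distance
k = numpy.sqrt(d ** 2 + shift_coef ** 2)     # assumed multiquadric closed form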
# This is an example initializing the oligo string kernel which takes distances # between matching oligos (k-mers) into account via a gaussian. Variable 'k' defines the length # of the oligo and variable 'w' the width of the gaussian. The oligo string kernel is # implemented for the DNA-alphabet 'ACGT'. # #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindat,testdat,3,1.2,10],[traindat,testdat,4,1.3,10]] def kernel_oligo_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,k=3,width=1.2,size_cache=10): from modshogun import StringCharFeatures, DNA from modshogun import OligoStringKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=OligoStringKernel(size_cache, k, width) kernel.init(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('OligoString') kernel_oligo_string_modular(*parameter_list[0])
# In this example the poly match string kernel is being computed for toy data. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindat,testdat,3,False],[traindat,testdat,4,False]] def kernel_poly_match_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=3,inhomogene=False): from modshogun import PolyMatchStringKernel from modshogun import StringCharFeatures, DNA feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=PolyMatchStringKernel(feats_train, feats_train, degree, inhomogene) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('PolyMatchString') kernel_poly_match_string_modular(*parameter_list[0])
# This is an example for the initialization of the PolyMatchString kernel on string data. # The PolyMatchString kernel sums over the matches of two strings of the same length and # takes the sum to the power of 'degree'. The strings consist of the characters 'ACGT' corresponding # to the DNA-alphabet. Each column of the matrices of type char corresponds to # one training/test example. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindat,testdat,2,True,3,0,False],[traindat,testdat,2,True,3,0,False]] def kernel_poly_match_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat, degree=2,inhomogene=True,order=3,gap=0,reverse=False): from modshogun import PolyMatchWordStringKernel from modshogun import StringWordFeatures, StringCharFeatures, DNA charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) kernel=PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('PolyMatchWordString') kernel_poly_match_word_string_modular(*parameter_list[0])
# This example initializes the polynomial kernel with real data. # If variable 'inhomogene' is 'True', +1 is added to the scalar product # before taking it to the power of 'degree'. If 'use_normalization' is # set to 'True', the kernel matrix will be normalized by the square roots # of the diagonal entries. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat,4,False,True],[traindat,testdat,5,False,True]] def kernel_poly_modular (train_fname=traindat,test_fname=testdat,degree=4,inhomogene=False, use_normalization=True): from modshogun import RealFeatures, PolyKernel, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) kernel=PolyKernel( feats_train, feats_train, degree, inhomogene, use_normalization) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Poly') kernel_poly_modular (*parameter_list[0])
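The normalization described above is the usual cosine-style normalization k'(x,x') = k(x,x') / sqrt(k(x,x) k(x',x')). A hand-rolled NumPy sketch of it for the polynomial kernel (a check of the idea, not the library call):

import numpy

def poly_k(x, y, degree=4, inhomogene=False):
    # raw polynomial kernel: (x.y + 1)^d if inhomogeneous, else (x.y)^d
    dot = x.dot(y) + (1.0 if inhomogene else 0.0)
    return dot ** degree

x = numpy.array([1.0, 2.0])
y = numpy.array([0.5, -1.0])
k_norm = poly_k(x, y) / numpy.sqrt(poly_k(x, x) * poly_k(y, y))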
# In this example the power kernel is being computed for toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list=[[traindat,testdat, 2.0],[traindat,testdat, 3.0]] def kernel_power_modular (train_fname=traindat,test_fname=testdat, degree=2.0): from modshogun import RealFeatures, PowerKernel, EuclideanDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=EuclideanDistance(feats_train, feats_train) kernel=PowerKernel(feats_train, feats_train, degree, distance) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Power') kernel_power_modular(*parameter_list[0])
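The power kernel is commonly defined as k(x,x') = -||x-x'||^degree, a conditionally positive definite kernel. One entry checked in NumPy under that assumption:

import numpy
x = numpy.array([1.0, 2.0])
y = numpy.array([2.0, 0.0])
degree = 2.0
k = -numpy.linalg.norm(x - y) ** degree   # assumed power kernel closed form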
# In this example the rational quadratic kernel is being computed for toy data. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]] def kernel_rationalquadratic_modular (train_fname=traindat,test_fname=testdat, shift_coef=1.0): from modshogun import RealFeatures, RationalQuadraticKernel, EuclideanDistance, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) distance=EuclideanDistance(feats_train, feats_train) kernel=RationalQuadraticKernel(feats_train, feats_train, shift_coef, distance) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('RationalQuadratic') kernel_rationalquadratic_modular(*parameter_list[0])
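The rational quadratic kernel is usually written k(x,x') = 1 - ||x-x'||^2 / (||x-x'||^2 + c) with shift coefficient c. A NumPy sketch of one entry under that assumed definition:

import numpy
x = numpy.array([1.0, 2.0])
y = numpy.array([2.0, 0.0])
shift_coef = 1.0
d2 = numpy.sum((x - y) ** 2)       # squared Euclidean distance
k = 1.0 - d2 / (d2 + shift_coef)   # assumed rational quadratic closed form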
# The SalzbergWordString kernel implements the Salzberg kernel. # # It is described in # # Engineering Support Vector Machine Kernels That Recognize Translation Initiation Sites # A. Zien, G.Raetsch, S. Mika, B. Schoelkopf, T. Lengauer, K.-R. Mueller # #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') label_traindat = lm.load_labels('../data/label_train_dna.dat') parameter_list = [[traindat,testdat,label_traindat,3,0,False],[traindat,testdat,label_traindat,3,0,False]] def kernel_salzberg_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat, order=3,gap=0,reverse=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels from modshogun import SalzbergWordStringKernel from modshogun import PluginEstimate charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) pie=PluginEstimate() labels=BinaryLabels(label_train_dna) pie.set_labels(labels) pie.set_features(feats_train) pie.train() kernel=SalzbergWordStringKernel(feats_train, feats_train, pie, labels) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) pie.set_features(feats_test) pie.apply().get_labels() km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('PluginEstimate w/ SalzbergWord') kernel_salzberg_word_string_modular(*parameter_list[0])
# The standard Sigmoid kernel computed on dense real valued features. #!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' parameter_list = [[traindat,testdat,10,1.2,1.3],[traindat,testdat,10,1.2,1.3]] def kernel_sigmoid_modular (train_fname=traindat,test_fname=testdat,size_cache=10,gamma=1.2,coef0=1.3): from modshogun import RealFeatures, SigmoidKernel, CSVFile feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) kernel=SigmoidKernel(feats_train, feats_train, size_cache, gamma, coef0) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Sigmoid') kernel_sigmoid_modular(*parameter_list[0])
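The sigmoid (hyperbolic tangent) kernel is k(x,x') = tanh(gamma * x·x' + coef0). One entry computed directly in NumPy:

import numpy
x = numpy.array([1.0, 2.0])
y = numpy.array([0.5, -1.0])
gamma, coef0 = 1.2, 1.3
k = numpy.tanh(gamma * x.dot(y) + coef0)   # sigmoid kernel closed form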
# The SimpleLocalityImprovedString kernel is a simplified and better-performing version of the LocalityImprovedString kernel. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindat,testdat,5,5,1],[traindat,testdat,5,3,2]] def kernel_simple_locality_improved_string_modular (fm_train_dna=traindat,fm_test_dna=testdat, length=5,inner_degree=5,outer_degree=1 ): from modshogun import StringCharFeatures, DNA from modshogun import SimpleLocalityImprovedStringKernel, MSG_DEBUG feats_train=StringCharFeatures(fm_train_dna, DNA) #feats_train.io.set_loglevel(MSG_DEBUG) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=SimpleLocalityImprovedStringKernel( feats_train, feats_train, length, inner_degree, outer_degree) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('SimpleLocalityImprovedString') kernel_simple_locality_improved_string_modular(*parameter_list[0])
# This example demonstrates how to use the Gaussian Kernel with sparse features. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list = [[traindat,testdat,1.1],[traindat,testdat,1.2]] def kernel_sparse_gaussian_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.1 ): from modshogun import SparseRealFeatures from modshogun import GaussianKernel feats_train=SparseRealFeatures(fm_train_real) feats_test=SparseRealFeatures(fm_test_real) kernel=GaussianKernel(feats_train, feats_train, width) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('SparseGaussian') kernel_sparse_gaussian_modular (*parameter_list[0])
# This example demonstrates how to use the Linear Kernel with sparse features. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list = [[traindat,testdat,1.1],[traindat,testdat,1.2]] def kernel_sparse_linear_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.1): from modshogun import SparseRealFeatures from modshogun import LinearKernel, AvgDiagKernelNormalizer feats_train=SparseRealFeatures(fm_train_real) feats_test=SparseRealFeatures(fm_test_real) kernel=LinearKernel() kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('SparseLinear') kernel_sparse_linear_modular(*parameter_list[0])
# This example shows how to use the polynomial kernel with sparse features. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list = [[traindat,testdat,10,3,True],[traindat,testdat,10,4,True]] def kernel_sparse_poly_modular (fm_train_real=traindat,fm_test_real=testdat, size_cache=10,degree=3,inhomogene=True ): from modshogun import SparseRealFeatures from modshogun import PolyKernel feats_train=SparseRealFeatures(fm_train_real) feats_test=SparseRealFeatures(fm_test_real) kernel=PolyKernel(feats_train, feats_train, size_cache, inhomogene, degree) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('SparsePoly') kernel_sparse_poly_modular(*parameter_list[0])
# In this example the spherical kernel is being computed for toy data. #!/usr/bin/env python from tools.load import LoadMatrix from numpy import where lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 5.0]] def kernel_spherical_modular (fm_train_real=traindat,fm_test_real=testdat, sigma=1.0): from modshogun import RealFeatures from modshogun import SphericalKernel from modshogun import EuclideanDistance feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) distance=EuclideanDistance(feats_train, feats_train) kernel=SphericalKernel(feats_train, feats_train, sigma, distance) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Spherical') kernel_spherical_modular(*parameter_list[0])
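The spherical kernel is commonly defined with compact support: k(x,x') = 1 - 3/2 (d/sigma) + 1/2 (d/sigma)^3 for d = ||x-x'|| < sigma, and 0 otherwise. A NumPy sketch under that assumed definition:

import numpy

def spherical_k(x, y, sigma=1.0):
    d = numpy.linalg.norm(x - y)
    if d >= sigma:
        return 0.0                       # zero beyond the support radius sigma
    r = d / sigma
    return 1.0 - 1.5 * r + 0.5 * r ** 3

print(spherical_k(numpy.array([1.0, 2.0]), numpy.array([1.2, 2.1])))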
# In this example the spline kernel is being computed for toy data. #!/usr/bin/env python from tools.load import LoadMatrix from numpy import where lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list=[[traindat,testdat],[traindat,testdat]] def kernel_spline_modular (fm_train_real=traindat,fm_test_real=testdat): from modshogun import RealFeatures from modshogun import SplineKernel feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) kernel=SplineKernel(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Spline') kernel_spline_modular(*parameter_list[0])
# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (W) 2014 Soumyajit De # #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindat,testdat,2,0.75],[traindat,testdat,3,0.75]] def kernel_ssk_string_modular (fm_train_dna=traindat, fm_test_dna=testdat, maxlen=1, decay=1): from modshogun import SubsequenceStringKernel from modshogun import StringCharFeatures, DNA feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=SubsequenceStringKernel(feats_train, feats_train, maxlen, decay) km_train=kernel.get_kernel_matrix() # print(km_train) kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() # print(km_test) return km_train,km_test,kernel if __name__=='__main__': print('SubsequenceStringKernel DNA') kernel_ssk_string_modular(*parameter_list[0]) kernel_ssk_string_modular(*parameter_list[1])
# The class TOPFeatures implements TOP kernel features obtained from # two Hidden Markov models. # # It was used in # # K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new # discriminative kernel from probabilistic models. Neural Computation, # 14:2397-2414, 2002. # # which also has the details. # # Note that TOP-features are computed on the fly, so to be effective feature # caching should be enabled. # # It inherits its functionality from CSimpleFeatures, which should be # consulted for further reference. # #!/usr/bin/env python from tools.load import LoadMatrix from numpy import where lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') label_traindat = lm.load_labels('../data/label_train_dna.dat') fm_hmm_pos=[traindat[i] for i in where([label_traindat==1])[1] ] fm_hmm_neg=[traindat[i] for i in where([label_traindat==-1])[1] ] parameter_list = [[traindat,testdat,label_traindat,1e-1,1,0,False,[1, False, True]], \ [traindat,testdat,label_traindat,1e-1,1,0,False,[1, False, True] ]] def kernel_top_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,pseudo=1e-1, order=1,gap=0,reverse=False,kargs=[1, False, True]): from modshogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA from modshogun import PolyKernel from modshogun import HMM, BW_NORMAL N=1 # toy HMM with 1 state M=4 # 4 observations -> DNA # train HMM for positive class charfeat=StringCharFeatures(fm_hmm_pos, DNA) hmm_pos_train=StringWordFeatures(charfeat.get_alphabet()) hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse) pos=HMM(hmm_pos_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) # train HMM for negative class charfeat=StringCharFeatures(fm_hmm_neg, DNA) hmm_neg_train=StringWordFeatures(charfeat.get_alphabet()) hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse) neg=HMM(hmm_neg_train, N, M, pseudo) neg.baum_welch_viterbi_train(BW_NORMAL) # Kernel training data charfeat=StringCharFeatures(fm_train_dna, DNA) wordfeats_train=StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) # Kernel testing data charfeat=StringCharFeatures(fm_test_dna, DNA) wordfeats_test=StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) # get kernel on training data pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train=TOPFeatures(10, pos, neg, False, False) kernel=PolyKernel(feats_train, feats_train, *kargs) km_train=kernel.get_kernel_matrix() # get kernel on testing data pos_clone=HMM(pos) neg_clone=HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test=TOPFeatures(10, pos_clone, neg_clone, False, False) kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print("TOP Kernel") kernel_top_modular(*parameter_list[0])
# In this example the t-Student's kernel is being computed for toy data. #!/usr/bin/env python from tools.load import LoadMatrix from numpy import where lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list=[[traindat,testdat, 2.0],[traindat,testdat, 3.0]] def kernel_tstudent_modular (fm_train_real=traindat,fm_test_real=testdat, degree=2.0): from modshogun import RealFeatures from modshogun import TStudentKernel from modshogun import EuclideanDistance feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) distance=EuclideanDistance(feats_train, feats_train) kernel=TStudentKernel(feats_train, feats_train, degree, distance) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('TStudent') kernel_tstudent_modular(*parameter_list[0])
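The T-Student kernel is usually given as k(x,x') = 1 / (1 + ||x-x'||^degree). One entry in NumPy under that assumed form:

import numpy
x = numpy.array([1.0, 2.0])
y = numpy.array([2.0, 0.0])
degree = 2.0
k = 1.0 / (1.0 + numpy.linalg.norm(x - y) ** degree)   # assumed closed form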
# In this example the wave kernel is being computed for toy data. #!/usr/bin/env python from tools.load import LoadMatrix from numpy import where lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list=[[traindat,testdat, 1.0],[traindat,testdat, 10.0]] def kernel_wave_modular (fm_train_real=traindat,fm_test_real=testdat, theta=1.0): from modshogun import RealFeatures from modshogun import WaveKernel from modshogun import EuclideanDistance feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) distance=EuclideanDistance(feats_train, feats_train) kernel=WaveKernel(feats_train, feats_train, theta, distance) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Wave') kernel_wave_modular(*parameter_list[0])
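The wave kernel is commonly defined as k(x,x') = (theta/d) sin(d/theta) with d = ||x-x'||, taking the limit value 1 at d = 0. A NumPy sketch under that assumption:

import numpy

def wave_k(x, y, theta=1.0):
    d = numpy.linalg.norm(x - y)
    if d == 0.0:
        return 1.0                          # limit of (theta/d)*sin(d/theta) as d -> 0
    return (theta / d) * numpy.sin(d / theta)

print(wave_k(numpy.array([1.0, 2.0]), numpy.array([2.0, 0.0])))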
# In this example the wavelet kernel is being computed for toy data. #!/usr/bin/env python from tools.load import LoadMatrix from numpy import where lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list=[[traindat,testdat, 1.5, 1.0],[traindat,testdat, 1.0, 1.5]] def kernel_wavelet_modular (fm_train_real=traindat,fm_test_real=testdat, dilation=1.5, translation=1.0): from modshogun import RealFeatures from modshogun import WaveletKernel feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) kernel=WaveletKernel(feats_train, feats_train, 10, dilation, translation) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('Wavelet') kernel_wavelet_modular(*parameter_list[0])
# The WeightedCommWordString kernel may be used to compute the weighted # spectrum kernel (i.e. a spectrum kernel for 1 to K-mers, where each k-mer # length is weighted by some coefficient \f$\beta_k\f$) from strings that have # been mapped into unsigned 16bit integers. # # These 16bit integers correspond to k-mers. To be applicable in this kernel they # need to be sorted (e.g. via the SortWordString pre-processor). # # It basically uses the algorithm in the unix "comm" command (hence the name) # to compute: # # k({\bf x},{\bf x'})= \sum_{k=1}^K\beta_k\Phi_k({\bf x})\cdot \Phi_k({\bf x'}) # # where \f$\Phi_k\f$ maps a sequence \f${\bf x}\f$ that consists of letters in # \f$\Sigma\f$ to a feature vector of size \f$|\Sigma|^k\f$. In this feature # vector each entry denotes how often the k-mer appears in that \f${\bf x}\f$. # # Note that this representation is especially tuned to small alphabets # (like the 2-bit alphabet DNA), for which it enables spectrum kernels # of order 8. # # For this kernel the linadd speedups are quite efficiently implemented using # direct maps. # #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindat,testdat],[traindat,testdat]] def kernel_weighted_comm_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,order=3,gap=0,reverse=True ): from modshogun import WeightedCommWordStringKernel from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import SortWordString charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() use_sign=False kernel=WeightedCommWordStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('WeightedCommWordString') kernel_weighted_comm_word_string_modular(*parameter_list[0])
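Conceptually, the (weighted) spectrum kernel is a weighted sum over k of dot products between k-mer count vectors. A pure-Python sketch for two DNA strings, with uniform weights beta_k = 1 standing in for the actual coefficients (the true weighting scheme is not reproduced here):

from collections import Counter

def spectrum_kernel(x, y, K=3):
    total = 0.0
    for k in range(1, K + 1):
        cx = Counter(x[i:i + k] for i in range(len(x) - k + 1))
        cy = Counter(y[i:i + k] for i in range(len(y) - k + 1))
        # dot product of the two k-mer count vectors (beta_k = 1 assumed)
        total += sum(cx[u] * cy[u] for u in cx)
    return total

print(spectrum_kernel('ACGTACGT', 'ACGTTTTT'))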
# The Weighted Degree Position String kernel (Weighted Degree kernel with shifts). # # The WD-shift kernel of order d compares two sequences X and # Y of length L by summing all contributions of k-mer matches of # lengths k in 1...d, weighted by coefficients beta_k # allowing for a positional tolerance of up to shift s. # #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindat,testdat,20],[traindat,testdat,22]] def kernel_weighted_degree_position_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=20): from modshogun import StringCharFeatures, DNA from modshogun import WeightedDegreePositionStringKernel, MSG_DEBUG feats_train=StringCharFeatures(fm_train_dna, DNA) #feats_train.io.set_loglevel(MSG_DEBUG) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=WeightedDegreePositionStringKernel(feats_train, feats_train, degree) from numpy import zeros,ones,float64,int32 kernel.set_shifts(10*ones(len(fm_train_dna[0]), dtype=int32)) kernel.set_position_weights(ones(len(fm_train_dna[0]), dtype=float64)) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('WeightedDegreePositionString') kernel_weighted_degree_position_string_modular(*parameter_list[0])
# This example shows how to create a Weighted Degree String Kernel from data # and how to compute the kernel matrix from the resulting object. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_dna('../data/fm_train_dna.dat') testdat = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindat,testdat,3],[traindat,testdat,20]] def kernel_weighted_degree_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=20): from modshogun import StringCharFeatures, DNA from modshogun import WeightedDegreeStringKernel, MSG_DEBUG feats_train=StringCharFeatures(fm_train_dna, DNA) #feats_train.io.set_loglevel(MSG_DEBUG) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree) from numpy import arange,double weights=arange(1,degree+1,dtype=double)[::-1]/ \ sum(arange(1,degree+1,dtype=double)) kernel.set_wd_weights(weights) #from numpy import ones,float64,int32 #kernel.set_position_weights(ones(len(fm_train_dna[0]), dtype=float64)) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() #this is how to serialize the kernel #import pickle #pickle.dump(kernel, file('tmp/kernel_obj.dump','w'), protocol=2) #k=pickle.load(file('tmp/kernel_obj.dump','r')) return km_train, km_test, kernel if __name__=='__main__': print('WeightedDegreeString') kernel_weighted_degree_string_modular(*parameter_list[0])
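The weighted degree kernel sums, over k = 1..degree, the number of positions at which the two sequences share a k-mer, each order weighted by beta_k. A pure-Python sketch using the same linearly decaying, normalized weights the example sets:

def wd_kernel(x, y, degree=3):
    # beta_k proportional to degree - k + 1, normalized to sum to one
    raw = [degree - k + 1 for k in range(1, degree + 1)]
    beta = [w / float(sum(raw)) for w in raw]
    total = 0.0
    for k in range(1, degree + 1):
        matches = sum(1 for i in range(len(x) - k + 1) if x[i:i + k] == y[i:i + k])
        total += beta[k - 1] * matches
    return total

print(wd_kernel('ACGTACGT', 'ACGTTTTT'))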
#!/usr/bin/env python parameter_list=[[]] def labels_io_modular(): from modshogun import RegressionLabels, CSVFile lab=RegressionLabels() f=CSVFile("../data/label_train_regression.dat","r") f.set_delimiter(" ") lab.load(f) #print lab.get_labels() return lab if __name__=='__main__': print('Labels IO') labels_io_modular(*parameter_list[0])
#!/usr/bin/env python from numpy import * from modshogun import * x=array([[20.0,15,15],[10,20,20]]) y=array([[21.0,21,18],[19,19,22]]) z=array([[15.0,27,18],[32,5,23]]) parameter_list = [[x,concatenate((x,y,z),1)]] def library_fisher2x3_modular (table, tables): pval=Statistics_fishers_exact_test_for_2x3_table(table) pvals=Statistics_fishers_exact_test_for_multiple_2x3_tables(tables) return (pval,pvals) if __name__=='__main__': print('Fisher 2x3') library_fisher2x3_modular(*parameter_list[0])
#!/usr/bin/env python import time from modshogun import Time parameter_list = [[5],[1.0]] def library_time (sleep_secs): # measure wall clock time difference t=Time() time.sleep(sleep_secs) diff=t.cur_time_diff() # measure CPU time required cpu_diff=t.cur_runtime_diff_sec() # wall clock time should be above sleep_secs # but cpu time should be tiny #print diff, cpu_diff return diff>sleep_secs, cpu_diff<0.5 if __name__=='__main__': print('Time') library_time(*parameter_list[0])
#!/usr/bin/env python import numpy from scipy.io import mmread # Loading an example sparse matrix of dimension 479x479, real, unsymmetric mtx=mmread('../../../data/logdet/west0479.mtx') parameter_list=[[mtx,6000,10]] def mathematics_linsolver_cg (matrix=mtx,max_iter=1000,seed=10): # Create a Hermitian sparse matrix from scipy.sparse import eye rows=matrix.shape[0] cols=matrix.shape[1] A=matrix.transpose()*matrix+eye(rows, cols) # Create a random vector (b) of the system Ax=b numpy.random.seed(seed) b=numpy.array(numpy.random.randn(rows)) # create linear system with linear operator and vector from scipy.sparse import csc_matrix try: from shogun.Mathematics import RealSparseMatrixOperator from shogun.Mathematics import ConjugateGradientSolver op=RealSparseMatrixOperator(A.tocsc()) solver=ConjugateGradientSolver() # set the iteration limit higher for poorly conditioned matrices solver.set_iteration_limit(max_iter) x=solver.solve(op, b) # verifying the solution via direct solving from scipy.sparse.linalg import spsolve, eigsh y=spsolve(A,b) print(numpy.linalg.norm(x-y)) return x except ImportError: print('Shogun not installed with Eigen3!') if __name__=='__main__': print('CG') mathematics_linsolver_cg (*parameter_list[0])
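The same kind of system can be cross-checked with SciPy's own conjugate gradient solver. A minimal, self-contained sketch on a randomly generated positive definite matrix (the matrix here is a hypothetical stand-in for west0479.mtx):

import numpy
from scipy.sparse import random as sparse_random, eye
from scipy.sparse.linalg import cg

numpy.random.seed(10)
M = sparse_random(100, 100, density=0.05, format='csc')
A = M.T * M + eye(100, 100)            # Hermitian positive definite, as above
b = numpy.random.randn(100)
x, info = cg(A, b)                     # info == 0 signals successful convergence
print(info, numpy.linalg.norm(A.dot(x) - b))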
#!/usr/bin/env python from numpy import * from scipy.io import mmread # Loading an example sparse matrix of dimension 479x479, real, unsymmetric mtx=mmread('../../../data/logdet/west0479.mtx') parameter_list=[[mtx,100,60,1]] def mathematics_logdet (matrix=mtx,max_iter_eig=1000,max_iter_lin=1000,num_samples=1): from scipy.sparse import eye # Create a Hermitian sparse matrix rows=matrix.shape[0] cols=matrix.shape[1] A=matrix.transpose()*matrix+eye(rows, cols) from scipy.sparse import csc_matrix try: from shogun.Mathematics import RealSparseMatrixOperator from shogun.Mathematics import LanczosEigenSolver from shogun.Mathematics import CGMShiftedFamilySolver from shogun.Mathematics import LogRationalApproximationCGM from shogun.Mathematics import ProbingSampler from shogun.Mathematics import LogDetEstimator from shogun.Mathematics import Statistics from shogun.Library import SerialComputationEngine # creating the linear operator, eigen-solver op=RealSparseMatrixOperator(A.tocsc()) eig_solver=LanczosEigenSolver(op) # we can set the iteration limit high for poorly conditioned matrices eig_solver.set_max_iteration_limit(max_iter_eig) # alternatively, if the matrix is small, we can compute eigenvalues externally # and set min/max eigenvalues into the eigensolver # from scipy.sparse.linalg import eigsh # eigenvalues=eigsh(A, rows-1) # eig_solver.set_min_eigenvalue(eigenvalues[0][0]) # eig_solver.set_max_eigenvalue(eigenvalues[0][-1]) # create the shifted-family linear solver which solves for all the shifts # using as many matrix-vector products as one shift in CG iterations lin_solver=CGMShiftedFamilySolver() lin_solver.set_iteration_limit(max_iter_lin) # computation engine engine=SerialComputationEngine() # set the desired accuracy tighter to obtain better results # this determines the number of contour points in conformal mapping of # the rational approximation of the Cauchy's integral of f(A)*s, f=log desired_accuracy=1E-5 # creating the log-linear-operator function op_func=LogRationalApproximationCGM(op, engine, eig_solver, lin_solver,\ desired_accuracy) # set the trace sampler to be probing sampler, in which samples are obtained # by greedy graph coloring of the power of sparse matrix (default is power=1, # 2-distance coloring) trace_sampler=ProbingSampler(op) # estimating log-det log_det_estimator=LogDetEstimator(trace_sampler, op_func, engine) # set the number of samples as required estimates=log_det_estimator.sample(num_samples) estimated_logdet=sum(estimates)/len(estimates) actual_logdet=Statistics.log_det(A) print(actual_logdet, estimated_logdet) return estimates except ImportError: print('One or many of the dependencies (Eigen3/LaPack/ColPack) not found!') if __name__=='__main__': print('LogDetEstimator') mathematics_logdet (*parameter_list[0])
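For matrices small enough to densify, the log-determinant being estimated can be computed exactly with NumPy as a sanity check; slogdet returns the sign and the log of the absolute determinant (again on a hypothetical stand-in matrix):

import numpy
from scipy.sparse import random as sparse_random, eye

M = sparse_random(50, 50, density=0.1, format='csc', random_state=1)
A = M.T * M + eye(50, 50)                  # Hermitian positive definite
sign, logdet = numpy.linalg.slogdet(A.toarray())
print(sign, logdet)                        # sign should be +1.0 for a PD matrix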
#!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() data = lm.load_numbers('../data/fm_train_real.dat') parameter_list = [[data,0.0],[data,1.0]] def mathematics_sparseinversecovariance_modular (data,lc): try: from modshogun import SparseInverseCovariance except ImportError: print("SparseInverseCovariance not available") exit(0) from numpy import dot sic = SparseInverseCovariance() S = dot(data,data.T) Si = sic.estimate(S,lc) return Si if __name__=='__main__': print('SparseInverseCovariance') mathematics_sparseinversecovariance_modular(*parameter_list[0])
#!/usr/bin/env python traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' label_traindat = '../data/label_train_multiclass.dat' parameter_list = [[traindat,testdat,label_traindat,3]] def metric_lmnn_modular(train_fname=traindat,test_fname=testdat,label_train_fname=label_traindat,k=3): try: from modshogun import RealFeatures,MulticlassLabels,LMNN,KNN,CSVFile except ImportError: return # wrap features and labels into Shogun objects feats_train=RealFeatures(CSVFile(train_fname)) feats_test=RealFeatures(CSVFile(test_fname)) labels=MulticlassLabels(CSVFile(label_train_fname)) # LMNN lmnn=LMNN(feats_train,labels,k) lmnn.train() lmnn_distance=lmnn.get_distance() # perform classification with KNN knn=KNN(k,lmnn_distance,labels) knn.train() output=knn.apply(feats_test).get_labels() return lmnn,output if __name__=='__main__': print('LMNN') metric_lmnn_modular(*parameter_list[0])
# In this example we show how to perform Multiple Kernel Learning (MKL) # with the modular interface. First, we create a number of base kernels. # These kernels can capture different views of the same features, or actually # consider entirely different features associated with the same example # (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample). # The base kernels are then subsequently added to a CombinedKernel, which # contains a weight for each kernel and encapsulates the base kernels # from the training procedure. When the CombinedKernel between two examples is # evaluated it computes the corresponding linear combination of kernels according to their weights. # We then show how to create an MKLClassifier that trains an SVM and learns the optimal # weighting of kernels (w.r.t. a given norm q) at the same time. # Finally, the example shows how to classify with a trained MKLClassifier. # #!/usr/bin/env python from modshogun import CombinedFeatures, RealFeatures, BinaryLabels from modshogun import CombinedKernel, PolyKernel, CustomKernel from modshogun import MKLClassification from tools.load import LoadMatrix lm=LoadMatrix() #only run example if SVMLight is included as LibSVM solver crashes in MKLClassification try: from modshogun import SVMLight except ImportError: print("SVMLight not available") exit(0) traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') label_traindat = lm.load_labels('../data/label_train_twoclass.dat') parameter_list = [[traindat,testdat,label_traindat],[traindat,testdat,label_traindat]] # fm_train_real.shape # fm_test_real.shape # combined_custom() def mkl_binclass_modular (fm_train_real=traindat,fm_test_real=testdat,fm_label_twoclass = label_traindat): ################################## # set up and train # create some poly train/test matrix tfeats = RealFeatures(fm_train_real) tkernel = PolyKernel(10,3) tkernel.init(tfeats, tfeats) K_train = tkernel.get_kernel_matrix() pfeats = RealFeatures(fm_test_real) tkernel.init(tfeats, pfeats) K_test = tkernel.get_kernel_matrix() # create combined train features feats_train = CombinedFeatures() feats_train.append_feature_obj(RealFeatures(fm_train_real)) # and corresponding combined kernel kernel = CombinedKernel() kernel.append_kernel(CustomKernel(K_train)) kernel.append_kernel(PolyKernel(10,2)) kernel.init(feats_train, feats_train) # train mkl labels = BinaryLabels(fm_label_twoclass) mkl = MKLClassification() # which norm to use for MKL mkl.set_mkl_norm(1) #2,3 # set cost (neg, pos) mkl.set_C(1, 1) # set kernel and labels mkl.set_kernel(kernel) mkl.set_labels(labels) # train mkl.train() #w=kernel.get_subkernel_weights() #kernel.set_subkernel_weights(w) ################################## # test # create combined test features feats_pred = CombinedFeatures() feats_pred.append_feature_obj(RealFeatures(fm_test_real)) # and corresponding combined kernel kernel = CombinedKernel() kernel.append_kernel(CustomKernel(K_test)) kernel.append_kernel(PolyKernel(10, 2)) kernel.init(feats_train, feats_pred) # and classify mkl.set_kernel(kernel) mkl.apply() return mkl.apply(),kernel if __name__=='__main__': mkl_binclass_modular (*parameter_list[0])
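After training, the learned kernel weighting can be read back from the CombinedKernel (the commented lines in the example hint at this). A short sketch of what one would add after mkl.train() above:

# after mkl.train() in the example above:
w = kernel.get_subkernel_weights()   # one weight per appended base kernel
print(w)                             # with mkl_norm=1, weights are typically sparse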
# In this example we show how to perform Multiple Kernel Learning (MKL) # with the modular interface for multi-class classification. # First, we create a number of base kernels and features. # These kernels can capture different views of the same features, or actually # consider entirely different features associated with the same example # (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample). # The base kernels are then subsequently added to a CombinedKernel, which # contains a weight for each kernel and encapsulates the base kernels # from the training procedure. When the CombinedKernel between two examples is # evaluated it computes the corresponding linear combination of kernels according to their weights. # We then show how to create an MKLMultiClass classifier that trains an SVM and learns the optimal # weighting of kernels (w.r.t. a given norm q) at the same time. The main difference to the binary # classification version of MKL is that we can use more than two values as labels, when training # the classifier. # Finally, the example shows how to classify with a trained MKLMultiClass classifier. # #!/usr/bin/env python from tools.load import LoadMatrix lm = LoadMatrix() fm_train_real = lm.load_numbers('../data/fm_train_real.dat') fm_test_real = lm.load_numbers('../data/fm_test_real.dat') label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat') parameter_list=[ [ fm_train_real, fm_test_real, label_train_multiclass, 1.2, 1.2, 1e-5, 1, 0.001, 1.5], [ fm_train_real, fm_test_real, label_train_multiclass, 5, 1.2, 1e-2, 1, 0.001, 2]] def mkl_multiclass_modular (fm_train_real, fm_test_real, label_train_multiclass, width, C, epsilon, num_threads, mkl_epsilon, mkl_norm): from modshogun import CombinedFeatures, RealFeatures, MulticlassLabels from modshogun import CombinedKernel, GaussianKernel, LinearKernel,PolyKernel from modshogun import MKLMulticlass kernel = CombinedKernel() feats_train = CombinedFeatures() feats_test = CombinedFeatures() subkfeats_train = RealFeatures(fm_train_real) subkfeats_test = RealFeatures(fm_test_real) subkernel = GaussianKernel(10, width) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) subkfeats_train = RealFeatures(fm_train_real) subkfeats_test = RealFeatures(fm_test_real) subkernel = LinearKernel() feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) subkfeats_train = RealFeatures(fm_train_real) subkfeats_test = RealFeatures(fm_test_real) subkernel = PolyKernel(10,2) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) kernel.init(feats_train, feats_train) labels = MulticlassLabels(label_train_multiclass) mkl = MKLMulticlass(C, kernel, labels) mkl.set_epsilon(epsilon); mkl.parallel.set_num_threads(num_threads) mkl.set_mkl_epsilon(mkl_epsilon) mkl.set_mkl_norm(mkl_norm) mkl.train() kernel.init(feats_train, feats_test) out = mkl.apply().get_labels() return out if __name__ == '__main__': print('mkl_multiclass') mkl_multiclass_modular(*parameter_list[0])
#!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (C) 2012-2013 Heiko Strathmann # from numpy import array from numpy import random import math from modshogun import CrossValidation, CrossValidationResult from modshogun import ContingencyTableEvaluation, ACCURACY from modshogun import StratifiedCrossValidationSplitting from modshogun import BinaryLabels from modshogun import RealFeatures from modshogun import GaussianKernel, PowerKernel from modshogun import LibSVM from modshogun import MinkowskiMetric from modshogun import GridSearchModelSelection from modshogun import ModelSelectionParameters, R_EXP, R_LINEAR from modshogun import ParameterCombination from modshogun import Math def create_param_tree(): root=ModelSelectionParameters() c1=ModelSelectionParameters("C1") root.append_child(c1) c1.build_values(-1.0, 1.0, R_EXP) c2=ModelSelectionParameters("C2") root.append_child(c2) c2.build_values(-1.0, 1.0, R_EXP) gaussian_kernel=GaussianKernel() # print all parameters available for model selection # Don't worry if yours is not included, simply write to the mailing list #gaussian_kernel.print_modsel_params() param_gaussian_kernel=ModelSelectionParameters("kernel", gaussian_kernel) gaussian_kernel_width=ModelSelectionParameters("log_width") gaussian_kernel_width.build_values(-math.log(2.0), 0.0, R_EXP, 1.0, 2.0) param_gaussian_kernel.append_child(gaussian_kernel_width) root.append_child(param_gaussian_kernel) power_kernel=PowerKernel() # print all parameters available for model selection # Don't worry if yours is not included, simply write to the mailing list #power_kernel.print_modsel_params() param_power_kernel=ModelSelectionParameters("kernel", power_kernel) root.append_child(param_power_kernel) param_power_kernel_degree=ModelSelectionParameters("degree") param_power_kernel_degree.build_values(1.0, 2.0, R_LINEAR) param_power_kernel.append_child(param_power_kernel_degree) metric=MinkowskiMetric(10) # print all parameters available for model selection # Don't worry if yours is not included, simply write to the mailing list #metric.print_modsel_params() param_power_kernel_metric1=ModelSelectionParameters("distance", metric) param_power_kernel.append_child(param_power_kernel_metric1) param_power_kernel_metric1_k=ModelSelectionParameters("k") param_power_kernel_metric1_k.build_values(1.0, 2.0, R_LINEAR) param_power_kernel_metric1.append_child(param_power_kernel_metric1_k) return root parameter_list = [[3,20,3]] def modelselection_grid_search_kernel (num_subsets, num_vectors, dim_vectors): # init seed for reproducibility Math.init_random(1) random.seed(1); # create some (non-sense) data matrix=random.rand(dim_vectors, num_vectors) # create num_vectors vectors of dimension dim_vectors features=RealFeatures() features.set_feature_matrix(matrix) # create labels, two classes labels=BinaryLabels(num_vectors) for i in range(num_vectors): labels.set_label(i, 1 if i%2==0 else -1) # create svm classifier=LibSVM() # splitting strategy splitting_strategy=StratifiedCrossValidationSplitting(labels, num_subsets) # accuracy evaluation evaluation_criterion=ContingencyTableEvaluation(ACCURACY) # cross validation class for evaluation in model selection cross=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterion) cross.set_num_runs(1) # print all parameters available for model selection # Don't worry if yours is not included, simply write to the mailing list #classifier.print_modsel_params() # model parameter selection param_tree=create_param_tree() #param_tree.print_tree() grid_search=GridSearchModelSelection(cross, param_tree) print_state=False best_combination=grid_search.select_model(print_state) #print("best parameter(s):") #best_combination.print_tree() best_combination.apply_to_machine(classifier) # larger number of runs to have less variance cross.set_num_runs(10) result=cross.evaluate() casted=CrossValidationResult.obtain_from_generic(result); #print "result mean:", casted.mean return classifier,result,casted.mean if __name__=='__main__': print('ModelselectionGridSearchKernel') modelselection_grid_search_kernel(*parameter_list[0])
#!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (W) 2012 Heiko Strathmann # Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society # from numpy import array from numpy.random import seed, rand from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') label_traindat = lm.load_labels('../data/label_train_twoclass.dat') parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2], \ [traindat,testdat,label_traindat,2.1,1,1e-5,1e-2]] def modelselection_grid_search_krr_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat,\ width=2.1,C=1,epsilon=1e-5,tube_epsilon=1e-2): from modshogun import CrossValidation, CrossValidationResult from modshogun import MeanSquaredError from modshogun import CrossValidationSplitting from modshogun import RegressionLabels from modshogun import RealFeatures from modshogun import KernelRidgeRegression from modshogun import GridSearchModelSelection from modshogun import ModelSelectionParameters # training data features_train=RealFeatures(fm_train) features_test=RealFeatures(fm_test) # labels labels=RegressionLabels(label_train) # predictor, set tau=0 here, doesn't matter predictor=KernelRidgeRegression() # splitting strategy for 5-fold cross-validation (for classification it's better # to use "StratifiedCrossValidationSplitting", but the standard # "CrossValidationSplitting" is also available) splitting_strategy=CrossValidationSplitting(labels, 5) # evaluation method evaluation_criterium=MeanSquaredError() # cross-validation instance cross_validation=CrossValidation(predictor, features_train, labels, splitting_strategy, evaluation_criterium) # (optional) repeat x-val (set larger to get better estimates) cross_validation.set_num_runs(2) # print all parameters available for model selection # Don't worry if yours is not included, just write to the mailing list #predictor.print_modsel_params() # build parameter tree to select regularization parameter param_tree_root=create_param_tree() # model selection instance model_selection=GridSearchModelSelection(cross_validation, param_tree_root) # perform model selection with selected methods #print "performing model selection of" #print "parameter tree:" #param_tree_root.print_tree() #print "starting model selection" # print the current parameter combination, if no parameter nothing is printed print_state=False best_parameters=model_selection.select_model(print_state) # print best parameters #print "best parameters:" #best_parameters.print_tree() # apply them and print result best_parameters.apply_to_machine(predictor) result=cross_validation.evaluate() #print "mean:", result.mean # creates all the parameters to optimize def create_param_tree(): from modshogun import ModelSelectionParameters, R_EXP, R_LINEAR from modshogun import ParameterCombination from modshogun import GaussianKernel, PolyKernel import math root=ModelSelectionParameters() tau=ModelSelectionParameters("tau") root.append_child(tau) # also R_LINEAR/R_LOG is available as type min=-1 max=1 type=R_EXP step=1.5 base=2 tau.build_values(min, max, type, step, base) # gaussian kernel with width gaussian_kernel=GaussianKernel() # print all parameters available for model selection # Don't worry if yours is not included, just write to the mailing list #gaussian_kernel.print_modsel_params() param_gaussian_kernel=ModelSelectionParameters("kernel", gaussian_kernel) gaussian_kernel_width=ModelSelectionParameters("log_width"); gaussian_kernel_width.build_values(2.0*math.log(2.0), 2.5*math.log(2.0), R_LINEAR, 1.0) param_gaussian_kernel.append_child(gaussian_kernel_width) root.append_child(param_gaussian_kernel) # polynomial kernel with degree poly_kernel=PolyKernel() # print all parameters available for model selection # Don't worry if yours is not included, just write to the mailing list #poly_kernel.print_modsel_params() param_poly_kernel=ModelSelectionParameters("kernel", poly_kernel) root.append_child(param_poly_kernel) # note that integers are used here param_poly_kernel_degree=ModelSelectionParameters("degree") param_poly_kernel_degree.build_values(1, 2, R_LINEAR) param_poly_kernel.append_child(param_poly_kernel_degree) return root if __name__=='__main__': print('ModelselectionGridSearchKRR') modelselection_grid_search_krr_modular(*parameter_list[0])
#!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (W) 2011 Heiko Strathmann # Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society # from numpy.random import randn from numpy import * # generate some overlapping training vectors num_vectors=100 vec_distance=1 traindat=concatenate((randn(2,num_vectors)-vec_distance, randn(2,num_vectors)+vec_distance), axis=1) label_traindat=concatenate((-ones(num_vectors), ones(num_vectors))); parameter_list = [[traindat,label_traindat]] def modelselection_grid_search_liblinear_modular (traindat=traindat, label_traindat=label_traindat): from modshogun import CrossValidation, CrossValidationResult from modshogun import ContingencyTableEvaluation, ACCURACY from modshogun import StratifiedCrossValidationSplitting from modshogun import GridSearchModelSelection from modshogun import ModelSelectionParameters, R_EXP from modshogun import ParameterCombination from modshogun import BinaryLabels from modshogun import RealFeatures from modshogun import LibLinear, L2R_L2LOSS_SVC # build parameter tree to select C1 and C2 param_tree_root=ModelSelectionParameters() c1=ModelSelectionParameters("C1"); param_tree_root.append_child(c1) c1.build_values(-1.0, 0.0, R_EXP); c2=ModelSelectionParameters("C2"); param_tree_root.append_child(c2); c2.build_values(-1.0, 0.0, R_EXP); # training data features=RealFeatures(traindat) labels=BinaryLabels(label_traindat) # classifier classifier=LibLinear(L2R_L2LOSS_SVC) # print all parameter available for modelselection # Dont worry if yours is not included but, write to the mailing list #classifier.print_modsel_params() # splitting strategy for cross-validation splitting_strategy=StratifiedCrossValidationSplitting(labels, 10) # evaluation method evaluation_criterium=ContingencyTableEvaluation(ACCURACY) # cross-validation instance cross_validation=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterium) cross_validation.set_autolock(False) # model selection instance model_selection=GridSearchModelSelection(cross_validation, param_tree_root) # perform model selection with selected methods #print "performing model selection of" #param_tree_root.print_tree() best_parameters=model_selection.select_model() # print best parameters #print "best parameters:" #best_parameters.print_tree() # apply them and print result best_parameters.apply_to_machine(classifier) result=cross_validation.evaluate() #result.print_result() if __name__=='__main__': print('ModelSelectionGridSearchLibLinear') modelselection_grid_search_liblinear_modular(*parameter_list[0])
#!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (W) 2012 Heiko Strathmann # Copyright (C) 2012 Berlin Institute of Technology and Max-Planck-Society # from numpy import array from numpy.random import seed, rand from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') label_traindat = lm.load_labels('../data/label_train_twoclass.dat') parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5,1e-2], \ [traindat,testdat,label_traindat,2.1,1,1e-5,1e-2]] def modelselection_grid_search_libsvr_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat,\ width=2.1,C=1,epsilon=1e-5,tube_epsilon=1e-2): from modshogun import CrossValidation, CrossValidationResult from modshogun import MeanSquaredError from modshogun import CrossValidationSplitting from modshogun import RegressionLabels from modshogun import RealFeatures from modshogun import GaussianKernel from modshogun import LibSVR from modshogun import GridSearchModelSelection from modshogun import ModelSelectionParameters, R_EXP from modshogun import ParameterCombination # training data features_train=RealFeatures(traindat) labels=RegressionLabels(label_traindat) # kernel kernel=GaussianKernel(features_train, features_train, width) # print all parameter available for modelselection # Dont worry if yours is not included but, write to the mailing list #kernel.print_modsel_params() labels=RegressionLabels(label_train) # predictor predictor=LibSVR(C, tube_epsilon, kernel, labels) predictor.set_epsilon(epsilon) # splitting strategy for 5 fold cross-validation (for classification its better # to use "StratifiedCrossValidation", but the standard # "StratifiedCrossValidationSplitting" is also available splitting_strategy=CrossValidationSplitting(labels, 5) # evaluation method evaluation_criterium=MeanSquaredError() # cross-validation instance cross_validation=CrossValidation(predictor, features_train, labels, splitting_strategy, evaluation_criterium) # (optional) repeat x-val (set larger to get better estimates) cross_validation.set_num_runs(2) # print all parameter available for modelselection # Dont worry if yours is not included but, write to the mailing list #predictor.print_modsel_params() # build parameter tree to select C1 and C2 param_tree_root=ModelSelectionParameters() c1=ModelSelectionParameters("C1"); param_tree_root.append_child(c1) c1.build_values(-1.0, 0.0, R_EXP); c2=ModelSelectionParameters("C2"); param_tree_root.append_child(c2); c2.build_values(-1.0, 0.0, R_EXP); # model selection instance model_selection=GridSearchModelSelection(cross_validation, param_tree_root) # perform model selection with selected methods #print "performing model selection of" #print "parameter tree" #param_tree_root.print_tree() #print "starting model selection" # print the current parameter combination, if no parameter nothing is printed print_state=False # lock data before since model selection will not change the kernel matrix # (use with care) This avoids that the kernel matrix is recomputed in every # iteration of the model search predictor.data_lock(labels, features_train) best_parameters=model_selection.select_model(print_state) # print best parameters #print "best parameters:" #best_parameters.print_tree() 
# apply them and print result best_parameters.apply_to_machine(predictor) result=cross_validation.evaluate() #print "mean:", result.mean if __name__=='__main__': print('ModelselectionGridSearchLibSVR') modelselection_grid_search_libsvr_modular(*parameter_list[0])
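As an aside, the R_EXP ranges above expand into exponentially spaced candidate values. The following pure-NumPy sketch shows the same grid-search idea outside of Shogun; it assumes a base-2 exponential grid and uses a stand-in scoring function, so all names here are illustrative only.
import numpy as np

def exp_grid(lo, hi, base=2.0):
    # exponentially spaced candidates, e.g. lo=-1, hi=0 -> [0.5, 1.0]
    return base ** np.arange(lo, hi + 1)

def grid_search(score, grid1, grid2):
    # exhaustively evaluate every (C1, C2) combination and keep the best
    best, best_params = -np.inf, None
    for c1 in grid1:
        for c2 in grid2:
            s = score(c1, c2)
            if s > best:
                best, best_params = s, (c1, c2)
    return best_params, best

if __name__ == '__main__':
    # toy score with a known optimum at C1=C2=1
    score = lambda c1, c2: -((c1 - 1.0)**2 + (c2 - 1.0)**2)
    print(grid_search(score, exp_grid(-1, 0), exp_grid(-1, 0)))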
# In this example a complex model parameter selection tree # is constructed #!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (W) 2011-2012 Heiko Strathmann # Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society # parameter_list=[[None]] def modelselection_parameter_tree_modular (dummy): from modshogun import ParameterCombination from modshogun import ModelSelectionParameters, R_EXP, R_LINEAR from modshogun import PowerKernel from modshogun import GaussianKernel from modshogun import DistantSegmentsKernel from modshogun import MinkowskiMetric import math root=ModelSelectionParameters() combinations=root.get_combinations() combinations.get_num_elements() c=ModelSelectionParameters('C'); root.append_child(c) c.build_values(1, 11, R_EXP) power_kernel=PowerKernel() # print all parameters available for model selection # Don't worry if yours is not included, but write to the mailing list #power_kernel.print_modsel_params() param_power_kernel=ModelSelectionParameters('kernel', power_kernel) root.append_child(param_power_kernel) param_power_kernel_degree=ModelSelectionParameters('degree') param_power_kernel_degree.build_values(1, 1, R_EXP) param_power_kernel.append_child(param_power_kernel_degree) metric1=MinkowskiMetric(10) # print all parameters available for model selection # Don't worry if yours is not included, but write to the mailing list #metric1.print_modsel_params() param_power_kernel_metric1=ModelSelectionParameters('distance', metric1) param_power_kernel.append_child(param_power_kernel_metric1) param_power_kernel_metric1_k=ModelSelectionParameters('k') param_power_kernel_metric1_k.build_values(1, 12, R_LINEAR) param_power_kernel_metric1.append_child(param_power_kernel_metric1_k) gaussian_kernel=GaussianKernel() # print all parameters available for model selection # Don't worry if yours is not included, but write to the mailing list #gaussian_kernel.print_modsel_params() param_gaussian_kernel=ModelSelectionParameters('kernel', gaussian_kernel) root.append_child(param_gaussian_kernel) param_gaussian_kernel_width=ModelSelectionParameters('log_width') param_gaussian_kernel_width.build_values(0.0, 0.5*math.log(2.0), R_LINEAR) param_gaussian_kernel.append_child(param_gaussian_kernel_width) ds_kernel=DistantSegmentsKernel() # print all parameters available for model selection # Don't worry if yours is not included, but write to the mailing list #ds_kernel.print_modsel_params() param_ds_kernel=ModelSelectionParameters('kernel', ds_kernel) root.append_child(param_ds_kernel) param_ds_kernel_delta=ModelSelectionParameters('delta') param_ds_kernel_delta.build_values(1, 2, R_EXP) param_ds_kernel.append_child(param_ds_kernel_delta) param_ds_kernel_theta=ModelSelectionParameters('theta') param_ds_kernel_theta.build_values(1, 2, R_EXP) param_ds_kernel.append_child(param_ds_kernel_theta) # root.print_tree() combinations=root.get_combinations() # for i in range(combinations.get_num_elements()): # params = ParameterCombination.obtain_from_generic(combinations.get_element(i)) # params.print_tree() return if __name__=='__main__': print('ModelSelection ParameterTree') modelselection_parameter_tree_modular(*parameter_list[0])
#!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Copyright (C) 2012 Sergey Lisitsyn from numpy import * from numpy.random import randn # generate some overlapping training vectors num_vectors=100 vec_distance=1 traindat=concatenate((randn(2,num_vectors)-vec_distance, randn(2,num_vectors)+vec_distance), axis=1) label_traindat=concatenate((-ones(num_vectors), ones(num_vectors))); parameter_list = [[traindat,label_traindat]] def modelselection_random_search_liblinear_modular (traindat=traindat, label_traindat=label_traindat): from modshogun import CrossValidation, CrossValidationResult from modshogun import ContingencyTableEvaluation, ACCURACY from modshogun import StratifiedCrossValidationSplitting from modshogun import RandomSearchModelSelection from modshogun import ModelSelectionParameters, R_EXP from modshogun import ParameterCombination from modshogun import BinaryLabels from modshogun import RealFeatures from modshogun import LibLinear, L2R_L2LOSS_SVC # build parameter tree to select C1 and C2 param_tree_root=ModelSelectionParameters() c1=ModelSelectionParameters("C1"); param_tree_root.append_child(c1) c1.build_values(-2.0, 2.0, R_EXP); c2=ModelSelectionParameters("C2"); param_tree_root.append_child(c2); c2.build_values(-2.0, 2.0, R_EXP); # training data features=RealFeatures(traindat) labels=BinaryLabels(label_traindat) # classifier classifier=LibLinear(L2R_L2LOSS_SVC) # print all parameters available for model selection # Don't worry if yours is not included, but write to the mailing list #classifier.print_modsel_params() # splitting strategy for cross-validation splitting_strategy=StratifiedCrossValidationSplitting(labels, 10) # evaluation method evaluation_criterium=ContingencyTableEvaluation(ACCURACY) # cross-validation instance cross_validation=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterium) cross_validation.set_autolock(False) # model selection instance model_selection=RandomSearchModelSelection(cross_validation, param_tree_root, 0.5) # perform model selection with selected methods #print "performing model selection of" #param_tree_root.print_tree() best_parameters=model_selection.select_model() # print best parameters #print "best parameters:" #best_parameters.print_tree() # apply them and print result best_parameters.apply_to_machine(classifier) result=cross_validation.evaluate() #result.print_result() if __name__=='__main__': print('ModelSelectionRandomSearchLibLinear') modelselection_random_search_liblinear_modular(*parameter_list[0])
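The third constructor argument of RandomSearchModelSelection above (0.5) is the ratio of parameter combinations that are actually evaluated. A minimal NumPy sketch of that idea (illustrative only, not Shogun's implementation):
import numpy as np

def random_search(score, grid, ratio=0.5, seed=0):
    # evaluate only a random fraction of the grid and keep the best candidate
    rng = np.random.RandomState(seed)
    n = max(1, int(len(grid) * ratio))
    chosen = rng.permutation(len(grid))[:n]
    return max((score(*grid[i]), grid[i]) for i in chosen)

if __name__ == '__main__':
    grid = [(c1, c2) for c1 in 2.0**np.arange(-2, 3) for c2 in 2.0**np.arange(-2, 3)]
    print(random_search(lambda c1, c2: -((c1 - 1.0)**2 + (c2 - 1.0)**2), grid))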
#!/usr/bin/env python from numpy import array traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' label_traindat = '../data/label_train_multiclass.dat' # set both input attributes as not nominal (i.e. continuous) feattypes = array([False, False]) parameter_list = [[traindat,testdat,label_traindat,feattypes]] def multiclass_c45classifiertree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes): try: from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree from numpy import random, int32 except ImportError: print("Could not import Shogun and/or numpy modules") return # wrap features and labels into Shogun objects feats_train=RealFeatures(CSVFile(train)) feats_test=RealFeatures(CSVFile(test)) train_labels=MulticlassLabels(CSVFile(labels)) # divide train dataset into training and validation subsets in the ratio 2/3 to 1/3 # (integer division keeps the indices valid under Python 3) subset=int32(random.permutation(feats_train.get_num_vectors())) vsubset=subset[0:subset.size//3] trsubset=subset[subset.size//3:subset.size] # C4.5 Tree formation using training subset train_labels.add_subset(trsubset) feats_train.add_subset(trsubset) c=C45ClassifierTree() c.set_labels(train_labels) c.set_feature_types(ft) c.train(feats_train) train_labels.remove_subset() feats_train.remove_subset() # prune tree using validation subset train_labels.add_subset(vsubset) feats_train.add_subset(vsubset) c.prune_tree(feats_train,train_labels) train_labels.remove_subset() feats_train.remove_subset() # Classify test data output=c.apply_multiclass(feats_test).get_labels() output_certainty=c.get_certainty_vector() return c,output,output_certainty if __name__=='__main__': print('C45ClassifierTree') multiclass_c45classifiertree_modular(*parameter_list[0])
#!/usr/bin/env python from numpy import array traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' label_traindat = '../data/label_train_multiclass.dat' # set both input attributes as not nominal (i.e. continuous) feattypes = array([False, False]) parameter_list = [[traindat,testdat,label_traindat,feattypes]] def multiclass_cartree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes): try: from modshogun import RealFeatures, MulticlassLabels, CSVFile, CARTree, PT_MULTICLASS except ImportError: print("Could not import Shogun modules") return # wrap features and labels into Shogun objects feats_train=RealFeatures(CSVFile(train)) feats_test=RealFeatures(CSVFile(test)) train_labels=MulticlassLabels(CSVFile(labels)) # CART Tree formation with 5-fold cross-validation pruning c=CARTree(ft,PT_MULTICLASS,5,True) c.set_labels(train_labels) c.train(feats_train) # Classify test data output=c.apply_multiclass(feats_test).get_labels() return c,output if __name__=='__main__': print('CARTree') multiclass_cartree_modular(*parameter_list[0])
#!/usr/bin/env python from numpy import array, dtype, int32 traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' label_traindat = '../data/label_train_multiclass.dat' # set both input attributes as continuous (type 2) feattypes = array([2, 2],dtype=int32) parameter_list = [[traindat,testdat,label_traindat,feattypes]] def multiclass_chaidtree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes): try: from modshogun import RealFeatures, MulticlassLabels, CSVFile, CHAIDTree except ImportError: print("Could not import Shogun modules") return # wrap features and labels into Shogun objects feats_train=RealFeatures(CSVFile(train)) feats_test=RealFeatures(CSVFile(test)) train_labels=MulticlassLabels(CSVFile(labels)) # CHAID Tree formation with nominal dependent variable c=CHAIDTree(0,feattypes,10) c.set_labels(train_labels) c.train(feats_train) # Classify test data output=c.apply_multiclass(feats_test).get_labels() return c,output if __name__=='__main__': print('CHAIDTree') multiclass_chaidtree_modular(*parameter_list[0])
#!/usr/bin/env python from numpy import array # create data train_data = array([[1.0, 2.0, 1.0, 3.0, 1.0, 3.0, 2.0, 2.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 2.0], [2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0], [3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 2.0, 1.0, 3.0, 1.0, 2.0, 1.0, 3.0, 1.0, 2.0], [1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0]]) train_labels = array([1.0, 2.0, 1.0, 3.0, 1.0, 2.0, 2.0, 1.0, 3.0, 1.0, 2.0, 1.0, 3.0, 1.0, 2.0]) test_data = array([[2.0, 2.0, 1.0, 3.0, 3.0], [2.0, 1.0, 2.0, 1.0, 2.0], [3.0, 2.0, 1.0, 3.0, 2.0], [1.0, 2.0, 1.0, 2.0, 1.0]]) parameter_list = [[train_data, train_labels, test_data]] def multiclass_id3classifiertree_modular(train=train_data,labels=train_labels,test=test_data): try: from modshogun import RealFeatures, MulticlassLabels, ID3ClassifierTree except ImportError: return # wrap features and labels into Shogun objects feats_train=RealFeatures(train) feats_test=RealFeatures(test) feats_labels=MulticlassLabels(labels) # ID3 Tree formation id3=ID3ClassifierTree() id3.set_labels(feats_labels) id3.train(feats_train) # Classify test data output=id3.apply_multiclass(feats_test).get_labels() return id3,output if __name__=='__main__': print('ID3ClassifierTree') multiclass_id3classifiertree_modular(*parameter_list[0])
#!/usr/bin/env python from numpy import array traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' label_traindat = '../data/label_train_multiclass.dat' # set both input attributes as not nominal (i.e. continuous) feattypes = array([False, False]) parameter_list = [[traindat,testdat,label_traindat,feattypes]] def multiclass_randomforest_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes): try: from modshogun import RealFeatures, MulticlassLabels, CSVFile, RandomForest, MajorityVote except ImportError: print("Could not import Shogun modules") return # wrap features and labels into Shogun objects feats_train=RealFeatures(CSVFile(train)) feats_test=RealFeatures(CSVFile(test)) train_labels=MulticlassLabels(CSVFile(labels)) # Random Forest formation rand_forest=RandomForest(feats_train,train_labels,20,1) rand_forest.set_feature_types(ft) rand_forest.set_combination_rule(MajorityVote()) rand_forest.train() # Classify test data output=rand_forest.apply_multiclass(feats_test).get_labels() return rand_forest,output if __name__=='__main__': print('RandomForest') multiclass_randomforest_modular(*parameter_list[0])
#!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() data = lm.load_numbers('../data/fm_train_real.dat') parameter_list = [[data, 20], [data, 30]] def preprocessor_dimensionreductionpreprocessor_modular (data, k): from modshogun import RealFeatures from modshogun import DimensionReductionPreprocessor try: from modshogun import LocallyLinearEmbedding except ImportError: print("LocallyLinearEmbedding not available") exit(0) features = RealFeatures(data) converter = LocallyLinearEmbedding() converter.set_k(k) preprocessor = DimensionReductionPreprocessor(converter) preprocessor.init(features) preprocessor.apply_to_feature_matrix(features) return features if __name__=='__main__': print('DimensionReductionPreprocessor') preprocessor_dimensionreductionpreprocessor_modular(*parameter_list[0])
#!/usr/bin/env python from tools.load import LoadMatrix from modshogun import * lm=LoadMatrix() data = lm.load_numbers('../data/fm_train_real.dat') labels = lm.load_numbers('../data/label_train_multiclass.dat') parameter_list = [[data, labels, CANVAR_FLDA], [data, labels, CLASSIC_FLDA]] def preprocessor_fisherlda_modular (data, labels, method): from modshogun import RealFeatures, MulticlassLabels, CANVAR_FLDA from modshogun import FisherLda sg_features = RealFeatures(data) sg_labels = MulticlassLabels(labels) preprocessor=FisherLda(method) preprocessor.fit(sg_features, sg_labels, 1) yn=preprocessor.apply_to_feature_matrix(sg_features) return yn if __name__=='__main__': print('FisherLda') preprocessor_fisherlda_modular(*parameter_list[0])
# In this example toy data is processed using the kernel PCA algorithm # as described in # # Schölkopf, B., Smola, A. J., & Müller, K.-R. (1999). # Kernel Principal Component Analysis. # Advances in Kernel Methods - Support Vector Learning, pages 327-352. MIT Press. # Retrieved from http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.32.8744 # # A Gaussian kernel is used for the processing. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() data = lm.load_numbers('../data/fm_train_real.dat') parameter_list = [[data, 0.01, 1.0], [data, 0.05, 2.0]] def preprocessor_kernelpca_modular (data, threshold, width): from modshogun import RealFeatures from modshogun import KernelPCA from modshogun import GaussianKernel features = RealFeatures(data) kernel = GaussianKernel(features,features,width) preprocessor = KernelPCA(kernel) preprocessor.init(features) preprocessor.set_target_dim(2) preprocessor.apply_to_feature_matrix(features) return features if __name__=='__main__': print('KernelPCA') preprocessor_kernelpca_modular(*parameter_list[0])
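For reference, the core of the algorithm fits in a few lines of NumPy. A minimal sketch, assuming Shogun's Gaussian parametrization k(x,y)=exp(-||x-y||^2/width) and returning only the projections of the training points:
import numpy as np

def kernel_pca(X, width, target_dim):
    # X has shape (dim, n), features as columns, as in the example above
    sq = ((X[:, :, None] - X[:, None, :])**2).sum(axis=0)
    K = np.exp(-sq / width)
    n = K.shape[0]
    # center the kernel matrix, i.e. center the data in feature space
    H = np.eye(n) - np.ones((n, n)) / n
    Kc = H.dot(K).dot(H)
    # leading eigenvectors, scaled by 1/sqrt(eigenvalue), give the components
    w, V = np.linalg.eigh(Kc)
    order = np.argsort(w)[::-1][:target_dim]
    alphas = V[:, order] / np.sqrt(np.maximum(w[order], 1e-12))
    return Kc.dot(alphas)

if __name__ == '__main__':
    X = np.random.RandomState(0).randn(2, 10)
    print(kernel_pca(X, width=1.0, target_dim=2).shape)  # (10, 2)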
# In this example a kernel matrix is computed for a given real-valued data set. # The kernel used is the Chi2 kernel which operates on real-valued vectors. It # computes the chi-squared distance between sets of histograms. It is a very # useful distance in image recognition (used to detect objects). The preprocessor # LogPlusOne adds one to each component of a dense real-valued vector and takes # the logarithm. It is most useful in situations where the inputs are counts: # when one compares differences of small counts, any difference may matter a lot, # while small differences in large counts don't. This is what the log # transformation controls for. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list = [[traindat+10,testdat+10,1.4,10],[traindat+10,testdat+10,1.5,10]] def preprocessor_logplusone_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10): from modshogun import Chi2Kernel from modshogun import RealFeatures from modshogun import LogPlusOne feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) preproc=LogPlusOne() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=Chi2Kernel(feats_train, feats_train, width, size_cache) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('LogPlusOne') preprocessor_logplusone_modular(*parameter_list[0])
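The effect of the transformation is easy to verify numerically (np.log1p computes log(1+x)):
import numpy as np

counts = np.array([1.0, 2.0, 1000.0, 1010.0])
# raw differences: the large-count pair dominates (1 vs 10)
print(np.diff(counts.reshape(2, 2)))            # [[ 1.] [10.]]
# after log(1+x): the small-count difference is now the larger one
print(np.diff(np.log1p(counts).reshape(2, 2)))  # approx [[0.41] [0.01]]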
# In this example a kernel matrix is computed for a given real-valued data set. # The kernel used is the Chi2 kernel which operates on real-valued vectors. It # computes the chi-squared distance between sets of histograms. It is a very # useful distance in image recognition (used to detect objects). The preprocessor # NormOne normalizes vectors to unit norm. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list = [[traindat,testdat,1.4,10],[traindat,testdat,1.5,10]] def preprocessor_normone_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10): from modshogun import Chi2Kernel from modshogun import RealFeatures from modshogun import NormOne feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) preprocessor=NormOne() preprocessor.init(feats_train) feats_train.add_preprocessor(preprocessor) feats_train.apply_preprocessor() feats_test.add_preprocessor(preprocessor) feats_test.apply_preprocessor() kernel=Chi2Kernel(feats_train, feats_train, width, size_cache) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('NormOne') preprocessor_normone_modular(*parameter_list[0])
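Per feature vector (a column, in Shogun's features-as-columns convention) this amounts to the following, sketched in NumPy:
import numpy as np

def norm_one(X):
    # scale every column to unit Euclidean norm; zero columns are left alone
    norms = np.linalg.norm(X, axis=0)
    return X / np.where(norms == 0, 1.0, norms)

if __name__ == '__main__':
    X = np.array([[3.0, 0.0], [4.0, 2.0]])
    print(np.linalg.norm(norm_one(X), axis=0))  # [1. 1.]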
# In this example toy data is processed using # Principal Component Analysis. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() data = lm.load_numbers('../data/fm_train_real.dat') parameter_list = [[data]] def preprocessor_pca_modular (data): from modshogun import RealFeatures from modshogun import PCA features = RealFeatures(data) preprocessor = PCA() preprocessor.init(features) preprocessor.apply_to_feature_matrix(features) return features if __name__=='__main__': print('PCA') preprocessor_pca_modular(*parameter_list[0])
# In this example a kernel matrix is computed for a given real-valued data set. # The kernel used is the Chi2 kernel which operates on real-valued vectors. It # computes the chi-squared distance between sets of histograms. It is a very # useful distance in image recognition (used to detect objects). The preprocessor # PruneVarSubMean subtracts the mean from each feature and removes features that # have zero variance. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list = [[traindat,testdat,1.5,10],[traindat,testdat,1.5,10]] def preprocessor_prunevarsubmean_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10): from modshogun import Chi2Kernel from modshogun import RealFeatures from modshogun import PruneVarSubMean feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) preproc=PruneVarSubMean() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=Chi2Kernel(feats_train, feats_train, width, size_cache) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('PruneVarSubMean') preprocessor_prunevarsubmean_modular(*parameter_list[0])
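A rough NumPy equivalent of the preprocessor's effect, assuming plain centering without variance normalization:
import numpy as np

def prune_var_sub_mean(X):
    # X has shape (dim, n); center each feature (row) and drop constant rows
    mu = X.mean(axis=1, keepdims=True)
    var = X.var(axis=1)
    return (X - mu)[var > 0]

if __name__ == '__main__':
    X = np.array([[1.0, 2.0, 3.0], [5.0, 5.0, 5.0]])
    print(prune_var_sub_mean(X))  # the constant second feature is removed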
#!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') parameter_list = [[traindat,testdat,1.5,10],[traindat,testdat,1.5,10]] from modshogun import Math_init_random; Math_init_random(12345); def preprocessor_randomfouriergausspreproc_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10): from modshogun import Chi2Kernel from modshogun import RealFeatures from modshogun import RandomFourierGaussPreproc feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) preproc=RandomFourierGaussPreproc() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=Chi2Kernel(feats_train, feats_train, width, size_cache) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('RandomFourierGaussPreproc') preprocessor_randomfouriergausspreproc_modular(*parameter_list[0])
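This preprocessor is based on random Fourier features (Rahimi & Recht, 2007), which approximate a Gaussian kernel by an explicit randomized feature map. A minimal NumPy sketch of the idea, assuming Shogun's k(x,y)=exp(-||x-y||^2/width) parametrization (note the example above then feeds the transformed features into a Chi2 kernel):
import numpy as np

def random_fourier_features(X, width, D, seed=0):
    # z(x) = sqrt(2/D) * cos(W'x + b) with W ~ N(0, (2/width) * I),
    # so that z(x)'z(y) approximates exp(-||x-y||^2 / width)
    rng = np.random.RandomState(seed)
    dim, n = X.shape
    W = rng.randn(D, dim) * np.sqrt(2.0 / width)
    b = rng.uniform(0, 2 * np.pi, size=(D, 1))
    return np.sqrt(2.0 / D) * np.cos(W.dot(X) + b)

if __name__ == '__main__':
    X = np.random.RandomState(1).randn(2, 5)
    Z = random_fourier_features(X, width=2.0, D=5000)
    exact = np.exp(-((X[:, :, None] - X[:, None, :])**2).sum(axis=0) / 2.0)
    print(np.abs(Z.T.dot(Z) - exact).max())  # small approximation error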
# In this example a kernel matrix is computed for a given string data set. The # CommUlongString kernel is used to compute the spectrum kernel from strings that # have been mapped into unsigned 64bit integers. These 64bit integers correspond # to k-mers. To be usable with this kernel, the mapped k-mers have to be sorted. # This is done using the SortUlongString preprocessor, which sorts the individual # strings in ascending order. The kernel function basically uses the algorithm in # the unix "comm" command (hence the name). Note that this representation enables # spectrum kernels of order 8 for 8bit alphabets (like binaries) and order 32 for # 2-bit alphabets like DNA. For this kernel the linadd speedups are implemented # (though there is room for improvement here when a whole set of sequences is # ADDed) using sorted lists. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindna = lm.load_dna('../data/fm_train_dna.dat') testdna = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindna,testdna,4,0,False,False],[traindna,testdna,3,0,False,False]] def preprocessor_sortulongstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False): from modshogun import CommUlongStringKernel from modshogun import StringCharFeatures, StringUlongFeatures, DNA from modshogun import SortUlongString charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringUlongFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringUlongFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortUlongString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=CommUlongStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('CommUlongString') preprocessor_sortulongstring_modular(*parameter_list[0])
# In this example a kernel matrix is computed for a given string data set. The # CommWordString kernel is used to compute the spectrum kernel from strings that # have been mapped into unsigned 16bit integers. These 16bit integers correspond # to k-mers. To be usable with this kernel, the mapped k-mers have to be sorted. # This is done using the SortWordString preprocessor, which sorts the individual # strings in ascending order. The kernel function basically uses the algorithm in # the unix "comm" command (hence the name). Note that this representation is # especially tuned to small alphabets (like the 2-bit alphabet DNA), for which it # enables spectrum kernels of order up to 8. For this kernel the linadd speedups # are quite efficiently implemented using direct maps. #!/usr/bin/env python from tools.load import LoadMatrix lm=LoadMatrix() traindna = lm.load_dna('../data/fm_train_dna.dat') testdna = lm.load_dna('../data/fm_test_dna.dat') parameter_list = [[traindna,testdna,3,0,False,False],[traindna,testdna,3,0,False,False]] def preprocessor_sortwordstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False): from modshogun import CommWordStringKernel from modshogun import StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=CommWordStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel if __name__=='__main__': print('CommWordString') preprocessor_sortwordstring_modular(*parameter_list[0])
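The comm-style merge that both Comm*String kernels rely on is easy to sketch in plain Python: once the k-mer lists are sorted, the dot product of two spectra is a single linear pass. Illustrative only; Shogun works on the integer-mapped representation produced by the preprocessors above.
def kmers(s, k):
    # sorted list of all k-mers of the string s
    return sorted(s[i:i+k] for i in range(len(s) - k + 1))

def sorted_kmer_dot(a, b):
    # merge two sorted k-mer lists and accumulate count_a*count_b for
    # every shared k-mer, i.e. the (unnormalized) spectrum kernel value
    i = j = total = 0
    while i < len(a) and j < len(b):
        if a[i] < b[j]:
            i += 1
        elif a[i] > b[j]:
            j += 1
        else:
            k, ca, cb = a[i], 0, 0
            while i < len(a) and a[i] == k:
                i += 1
                ca += 1
            while j < len(b) and b[j] == k:
                j += 1
                cb += 1
            total += ca * cb
    return total

if __name__ == '__main__':
    print(sorted_kmer_dot(kmers('ACGTACGT', 3), kmers('ACGTTGCA', 3)))  # 4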
#!/usr/bin/env python from numpy import array # set the single input attribute as not nominal (i.e. continuous) feattypes = array([False]) parameter_list = [[50,5,15,0.2,feattypes]] def regression_cartree_modular(num_train=500,num_test=50,x_range=15,noise_var=0.2,ft=feattypes): try: from modshogun import RealFeatures, RegressionLabels, CSVFile, CARTree, PT_REGRESSION from numpy import random except ImportError: print("Could not import Shogun and/or numpy modules") return random.seed(1) # form training dataset : y=x with noise X_train=random.rand(1,num_train)*x_range; Y_train=X_train+random.randn(num_train)*noise_var # form test dataset X_test=array([[float(i)/num_test*x_range for i in range(num_test)]]) # wrap features and labels into Shogun objects feats_train=RealFeatures(X_train) feats_test=RealFeatures(X_test) train_labels=RegressionLabels(Y_train[0]) # CART Tree formation c=CARTree(ft,PT_REGRESSION,5,True) c.set_labels(train_labels) c.train(feats_train) # Regress on test data output=c.apply_regression(feats_test).get_labels() return c,output if __name__=='__main__': print('CARTree') regression_cartree_modular(*parameter_list[0])
#!/usr/bin/env python from numpy import array, dtype, int32 # set the single input attribute as continuous (type 2) feattypes = array([2],dtype=int32) parameter_list = [[500,50,15,0.2,feattypes]] def regression_chaidtree_modular(num_train=500,num_test=50,x_range=15,noise_var=0.2,ft=feattypes): try: from modshogun import RealFeatures, RegressionLabels, CSVFile, CHAIDTree, PT_REGRESSION from numpy import random except ImportError: print("Could not import Shogun and/or numpy modules") return random.seed(1) # form training dataset : y=x with noise X_train=random.rand(1,num_train)*x_range; Y_train=X_train+random.randn(num_train)*noise_var # form test dataset X_test=array([[float(i)/num_test*x_range for i in range(num_test)]]) # wrap features and labels into Shogun objects feats_train=RealFeatures(X_train) feats_test=RealFeatures(X_test) train_labels=RegressionLabels(Y_train[0]) # CHAID Tree formation c=CHAIDTree(2,feattypes,50) c.set_labels(train_labels) c.train(feats_train) # Regress on test data output=c.apply_regression(feats_test).get_labels() return c,output if __name__=='__main__': print('CHAIDTree') regression_chaidtree_modular(*parameter_list[0])
#!/usr/bin/env python from numpy import array, random traindat = '../data/fm_train_real.dat' testdat = '../data/fm_test_real.dat' label_traindat = '../data/label_train_multiclass.dat' # set the single input attribute as not nominal (i.e. continuous) feattypes = array([False]) parameter_list = [[500,50,15,0.2,feattypes]] def regression_randomforest_modular(num_train=500,num_test=50,x_range=15,noise_var=0.2,ft=feattypes): try: from modshogun import RealFeatures, RegressionLabels, CSVFile, RandomForest, MeanRule, PT_REGRESSION except ImportError: print("Could not import Shogun modules") return random.seed(1) # form training dataset : y=x with noise X_train=random.rand(1,num_train)*x_range; Y_train=X_train+random.randn(num_train)*noise_var # form test dataset X_test=array([[float(i)/num_test*x_range for i in range(num_test)]]) # wrap features and labels into Shogun objects feats_train=RealFeatures(X_train) feats_test=RealFeatures(X_test) train_labels=RegressionLabels(Y_train[0]) # Random Forest formation rand_forest=RandomForest(feats_train,train_labels,20,1) rand_forest.set_feature_types(ft) rand_forest.set_machine_problem_type(PT_REGRESSION) rand_forest.set_combination_rule(MeanRule()) rand_forest.train() # Regress on test data output=rand_forest.apply_regression(feats_test).get_labels() return rand_forest,output if __name__=='__main__': print('RandomForest') regression_randomforest_modular(*parameter_list[0])
# In this example a support vector regression algorithm is trained on a # real-valued toy data set. The underlying library used for the SVR training is # SVM^light. The SVR is trained with regularization parameter C=1 and a Gaussian # kernel with width=1.2. The labels of both the training and the test data are # fetched via svr.apply().get_labels(). # # For more details on SVM^light see # T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel # Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999. #!/usr/bin/env python ########################################################################### # svm light based support vector regression ########################################################################### from numpy import array from numpy.random import seed, rand from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') label_traindat = lm.load_labels('../data/label_train_twoclass.dat') parameter_list = [[traindat,testdat,label_traindat,1.2,1,1e-5,1e-2,1],[traindat,testdat,label_traindat,2.3,0.5,1e-5,1e-6,1]] def regression_svrlight_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat, \ width=1.2,C=1,epsilon=1e-5,tube_epsilon=1e-2,num_threads=3): from modshogun import RegressionLabels, RealFeatures from modshogun import GaussianKernel try: from modshogun import SVRLight except ImportError: print('No support for SVRLight available.') return feats_train=RealFeatures(fm_train) feats_test=RealFeatures(fm_test) kernel=GaussianKernel(feats_train, feats_train, width) labels=RegressionLabels(label_train) svr=SVRLight(C, epsilon, kernel, labels) svr.set_tube_epsilon(tube_epsilon) svr.parallel.set_num_threads(num_threads) svr.train() kernel.init(feats_train, feats_test) out = svr.apply().get_labels() return out, kernel if __name__=='__main__': print('SVRLight') regression_svrlight_modular(*parameter_list[0])
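The tube_epsilon parameter sets the width of the epsilon-insensitive zone of the SVR loss: residuals inside the tube cost nothing, the rest is penalized linearly. A one-function NumPy illustration:
import numpy as np

def epsilon_insensitive_loss(y_true, y_pred, tube_epsilon):
    # max(0, |residual| - epsilon), the loss minimized by epsilon-SVR
    return np.maximum(0.0, np.abs(y_true - y_pred) - tube_epsilon)

if __name__ == '__main__':
    y, yhat = np.array([1.0, 2.0, 3.0]), np.array([1.005, 2.5, 3.0])
    print(epsilon_insensitive_loss(y, yhat, 0.01))  # [0. 0.49 0.]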
# In this example serialization of SVM (Support Vector Machine) is shown #!/usr/bin/env python parameter_list=[[10,0.3,2, 1.0, 0.1]] def check_status(status,suffix): # silent... assert status, "ERROR reading/writing status:%s/suffix:%s\n" % (status,suffix) def serialization_complex_example (num=5, dist=1, dim=10, C=2.0, width=10): import os from numpy import concatenate, zeros, ones from numpy.random import randn, seed from modshogun import RealFeatures, MulticlassLabels from modshogun import GMNPSVM from modshogun import GaussianKernel from modshogun import SerializableHdf5File,SerializableAsciiFile, \ SerializableJsonFile,SerializableXmlFile,MSG_DEBUG from modshogun import NormOne, LogPlusOne seed(17) data=concatenate((randn(dim, num), randn(dim, num) + dist, randn(dim, num) + 2*dist, randn(dim, num) + 3*dist), axis=1) lab=concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num))) feats=RealFeatures(data) #feats.io.set_loglevel(MSG_DEBUG) #feats.io.enable_file_and_line() kernel=GaussianKernel(feats, feats, width) labels=MulticlassLabels(lab) svm = GMNPSVM(C, kernel, labels) feats.add_preprocessor(NormOne()) feats.add_preprocessor(LogPlusOne()) feats.set_preprocessed(1) svm.train(feats) bias_ref = svm.get_svm(0).get_bias() #svm.print_serializable() fstream = SerializableHdf5File("tmp/blaah.h5", "w") status = svm.save_serializable(fstream) check_status(status,'h5') fstream = SerializableAsciiFile("tmp/blaah.asc", "w") status = svm.save_serializable(fstream) check_status(status,'asc') fstream = SerializableJsonFile("tmp/blaah.json", "w") status = svm.save_serializable(fstream) check_status(status,'json') fstream = SerializableXmlFile("tmp/blaah.xml", "w") status = svm.save_serializable(fstream) check_status(status,'xml') fstream = SerializableHdf5File("tmp/blaah.h5", "r") new_svm=GMNPSVM() status = new_svm.load_serializable(fstream) check_status(status,'h5') new_svm.train() bias_h5 = new_svm.get_svm(0).get_bias() fstream = SerializableAsciiFile("tmp/blaah.asc", "r") new_svm=GMNPSVM() status = new_svm.load_serializable(fstream) check_status(status,'asc') new_svm.train() bias_asc = new_svm.get_svm(0).get_bias() fstream = SerializableJsonFile("tmp/blaah.json", "r") new_svm=GMNPSVM() status = new_svm.load_serializable(fstream) check_status(status,'json') new_svm.train() bias_json = new_svm.get_svm(0).get_bias() fstream = SerializableXmlFile("tmp/blaah.xml", "r") new_svm=GMNPSVM() status = new_svm.load_serializable(fstream) check_status(status,'xml') new_svm.train() bias_xml = new_svm.get_svm(0).get_bias() os.unlink("tmp/blaah.h5") os.unlink("tmp/blaah.asc") os.unlink("tmp/blaah.json") os.unlink("tmp/blaah.xml") return svm,new_svm, bias_ref, bias_h5, bias_asc, bias_json, bias_xml if __name__=='__main__': print('Serialization SVMLight') serialization_complex_example(*parameter_list[0])
# In this example dense toy features are serialized #!/usr/bin/env python from modshogun import * from numpy import array import os parameter_list=[[[[1.0,2,3],[4,5,6]]]] def serialization_matrix_modular (m): feats=RealFeatures(array(m)) #feats.io.set_loglevel(0) fstream = SerializableAsciiFile("tmp/foo.asc", "w") feats.save_serializable(fstream) l=MulticlassLabels(array([1.0,2,3])) fstream = SerializableAsciiFile("tmp/foo2.asc", "w") l.save_serializable(fstream) os.unlink("tmp/foo.asc") os.unlink("tmp/foo2.asc") if __name__=='__main__': print('Serialization Matrix Modular') serialization_matrix_modular(*parameter_list[0])
#!/usr/bin/env python from modshogun import WeightedDegreeStringKernel, WeightedDegreePositionStringKernel, WeightedCommWordStringKernel from modshogun import LinearKernel, PolyKernel, GaussianKernel, WeightedDegreeRBFKernel, CombinedKernel from modshogun import StringCharFeatures, StringWordFeatures, RealFeatures, CombinedFeatures, SortWordString from modshogun import DNA, PROTEIN, Labels, BinaryLabels, Alphabet, CTaxonomy from modshogun import MSG_DEBUG try: from modshogun import SVMLight except ImportError: print("SVMLight is not available") exit(0) from numpy import concatenate, ones from numpy.random import randn, seed import numpy import sys import types import random import bz2 import pickle import inspect ################################################### # Random Data ################################################### def generate_random_string(length, number): """ generate sample over alphabet """ dat = [] alphabet = "AGTC" for i in range(number): dat.append("".join([random.choice(alphabet) for j in range(length)])) return dat def generate_random_data(number): """ create random examples and labels """ labels = numpy.array([random.choice([-1.0, 1.0]) for i in range(number)]) examples = numpy.array(generate_random_string(22, number)) return examples, labels def save(filename, myobj): """ save object to file using pickle @param filename: name of destination file @type filename: str @param myobj: object to save (has to be pickleable) @type myobj: obj """ try: f = bz2.BZ2File(filename, 'wb') except IOError as details: sys.stderr.write('File ' + filename + ' cannot be written\n') sys.stderr.write(str(details)) return pickle.dump(myobj, f, protocol=2) f.close() def load(filename): """ Load from filename using pickle @param filename: name of file to load from @type filename: str """ try: f = bz2.BZ2File(filename, 'rb') except IOError as details: sys.stderr.write('File ' + filename + ' cannot be read\n') sys.stderr.write(str(details)) return myobj = pickle.load(f) f.close() return myobj def get_spectrum_features(data, order=3, gap=0, reverse=True): """ create feature object used by spectrum kernel """ charfeat = StringCharFeatures(data, DNA) feat = StringWordFeatures(charfeat.get_alphabet()) feat.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc = SortWordString() preproc.init(feat) feat.add_preprocessor(preproc) feat.apply_preprocessor() return feat def get_wd_features(data, feat_type="dna"): """ create feature object for wdk """ if feat_type == "dna": feat = StringCharFeatures(DNA) elif feat_type == "protein": feat = StringCharFeatures(PROTEIN) else: raise Exception("unknown feature type") feat.set_features(data) return feat def construct_features(features): """ build combined features: a WD kernel feature object on the full strings plus spectrum features on both string halves """ feat_all = [inst for inst in features] feat_lhs = [inst[0:15] for inst in features] feat_rhs = [inst[15:] for inst in features] feat_wd = get_wd_features(feat_all) feat_spec_1 = get_spectrum_features(feat_lhs, order=3) feat_spec_2 = get_spectrum_features(feat_rhs, order=3) feat_comb = CombinedFeatures() feat_comb.append_feature_obj(feat_wd) feat_comb.append_feature_obj(feat_spec_1) feat_comb.append_feature_obj(feat_spec_2) return feat_comb parameter_list = [[200, 1, 100]] def serialization_string_kernels_modular(n_data, num_shifts, size): """ serialize svm
with string kernels """ ################################################## # set up toy data and svm train_xt, train_lt = generate_random_data(n_data) test_xt, test_lt = generate_random_data(n_data) feats_train = construct_features(train_xt) feats_test = construct_features(test_xt) max_len = len(train_xt[0]) kernel_wdk = WeightedDegreePositionStringKernel(size, 5) shifts_vector = numpy.ones(max_len, dtype=numpy.int32)*num_shifts kernel_wdk.set_shifts(shifts_vector) ######## # set up spectrum use_sign = False kernel_spec_1 = WeightedCommWordStringKernel(size, use_sign) kernel_spec_2 = WeightedCommWordStringKernel(size, use_sign) ######## # combined kernel kernel = CombinedKernel() kernel.append_kernel(kernel_wdk) kernel.append_kernel(kernel_spec_1) kernel.append_kernel(kernel_spec_2) # init kernel labels = BinaryLabels(train_lt); svm = SVMLight(1.0, kernel, labels) #svm.io.set_loglevel(MSG_DEBUG) svm.train(feats_train) ################################################## # serialize to file fn = "serialized_svm.bz2" #print("serializing SVM to file", fn) save(fn, svm) ################################################## # unserialize and sanity check #print("unserializing SVM") svm2 = load(fn) #print("comparing predictions") out = svm.apply(feats_test).get_labels() out2 = svm2.apply(feats_test).get_labels() # assert outputs are close for i in range(len(out)): assert abs(out[i] - out2[i]) < 0.000001 #print("all checks passed.") return out,out2 if __name__=='__main__': serialization_string_kernels_modular(*parameter_list[0])
# This example shows how to use boost serialization (only available if the compile flag was enabled) # to serialize/deserialize an SVMLight object. Note that this code is in alpha state. #!/usr/bin/env python parameter_list=[[10, 1, 2.1, 2.0]] def serialization_svmlight_modular (num, dist, width, C): from modshogun import MSG_DEBUG from modshogun import RealFeatures, BinaryLabels, DNA, Alphabet from modshogun import WeightedDegreeStringKernel, GaussianKernel try: from modshogun import SVMLight except ImportError: print("SVMLight not available") exit(0) from numpy import concatenate, ones from numpy.random import randn, seed import sys import types import random import bz2 import pickle import inspect def save(filename, myobj): """ save object to file using pickle @param filename: name of destination file @type filename: str @param myobj: object to save (has to be pickleable) @type myobj: obj """ try: f = bz2.BZ2File(filename, 'wb') except IOError as details: sys.stderr.write('File ' + filename + ' cannot be written\n') sys.stderr.write(str(details)) return pickle.dump(myobj, f, protocol=2) f.close() def load(filename): """ Load from filename using pickle @param filename: name of file to load from @type filename: str """ try: f = bz2.BZ2File(filename, 'rb') except IOError as details: sys.stderr.write('File ' + filename + ' cannot be read\n') sys.stderr.write(str(details)) return myobj = pickle.load(f) f.close() return myobj ################################################## # set up toy data and svm traindata_real = concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1) testdata_real = concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1); trainlab = concatenate((-ones(num), ones(num))); testlab = concatenate((-ones(num), ones(num))); feats_train = RealFeatures(traindata_real); feats_test = RealFeatures(testdata_real); kernel = GaussianKernel(feats_train, feats_train, width); #kernel.io.set_loglevel(MSG_DEBUG) labels = BinaryLabels(trainlab); svm = SVMLight(C, kernel, labels) svm.train() #svm.io.set_loglevel(MSG_DEBUG) ################################################## # serialize to file fn = "serialized_svm.bz2" #print("serializing SVM to file", fn) save(fn, svm) ################################################## # unserialize and sanity check #print("unserializing SVM") svm2 = load(fn) #print("comparing objectives") svm2.train() #print("objective before serialization:", svm.get_objective()) #print("objective after serialization:", svm2.get_objective()) #print("comparing predictions") out = svm.apply(feats_test).get_labels() out2 = svm2.apply(feats_test).get_labels() # assert outputs are close for i in range(len(out)): assert abs(out[i] - out2[i]) < 0.000001 #print("all checks passed.") return True if __name__=='__main__': print('Serialization SVMLight') serialization_svmlight_modular(*parameter_list[0])
#!/usr/bin/env python import numpy as np def gen_data(num_classes,num_samples,dim): np.random.seed(0) covs = np.array([[[0., -1. ], [2.5, .7]], [[3., -1.5], [1.2, .3]], [[ 2, 0 ], [ .0, 1.5 ]]]) X = np.r_[np.dot(np.random.randn(num_samples, dim), covs[0]) + np.array([0, 10]), np.dot(np.random.randn(num_samples, dim), covs[1]) + np.array([-10, -10]), np.dot(np.random.randn(num_samples, dim), covs[2]) + np.array([10, -10])]; Y = np.hstack((np.zeros(num_samples), np.ones(num_samples), 2*np.ones(num_samples))) return X, Y # Number of classes M = 3 # Number of samples of each class N = 50 # Dimension of the data dim = 2 traindat, label_traindat = gen_data(M,N,dim) parameter_list = [[traindat,label_traindat]] def so_multiclass (fm_train_real=traindat,label_train_multiclass=label_traindat): try: from modshogun import RealFeatures from modshogun import MulticlassModel, MulticlassSOLabels, PrimalMosekSOSVM, RealNumber except ImportError: print("Mosek not available") return labels = MulticlassSOLabels(label_train_multiclass) features = RealFeatures(fm_train_real.T) model = MulticlassModel(features, labels) sosvm = PrimalMosekSOSVM(model, labels) sosvm.train() out = sosvm.apply() count = 0 for i in range(out.get_num_labels()): yi_pred = RealNumber.obtain_from_generic(out.get_label(i)) if yi_pred.value == label_train_multiclass[i]: count = count + 1 print("Correct classification rate: %0.2f" % ( 100.0*count/out.get_num_labels() )) if __name__=='__main__': print('SO multiclass') so_multiclass(*parameter_list[0])
# In this example, HSIC, a kernel-based test for independence, is used to detect # dependence between a mixture of Gaussians and a rotated version of the same data. # The HSIC statistic is computed and the available methods for computing a threshold # of the null distribution are used. In addition, p-values of the test are # computed. Note that these methods require more iterations than used here. A # Gaussian kernel is selected via the median heuristic. # See tutorial and class documentation for more details. #!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (C) 2012-2013 Heiko Strathmann # import numpy as np from math import pi parameter_list = [[150,3,3]] def statistics_hsic (n, difference, angle): from modshogun import RealFeatures from modshogun import DataGenerator from modshogun import GaussianKernel from modshogun import HSIC from modshogun import PERMUTATION, HSIC_GAMMA from modshogun import EuclideanDistance from modshogun import Statistics, Math # for reproducible results (the numpy seed might not be reproducible across # different OS/Python distributions) Math.init_random(1) np.random.seed(1) # note that HSIC has to store kernel matrices, # which upper-bounds the feasible sample size # use data generator class to produce example data data=DataGenerator.generate_sym_mix_gauss(n,difference,angle) #plot(data[0], data[1], 'x');show() # create shogun feature representation features_x=RealFeatures(np.array([data[0]])) features_y=RealFeatures(np.array([data[1]])) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, Shogun's kernel width differs from the usual parametrization, # therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable subset=np.random.permutation(features_x.get_num_vectors()).astype(np.int32) subset=subset[0:200] features_x.add_subset(subset) dist=EuclideanDistance(features_x, features_x) distances=dist.get_distance_matrix() features_x.remove_subset() median_distance=np.median(distances) sigma_x=median_distance**2 features_y.add_subset(subset) dist=EuclideanDistance(features_y, features_y) distances=dist.get_distance_matrix() features_y.remove_subset() median_distance=np.median(distances) sigma_y=median_distance**2 #print "median distance for Gaussian kernel on x:", sigma_x #print "median distance for Gaussian kernel on y:", sigma_y kernel_x=GaussianKernel(10,sigma_x) kernel_y=GaussianKernel(10,sigma_y) hsic=HSIC(kernel_x,kernel_y,features_x,features_y) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 using different methods to approximate the # null-distribution statistic=hsic.compute_statistic() #print "HSIC:", statistic alpha=0.05 #print "computing p-value using sampling null" hsic.set_null_approximation_method(PERMUTATION) # normally, at least 250 iterations should be done, but that takes long hsic.set_num_null_samples(100) # sampling null allows usage of unbiased or biased statistic p_value_boot=hsic.compute_p_value(statistic) thresh_boot=hsic.compute_threshold(alpha) #print "p_value:", p_value_boot #print "threshold for 0.05 alpha:", thresh_boot #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_boot<alpha
#print "computing p-value using gamma method" hsic.set_null_approximation_method(HSIC_GAMMA) p_value_gamma=hsic.compute_p_value(statistic) thresh_gamma=hsic.compute_threshold(alpha) #print "p_value:", p_value_gamma #print "threshold for 0.05 alpha:", thresh_gamma #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_gamma<alpha # sample from null distribution (these may be plotted, for example) # mean should be close to zero, variance strongly depends on data/kernel # sampling null, biased statistic #print "sampling null distribution using sample_null" hsic.set_null_approximation_method(PERMUTATION) hsic.set_num_null_samples(100) null_samples=hsic.sample_null() #print "null mean:", np.mean(null_samples) #print "null variance:", np.var(null_samples) #hist(null_samples, 100); show() return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples if __name__=='__main__': print('HSIC') statistics_hsic(*parameter_list[0])
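For reference, turning sampled null statistics into a permutation p-value is a one-liner; a sketch of the convention used for one-sided tests such as HSIC and MMD:
import numpy as np

def permutation_p_value(statistic, null_samples):
    # fraction of null samples at least as extreme as the observed statistic
    return np.mean(np.asarray(null_samples) >= statistic)

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    print(permutation_p_value(2.5, rng.randn(100)))  # small p suggests dependence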
#!/usr/bin/env python from numpy import * from numpy import random parameter_list = [[10,3]] def statistics_kmm (n,d): from modshogun import RealFeatures from modshogun import DataGenerator from modshogun import GaussianKernel, MSG_DEBUG try: from modshogun import KernelMeanMatching except ImportError: print("KernelMeanMatching not available") exit(0) from modshogun import Math # init seed for reproducibility Math.init_random(1) random.seed(1); data = random.randn(d,n) # create shogun feature representation features=RealFeatures(data) # use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization # k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2 kernel=GaussianKernel(10,8) kernel.init(features,features) kmm = KernelMeanMatching(kernel,array([0,1,2,3,7,8,9],dtype=int32),array([4,5,6],dtype=int32)) w = kmm.compute_weights() #print w return w if __name__=='__main__': print('KernelMeanMatching') statistics_kmm(*parameter_list[0])
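At its core, kernel mean matching chooses weights on one set of points so that their weighted kernel mean matches the mean embedding of the other set (Huang et al., 2007). The sketch below solves only the unconstrained version of that objective; the real method is a constrained quadratic program, so this is illustrative.
import numpy as np

def kmm_weights_unconstrained(K, train_idx, test_idx):
    # objective: min_w 0.5*w'*K_tr*w - kappa'*w, with
    # kappa_i = (n_tr/n_te) * sum_j K(x_i, x_j_test); solution w = K_tr^-1 kappa.
    # The full KMM additionally constrains the weights to be non-negative and bounded.
    Ktr = K[np.ix_(train_idx, train_idx)]
    kappa = K[np.ix_(train_idx, test_idx)].mean(axis=1) * len(train_idx)
    return np.linalg.solve(Ktr + 1e-8 * np.eye(len(train_idx)), kappa)

if __name__ == '__main__':
    X = np.random.RandomState(0).randn(2, 10)
    K = np.exp(-((X[:, :, None] - X[:, None, :])**2).sum(axis=0) / 8.0)  # tau=8
    print(kmm_weights_unconstrained(K, [0, 1, 2, 3, 7, 8, 9], [4, 5, 6]))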
# In this example, the linear time MMD statistic for kernel-based two-sample # testing is illustrated. It is a streaming-based statistic for large amounts # of data. The dataset used is a bunch of standard Gaussian vectors where the # first dimension differs between the two distributions p and q. The test statistic # is computed and the available methods for computing a threshold of the null # distribution are used. In addition, p-values for the test are computed. # Note that these methods require more iterations/samples than used here. A # Gaussian kernel is selected via the median heuristic. There are more clever # kernel selection methods available. # See tutorial and class documentation for more details. #!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (C) 2012-2013 Heiko Strathmann # from numpy import * parameter_list = [[1000,2,0.5]] def statistics_linear_time_mmd (n,dim,difference): from modshogun import RealFeatures from modshogun import MeanShiftDataGenerator from modshogun import GaussianKernel from modshogun import LinearTimeMMD from modshogun import PERMUTATION, MMD1_GAUSSIAN from modshogun import EuclideanDistance from modshogun import Statistics, Math # init seed for reproducibility Math.init_random(1) # note that the linear time statistic is designed for much larger datasets # so increase to get reasonable results # streaming data generator for mean shift distributions gen_p=MeanShiftDataGenerator(0, dim) gen_q=MeanShiftDataGenerator(difference, dim) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, Shogun's kernel width differs from the usual parametrization, # therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Stream examples and merge them in order to compute median on joint sample features=gen_p.get_streamed_features(100) features=features.create_merged_copy(gen_q.get_streamed_features(100)) # compute all pairwise distances dist=EuclideanDistance(features, features) distances=dist.get_distance_matrix() # compute median and determine kernel width median_distance=median(distances) sigma=median_distance**2 #print "median distance for Gaussian kernel:", sigma kernel=GaussianKernel(10,sigma) # mmd instance using streaming features, blocksize of 10000 mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic=mmd.compute_statistic() #print "test statistic:", statistic # do the same thing using two different ways to approximate the null-distribution # sampling null and gaussian approximation (only for really large samples) alpha=0.05 #print "computing p-value using sampling null" mmd.set_null_approximation_method(PERMUTATION) mmd.set_num_null_samples(50) # normally, far more iterations are needed p_value_boot=mmd.compute_p_value(statistic) #print "p_value_boot:", p_value_boot #print "p_value_boot <", alpha, ", i.e. test says p!=q:", p_value_boot<alpha #print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value_gaussian=mmd.compute_p_value(statistic) #print "p_value_gaussian:", p_value_gaussian #print "p_value_gaussian <", alpha, ", i.e. test says p!=q:", p_value_gaussian<alpha
# sample from null distribution (these may be plotted, for example) # mean should be close to zero, variance strongly depends on data/kernel mmd.set_null_approximation_method(PERMUTATION) mmd.set_num_null_samples(10) # normally, far more iterations are needed null_samples=mmd.sample_null() #print "null mean:", mean(null_samples) #print "null variance:", var(null_samples) # compute type I and type II errors for Gaussian approximation # number of trials should be larger to compute tight confidence bounds mmd.set_null_approximation_method(MMD1_GAUSSIAN) num_trials=5; alpha=0.05 # test power typeIerrors=[0 for x in range(num_trials)] typeIIerrors=[0 for x in range(num_trials)] for i in range(num_trials): # this effectively means that p=q - rejecting is a type I error mmd.set_simulate_h0(True) typeIerrors[i]=mmd.perform_test()>alpha mmd.set_simulate_h0(False) typeIIerrors[i]=mmd.perform_test()>alpha #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors) return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors if __name__=='__main__': print('LinearTimeMMD') statistics_linear_time_mmd(*parameter_list[0])
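The linear time statistic itself has a simple closed form (Gretton et al., 2012): it averages h(z,z') = k(x,x') + k(y,y') - k(x,y') - k(x',y) over disjoint pairs of samples, so no kernel matrix is ever stored. A minimal in-memory sketch, assuming Shogun's k(x,y)=exp(-||x-y||^2/width) convention (the Shogun class computes the same quantity blockwise on streamed data):
import numpy as np

def linear_time_mmd(X, Y, width):
    # X, Y have shape (dim, m); consecutive samples are paired up once each
    k = lambda a, b: np.exp(-((a - b)**2).sum(axis=0) / width)
    m = (X.shape[1] // 2) * 2
    x1, x2 = X[:, 0:m:2], X[:, 1:m:2]
    y1, y2 = Y[:, 0:m:2], Y[:, 1:m:2]
    return (k(x1, x2) + k(y1, y2) - k(x1, y2) - k(x2, y1)).mean()

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X = rng.randn(2, 1000)
    Y = rng.randn(2, 1000) + np.array([[0.5], [0.0]])  # mean shift in dim 0
    print(linear_time_mmd(X, Y, width=2.0))  # positive, since p != q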
#!/usr/bin/env python # # This program is free software you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation either version 3 of the License, or # (at your option) any later version. # # Written (C) 2012-2013 Heiko Strathmann # from numpy import * #from pylab import * parameter_list = [[1000,10,5,3,pi/4, "opt"], [1000,10,5,3,pi/4, "l2"]] def statistics_mmd_kernel_selection_combined(m,distance,stretch,num_blobs,angle,selection_method): from modshogun import RealFeatures from modshogun import GaussianBlobsDataGenerator from modshogun import GaussianKernel, CombinedKernel from modshogun import LinearTimeMMD try: from modshogun import MMDKernelSelectionCombMaxL2 except ImportError: print("MMDKernelSelectionCombMaxL2 not available") exit(0) try: from modshogun import MMDKernelSelectionCombOpt except ImportError: print("MMDKernelSelectionCombOpt not available") exit(0) from modshogun import PERMUTATION, MMD1_GAUSSIAN from modshogun import EuclideanDistance from modshogun import Statistics, Math # init seed for reproducability Math.init_random(1) # note that the linear time statistic is designed for much larger datasets # results for this low number will be bad (unstable, type I error wrong) # streaming data generator gen_p=GaussianBlobsDataGenerator(num_blobs, distance, 1, 0) gen_q=GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle) # stream some data and plot num_plot=1000 features=gen_p.get_streamed_features(num_plot) features=features.create_merged_copy(gen_q.get_streamed_features(num_plot)) data=features.get_feature_matrix() #figure() #subplot(2,2,1) #grid(True) #plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$') #title('$X\sim p$') #subplot(2,2,2) #grid(True) #plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5) #title('$Y\sim q$') # create combined kernel with Gaussian kernels inside (shoguns Gaussian kernel is # different to the standard form, see documentation) sigmas=[2**x for x in range(-3,10)] widths=[x*x*2 for x in sigmas] combined=CombinedKernel() for i in range(len(sigmas)): combined.append_kernel(GaussianKernel(10, widths[i])) # mmd instance using streaming features, blocksize of 10000 block_size=10000 mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size) # kernel selection instance (this can easily replaced by the other methods for selecting # combined kernels if selection_method=="opt": selection=MMDKernelSelectionCombOpt(mmd) elif selection_method=="l2": selection=MMDKernelSelectionCombMaxL2(mmd) # perform kernel selection (kernel is automatically set) kernel=selection.select_kernel() kernel=CombinedKernel.obtain_from_generic(kernel) #print "selected kernel weights:", kernel.get_subkernel_weights() #subplot(2,2,3) #plot(kernel.get_subkernel_weights()) #title("Kernel weights") # compute tpye I and II error (use many more trials). Type I error is only # estimated to check MMD1_GAUSSIAN method for estimating the null # distribution. 
Note that testing has to happen on different data than # the kernel selection, but the linear time mmd does this implicitly mmd.set_null_approximation_method(MMD1_GAUSSIAN) # number of trials should be larger to compute tight confidence bounds num_trials=5; alpha=0.05 # test level typeIerrors=[0 for x in range(num_trials)] typeIIerrors=[0 for x in range(num_trials)] for i in range(num_trials): # this effectively means that p=q - rejecting is a type I error mmd.set_simulate_h0(True) typeIerrors[i]=mmd.perform_test()<alpha # perform_test() returns a p-value; rejecting under h0 is a type I error mmd.set_simulate_h0(False) typeIIerrors[i]=mmd.perform_test()>alpha # failing to reject under h1 is a type II error #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors) return kernel,typeIerrors,typeIIerrors if __name__=='__main__': print('MMDKernelSelectionCombined') statistics_mmd_kernel_selection_combined(*parameter_list[0]) #show()
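# A short illustration of the width convention used above (an assumption based
# on the mapping widths[i]=2*sigmas[i]**2 in the example): Shogun's
# GaussianKernel(cache, width) evaluates exp(-||x-y||^2/width), so a standard
# bandwidth sigma is passed in as width=2*sigma^2.
from numpy import exp
from numpy.linalg import norm

def gaussian_kernel_value(x, y, sigma):
    width = 2.0*sigma*sigma             # Shogun-style width parameter
    return exp(-norm(x-y)**2/width)     # equals the standard exp(-||x-y||^2/(2*sigma^2))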
#!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (C) 2012-2013 Heiko Strathmann # from numpy import * #from pylab import * parameter_list = [[1000,10,5,3,pi/4, "opt"], [1000,10,5,3,pi/4, "max"], [1000,10,5,3,pi/4, "median"]] def statistics_mmd_kernel_selection_single(m,distance,stretch,num_blobs,angle,selection_method): from modshogun import RealFeatures from modshogun import GaussianBlobsDataGenerator from modshogun import GaussianKernel, CombinedKernel from modshogun import LinearTimeMMD from modshogun import MMDKernelSelectionMedian from modshogun import MMDKernelSelectionMax from modshogun import MMDKernelSelectionOpt from modshogun import PERMUTATION, MMD1_GAUSSIAN from modshogun import EuclideanDistance from modshogun import Statistics, Math # init seed for reproducibility Math.init_random(1) # note that the linear time statistic is designed for much larger datasets # results for this low number will be bad (unstable, type I error wrong) # streaming data generator gen_p=GaussianBlobsDataGenerator(num_blobs, distance, 1, 0) gen_q=GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle) # stream some data and plot num_plot=1000 features=gen_p.get_streamed_features(num_plot) features=features.create_merged_copy(gen_q.get_streamed_features(num_plot)) data=features.get_feature_matrix() #figure() #subplot(2,2,1) #grid(True) #plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$') #title('$X\sim p$') #subplot(2,2,2) #grid(True) #plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5) #title('$Y\sim q$') # create combined kernel with Gaussian kernels inside (Shogun's Gaussian kernel is # different from the standard form, see documentation) sigmas=[2**x for x in range(-3,10)] widths=[x*x*2 for x in sigmas] combined=CombinedKernel() for i in range(len(sigmas)): combined.append_kernel(GaussianKernel(10, widths[i])) # mmd instance using streaming features, blocksize of 1000 block_size=1000 mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size) # kernel selection instance (this can easily be replaced by the other methods for selecting # single kernels) if selection_method=="opt": selection=MMDKernelSelectionOpt(mmd) elif selection_method=="max": selection=MMDKernelSelectionMax(mmd) elif selection_method=="median": selection=MMDKernelSelectionMedian(mmd) # print measures (just for information) # in case Opt: ratios of MMD and standard deviation # in case Max: MMDs for each kernel # does not work for the median method if selection_method!="median": ratios=selection.compute_measures() #print "Measures:", ratios #subplot(2,2,3) #plot(ratios) #title('Measures') # perform kernel selection kernel=selection.select_kernel() kernel=GaussianKernel.obtain_from_generic(kernel) #print "selected kernel width:", kernel.get_width() # compute type I and II errors (use many more trials in practice). Type I error is only # estimated to check the MMD1_GAUSSIAN method for estimating the null # distribution.
Note that testing has to happen on different data than # the kernel selection, but the linear time mmd does this implicitly mmd.set_kernel(kernel) mmd.set_null_approximation_method(MMD1_GAUSSIAN) # number of trials should be larger to compute tight confidence bounds num_trials=5; alpha=0.05 # test level typeIerrors=[0 for x in range(num_trials)] typeIIerrors=[0 for x in range(num_trials)] for i in range(num_trials): # this effectively means that p=q - rejecting is a type I error mmd.set_simulate_h0(True) typeIerrors[i]=mmd.perform_test()<alpha # perform_test() returns a p-value; rejecting under h0 is a type I error mmd.set_simulate_h0(False) typeIIerrors[i]=mmd.perform_test()>alpha # failing to reject under h1 is a type II error #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors) return kernel,typeIerrors,typeIIerrors if __name__=='__main__': print('MMDKernelSelection') statistics_mmd_kernel_selection_single(*parameter_list[0]) #show()
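# A small usage sketch: the function above can be run with every entry of
# parameter_list to compare how the three strategies pick the kernel width;
# get_width() is the accessor already used in the commented print above.
#for params in parameter_list:
#    kernel, typeI, typeII = statistics_mmd_kernel_selection_single(*params)
#    print(params[-1], kernel.get_width())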
# In this example, the quadratic time MMD statistic for kernel-based two-sample # testing is illustrated. It is a statistic for smaller amounts of data where # one is interested in computing the best possible test. The dataset used is a # bunch of standard Gaussian vectors where the first dimension differs between the # two distributions p and q. The test statistic is computed and the available methods # for computing a threshold of the null distribution are used. In addition, # p-values for the test are computed. Note that these methods require more # iterations/samples than used here. A Gaussian kernel with a fixed width is # used. There are more clever kernel selection methods available. # See the tutorial and class documentation for more details. #!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (C) 2012-2013 Heiko Strathmann # import numpy as np parameter_list = [[30,2,0.5]] def statistics_quadratic_time_mmd (m,dim,difference): from modshogun import RealFeatures from modshogun import MeanShiftDataGenerator from modshogun import GaussianKernel, CustomKernel from modshogun import QuadraticTimeMMD from modshogun import PERMUTATION, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, BIASED_DEPRECATED from modshogun import Statistics, IntVector, RealVector, Math # for reproducible results (the numpy seed might not be reproducible across # different OS/Python distributions) Math.init_random(1) np.random.seed(1) # number of examples kept low in order to make things fast # streaming data generator for mean shift distributions gen_p=MeanShiftDataGenerator(0, dim); #gen_p.parallel.set_num_threads(1) gen_q=MeanShiftDataGenerator(difference, dim); # stream some data from generator feat_p=gen_p.get_streamed_features(m); feat_q=gen_q.get_streamed_features(m); # set kernel a-priori. Usually one would do some kernel selection; see # other examples for this. width=10; kernel=GaussianKernel(10, width); # create quadratic time mmd instance. Note that this constructor # copies p and q and does not reference them mmd=QuadraticTimeMMD(kernel, feat_p, feat_q); # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 alpha=0.05; # using permutation (slow and not the most reliable way; consider pre- # computing the kernel when using it, see below). # Also, in practice, use at least 250 iterations mmd.set_null_approximation_method(PERMUTATION); mmd.set_num_null_samples(3); p_value_null=mmd.perform_test(); # reject if p-value is smaller than test level #print "bootstrap: p!=q: ", p_value_null<alpha # using spectrum method. Use at least 250 samples from null. # This is consistent but sometimes breaks, so always monitor the type I error. # See the tutorial for the number of eigenvalues to use. mmd.set_statistic_type(BIASED); mmd.set_null_approximation_method(MMD2_SPECTRUM); mmd.set_num_eigenvalues_spectrum(3); mmd.set_num_samples_spectrum(250); p_value_spectrum=mmd.perform_test(); # reject if p-value is smaller than test level #print "spectrum: p!=q: ", p_value_spectrum<alpha # using gamma method. This is a quick hack, which works most of the time # but is NOT guaranteed to. See the tutorial for details.
# Only works with the BIASED_DEPRECATED statistic mmd.set_statistic_type(BIASED_DEPRECATED); mmd.set_null_approximation_method(MMD2_GAMMA); p_value_gamma=mmd.perform_test(); # reject if p-value is smaller than test level #print "gamma: p!=q: ", p_value_gamma<alpha # compute type I and II errors (use many more trials in practice). # Estimating the type I error is not necessary if one uses permutation. We do it here # anyway; the precomputed kernel below makes this an efficient way of computing it. # Also note that testing has to happen on # different data than the kernel selection, but the linear time mmd does this # implicitly and we used a fixed kernel here. mmd.set_statistic_type(BIASED); mmd.set_null_approximation_method(PERMUTATION); mmd.set_num_null_samples(5); num_trials=5; type_I_errors=np.zeros(num_trials) type_II_errors=np.zeros(num_trials) inds=np.array([x for x in range(2*m)], dtype=np.int32) p_and_q=mmd.get_p_and_q(); # use a precomputed kernel to be faster kernel.init(p_and_q, p_and_q); precomputed=CustomKernel(kernel); mmd.set_kernel(precomputed); for i in range(num_trials): # permuting the joint samples effectively means that p=q, so rejecting is a type I error inds=np.random.permutation(inds) # numpy permutation precomputed.add_row_subset(inds); precomputed.add_col_subset(inds); type_I_errors[i]=mmd.perform_test()<alpha; # perform_test() returns a p-value; rejecting under h0 is a type I error precomputed.remove_row_subset(); precomputed.remove_col_subset(); # on normal data, failing to reject gives the type II error type_II_errors[i]=mmd.perform_test()>alpha; return type_I_errors,type_II_errors,p_value_null,p_value_spectrum,p_value_gamma if __name__=='__main__': print('QuadraticTimeMMD') statistics_quadratic_time_mmd(*parameter_list[0])
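# A minimal follow-up sketch: the indicator arrays returned above can be
# averaged into error-rate estimates with numpy (names as returned by the
# function above; with so few trials these are rough estimates only).
#import numpy as np
#errsI, errsII, p_null, p_spectrum, p_gamma = statistics_quadratic_time_mmd(*parameter_list[0])
#print("type I error rate:", np.mean(errsI))
#print("estimated test power:", 1.0-np.mean(errsII))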
#!/usr/bin/env python import numpy as np traindat = '../../../data/uci/housing/fm_housing.dat' label_traindat = '../../../data/uci/housing/housing_label.dat' # mark each input attribute as nominal (True) or continuous (False) feat_types=np.array([False,False,False,True,False,False,False,False,False,False,False,False,False]) parameter_list = [[traindat,label_traindat,feat_types]] def stochasticgbmachine_modular(train=traindat,train_labels=label_traindat,ft=feat_types): try: from modshogun import RealFeatures, RegressionLabels, CSVFile, CARTree, StochasticGBMachine, SquaredLoss except ImportError: print("Could not import Shogun modules") return # wrap features and labels into Shogun objects feats=RealFeatures(CSVFile(train)) labels=RegressionLabels(CSVFile(train_labels)) # divide into training (90%) and test dataset (10%) p=np.random.permutation(labels.get_num_labels()) num=int(labels.get_num_labels()*0.9) # integer cut-off so it can be used for slicing cart=CARTree() cart.set_feature_types(ft) cart.set_max_depth(1) loss=SquaredLoss() s=StochasticGBMachine(cart,loss,500,0.01,0.6) # train feats.add_subset(np.int32(p[0:num])) labels.add_subset(np.int32(p[0:num])) s.set_labels(labels) s.train(feats) feats.remove_subset() labels.remove_subset() # apply feats.add_subset(np.int32(p[num:len(p)])) labels.add_subset(np.int32(p[num:len(p)])) output=s.apply_regression(feats) feats.remove_subset() labels.remove_subset() return s,output if __name__=='__main__': print('StochasticGBMachine') stochasticgbmachine_modular(*parameter_list[0])
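# A possible evaluation step (an assumption, not part of the example above):
# Shogun's MeanSquaredError evaluation class could score the returned
# predictions against RegressionLabels restricted to the same held-out rows.
from modshogun import MeanSquaredError

def regression_mse(predicted, ground_truth):
    # predicted and ground_truth are RegressionLabels of equal length
    return MeanSquaredError().evaluate(predicted, ground_truth)

#s, output = stochasticgbmachine_modular(*parameter_list[0])
#mse = regression_mse(output, test_labels)  # test_labels is a hypothetical name for the held-out labels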
#!/usr/bin/env python from modshogun import StreamingVwFile from modshogun import StreamingVwCacheFile from modshogun import T_SVMLIGHT from modshogun import StreamingVwFeatures from modshogun import VowpalWabbit parameter_list=[['../data/fm_train_sparsereal.dat']] def streaming_vw_createcache_modular (fname): # First creates a binary cache from an ASCII data file # and then trains using the StreamingVwCacheFile as input. # Open the input file as a StreamingVwFile input_file = StreamingVwFile(fname) # Default file name will be vw_cache.dat.cache input_file.set_write_to_cache(True) # Tell VW that the file is in SVMLight format # Supported types are T_DENSE, T_SVMLIGHT and T_VW input_file.set_parser_type(T_SVMLIGHT) ## Create a StreamingVwFeatures object, `True' indicating the examples are labelled #features = StreamingVwFeatures(input_file, True, 1024) ## Create a VW object from the features #vw = VowpalWabbit(features) #vw.set_no_training(True) ## Train (in this case does nothing but run over all examples) #vw.train() ## Finally, train using the generated cache file ## Open the input cache file as a StreamingVwCacheFile #input_file = StreamingVwCacheFile("vw_cache.dat.cache"); ## The rest is exactly as for normal input #features = StreamingVwFeatures(input_file, True, 1024); #vw = VowpalWabbit(features) #vw.train() ##return vw if __name__ == "__main__": streaming_vw_createcache_modular(*parameter_list[0])
#!/usr/bin/env python from modshogun import StreamingVwFile from modshogun import T_SVMLIGHT from modshogun import StreamingVwFeatures from modshogun import VowpalWabbit parameter_list=[[None]] def streaming_vw_modular (dummy): """Runs the VW algorithm on a toy dataset in SVMLight format.""" # Open the input file as a StreamingVwFile input_file = StreamingVwFile("../data/fm_train_sparsereal.dat") # Tell VW that the file is in SVMLight format # Supported types are T_DENSE, T_SVMLIGHT and T_VW input_file.set_parser_type(T_SVMLIGHT) ## Create a StreamingVwFeatures object, `True' indicating the examples are labelled #features = StreamingVwFeatures(input_file, True, 1024) ## Create a VW object from the features #vw = VowpalWabbit(features) ## Train #vw.train() ##return vw if __name__ == "__main__": streaming_vw_modular(*parameter_list[0])
#!/usr/bin/env python import numpy import scipy from scipy import io data_dict = scipy.io.loadmat('../data/hmsvm_data_large_integer.mat', struct_as_record=False) parameter_list=[[data_dict]] def structure_discrete_hmsvm_bmrm (m_data_dict=data_dict): from modshogun import RealMatrixFeatures, SequenceLabels, HMSVMModel, Sequence, TwoStateModel from modshogun import StructuredAccuracy, SMT_TWO_STATE try: from modshogun import DualLibQPBMSOSVM except ImportError: print("DualLibQPBMSOSVM not available") exit(0) labels_array = m_data_dict['label'][0] idxs = numpy.nonzero(labels_array == -1) labels_array[idxs] = 0 labels = SequenceLabels(labels_array, 250, 500, 2) features = RealMatrixFeatures(m_data_dict['signal'].astype(float), 250, 500) num_obs = 4 # given by the data file used model = HMSVMModel(features, labels, SMT_TWO_STATE, num_obs) sosvm = DualLibQPBMSOSVM(model, labels, 5000.0) sosvm.train() #print sosvm.get_w() predicted = sosvm.apply(features) evaluator = StructuredAccuracy() acc = evaluator.evaluate(predicted, labels) #print('Accuracy = %.4f' % acc) if __name__ == '__main__': print("Discrete HMSVM BMRM") structure_discrete_hmsvm_bmrm(*parameter_list[0])
#!/usr/bin/env python import numpy import scipy from scipy import io data_dict = scipy.io.loadmat('../data/hmsvm_data_large_integer.mat', struct_as_record=False) parameter_list=[[data_dict]] def structure_discrete_hmsvm_mosek (m_data_dict=data_dict): from modshogun import RealMatrixFeatures, SequenceLabels, HMSVMModel, Sequence, TwoStateModel from modshogun import StructuredAccuracy, SMT_TWO_STATE try: from modshogun import PrimalMosekSOSVM except ImportError: print("Mosek not available") return labels_array = m_data_dict['label'][0] idxs = numpy.nonzero(labels_array == -1) labels_array[idxs] = 0 labels = SequenceLabels(labels_array, 250, 500, 2) features = RealMatrixFeatures(m_data_dict['signal'].astype(float), 250, 500) num_obs = 4 # given by the data file used model = HMSVMModel(features, labels, SMT_TWO_STATE, num_obs) sosvm = PrimalMosekSOSVM(model, labels) sosvm.train() #print(sosvm.get_w()) predicted = sosvm.apply() evaluator = StructuredAccuracy() acc = evaluator.evaluate(predicted, labels) #print('Accuracy = %.4f' % acc) if __name__ == '__main__': print("Discrete HMSVM Mosek") structure_discrete_hmsvm_mosek(*parameter_list[0])
# In this example we use the dynamic programming implementation with a # gene-finding specific model. The model and the training parameters # are stored in a file and are used to create a gene prediction on # some example sequence. #!/usr/bin/env python # -*- coding: utf-8 -*- parameter_list=[['../data/DynProg_example_py.pickle.gz']] from modshogun import * import numpy from numpy import array,Inf,float64,matrix,frompyfunc,zeros #from IPython.Shell import IPShellEmbed #ipshell = IPShellEmbed() import gzip import scipy from scipy.io import loadmat import pickle try: from StringIO import StringIO except ImportError: from io import BytesIO as StringIO def get_ver(ver_str): scipy_ver=[int(i) for i in ver_str.split('.')] # parse the given version string, not scipy.__version__ v=0 for i in range(len(scipy_ver)): v+=10**(len(scipy_ver)-i)*scipy_ver[i] return v if get_ver(scipy.__version__) >= get_ver('0.7.0'): renametable = { 'scipy.io.mio5': 'scipy.io.matlab.mio5', 'scipy.sparse.sparse' : 'scipy.sparse', } else: renametable = {} def mapname(name): if name in renametable: return renametable[name] return name # scipy compatibility class class mat_struct(object): pass def mapped_load_global(self): module = mapname(self.readline()[:-1]) name = mapname(self.readline()[:-1]) if name=='mat_struct': klass=mat_struct else: klass = self.find_class(module, name) self.append(klass) def loads(str): file = StringIO(str) unpickler = pickle.Unpickler(file) unpickler.dispatch[pickle.GLOBAL] = mapped_load_global return unpickler.load() def structure_dynprog_modular (fname): import sys # pickle files are not compatible between python2 and python3 if sys.version_info[0]>2: return data_dict = loads(gzip.GzipFile(fname).read()) #data_dict = loadmat('../data/DynProg_example_py.dat.mat', appendmat=False, struct_as_record=False) #print(data_dict) #print(len(data_dict['penalty_array'][0][0][0][0].limits[0])) num_plifs,num_limits = len(data_dict['penalty_array']),len(data_dict['penalty_array'][0].limits) pm = PlifMatrix() pm.create_plifs(num_plifs,num_limits) ids = numpy.array(list(range(num_plifs)),dtype=numpy.int32) min_values = numpy.array(list(range(num_plifs)),dtype=numpy.float64) max_values = numpy.array(list(range(num_plifs)),dtype=numpy.float64) all_use_cache = numpy.array(list(range(num_plifs)),dtype=numpy.bool) all_use_svm = numpy.array(list(range(num_plifs)),dtype=numpy.int32) all_limits = zeros((num_plifs,num_limits)) all_penalties = zeros((num_plifs,num_limits)) all_names = ['']*num_plifs all_transforms = ['']*num_plifs for plif_idx in range(num_plifs): ids[plif_idx] = data_dict['penalty_array'][plif_idx].id-1 min_values[plif_idx] = data_dict['penalty_array'][plif_idx].min_value max_values[plif_idx] = data_dict['penalty_array'][plif_idx].max_value all_use_cache[plif_idx] = data_dict['penalty_array'][plif_idx].use_cache all_use_svm[plif_idx] = data_dict['penalty_array'][plif_idx].use_svm all_limits[plif_idx] = data_dict['penalty_array'][plif_idx].limits all_penalties[plif_idx] = data_dict['penalty_array'][plif_idx].penalties all_names[plif_idx] = str(data_dict['penalty_array'][plif_idx].name) all_transforms[plif_idx] = str(data_dict['penalty_array'][plif_idx].transform) if all_transforms[plif_idx] == '[]': all_transforms[plif_idx] = 'linear' pm.set_plif_ids(ids) pm.set_plif_min_values(min_values) pm.set_plif_max_values(max_values) pm.set_plif_use_cache(all_use_cache) pm.set_plif_use_svm(all_use_svm) pm.set_plif_limits(all_limits) pm.set_plif_penalties(all_penalties) #pm.set_plif_names(all_names) #pm.set_plif_transform_type(all_transforms)
transition_ptrs = data_dict['model'].transition_pointers transition_ptrs = transition_ptrs[:,:,0:2] transition_ptrs = transition_ptrs.astype(numpy.float64) pm.compute_plif_matrix(transition_ptrs) # init_dyn_prog num_svms = 8 dyn = DynProg(num_svms) orf_info = data_dict['model'].orf_info orf_info = orf_info.astype(numpy.int32) num_states = orf_info.shape[0] dyn.set_num_states(num_states) block = data_dict['block'] seq_len = len(block.seq) seq = str(block.seq) gene_string = array([elem for elem in seq]) # precompute_content_svms pos = block.all_pos-1 pos = pos.astype(numpy.int32) snd_pos = pos dyn.set_pos(pos) dyn.set_gene_string(gene_string) dyn.create_word_string() dyn.precompute_stop_codons() dyn.init_content_svm_value_array(num_svms) dict_weights = data_dict['content_weights'] dict_weights = dict_weights.reshape(8,1).astype(numpy.float64) dict_weights = zeros((8,5440)) dyn.set_dict_weights(dict_weights.T) dyn.precompute_content_values() dyn.init_mod_words_array(data_dict['model'].mod_words.astype(numpy.int32)) pm.compute_signal_plifs(data_dict['state_signals'].astype(numpy.int32)) dyn.set_orf_info(orf_info) # p = data_dict['model'].p q = data_dict['model'].q dyn.set_p_vector(p) dyn.set_q_vector(q) a_trans = data_dict['a_trans'] a_trans = a_trans.astype(float64) dyn.set_a_trans_matrix(a_trans) dyn.check_svm_arrays() features = data_dict['block'].features dyn.set_observation_matrix(features) dyn.set_content_type_array(data_dict['seg_path'].astype(numpy.float64)) dyn.best_path_set_segment_loss(data_dict['loss'].astype(numpy.float64)) use_orf = True feat_dims = [25,201,2] dyn.set_plif_matrices(pm); #dyn.compute_nbest_paths(features.shape[2], use_orf, 1,True,False) ## fetch results #states = dyn.get_states() ##print(states) #scores = dyn.get_scores() ##print(scores) #positions = dyn.get_positions() ##print(positions) #return states, scores, positions if __name__ == '__main__': print("Structure") structure_dynprog_modular(*parameter_list[0])
#!/usr/bin/env python import numpy as np from modshogun import TableFactorType # create the factor type with GT parameters tid = 0 cards = np.array([2,2], np.int32) w_gt = np.array([0.3,0.5,1.0,0.2,0.05,0.6,-0.2,0.75]) fac_type = TableFactorType(tid, cards, w_gt) tid_u = 1 cards_u = np.array([2], np.int32) w_gt_u = np.array([0.5,0.8,1.0,-0.3]) fac_type_u = TableFactorType(tid_u, cards_u, w_gt_u) tid_b = 2 cards_b = np.array([2], np.int32) w_gt_b = np.array([0.8, -0.8]) fac_type_b = TableFactorType(tid_b, cards_b, w_gt_b) def gen_data(ftype, num_samples, show_data = False): from modshogun import Math from modshogun import FactorType, Factor, TableFactorType, FactorGraph from modshogun import FactorGraphObservation, FactorGraphLabels, FactorGraphFeatures from modshogun import MAPInference, TREE_MAX_PROD Math.init_random(17) samples = FactorGraphFeatures(num_samples) labels = FactorGraphLabels(num_samples) for i in range(num_samples): vc = np.array([2,2,2], np.int32) fg = FactorGraph(vc) data1 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)]) vind1 = np.array([0,1], np.int32) fac1 = Factor(ftype[0], vind1, data1) fg.add_factor(fac1) data2 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)]) vind2 = np.array([1,2], np.int32) fac2 = Factor(ftype[0], vind2, data2) fg.add_factor(fac2) data3 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)]) vind3 = np.array([0], np.int32) fac3 = Factor(ftype[1], vind3, data3) fg.add_factor(fac3) data4 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)]) vind4 = np.array([1], np.int32) fac4 = Factor(ftype[1], vind4, data4) fg.add_factor(fac4) data5 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in range(2)]) vind5 = np.array([2], np.int32) fac5 = Factor(ftype[1], vind5, data5) fg.add_factor(fac5) data6 = np.array([1.0]) vind6 = np.array([0], np.int32) fac6 = Factor(ftype[2], vind6, data6) fg.add_factor(fac6) data7 = np.array([1.0]) vind7 = np.array([2], np.int32) fac7 = Factor(ftype[2], vind7, data7) fg.add_factor(fac7) samples.add_sample(fg) fg.connect_components() fg.compute_energies() infer_met = MAPInference(fg, TREE_MAX_PROD) infer_met.inference() fg_obs = infer_met.get_structured_outputs() labels.add_label(fg_obs) if show_data: state = fg_obs.get_data() print(state) return samples, labels w_all = [w_gt,w_gt_u,w_gt_b] ftype_all = [fac_type,fac_type_u,fac_type_b] num_samples = 10 samples, labels = gen_data(ftype_all, num_samples) parameter_list = [[samples,labels,w_all,ftype_all]] def structure_factor_graph_model(tr_samples = samples, tr_labels = labels, w = w_all, ftype = ftype_all): from modshogun import SOSVMHelper, LabelsFactory from modshogun import FactorGraphModel, MAPInference, TREE_MAX_PROD from modshogun import StochasticSOSVM, FWSOSVM try: from modshogun import DualLibQPBMSOSVM except ImportError: print("DualLibQPBMSOSVM not available") exit(0) # create model model = FactorGraphModel(tr_samples, tr_labels, TREE_MAX_PROD, False) w_truth = [w[0].copy(), w[1].copy(), w[2].copy()] w[0] = np.zeros(8) w[1] = np.zeros(4) w[2] = np.zeros(2) ftype[0].set_w(w[0]) ftype[1].set_w(w[1]) ftype[2].set_w(w[2]) model.add_factor_type(ftype[0]) model.add_factor_type(ftype[1]) model.add_factor_type(ftype[2]) # --- training with BMRM --- bmrm = DualLibQPBMSOSVM(model, tr_labels, 0.01) #bmrm.set_verbose(True) bmrm.train() #print 'learned weights:' #print bmrm.get_w() #print 'ground truth weights:' #print w_truth # evaluation lbs_bmrm = bmrm.apply() acc_loss = 0.0 ave_loss = 0.0 for i in range(num_samples): y_pred = 
lbs_bmrm.get_label(i) y_truth = tr_labels.get_label(i) acc_loss = acc_loss + model.delta_loss(y_truth, y_pred) ave_loss = acc_loss / num_samples #print('BMRM: Average training error is %.4f' % ave_loss) # show primal objs and dual objs #hbm = bmrm.get_helper() #print hbm.get_primal_values() #print hbm.get_eff_passes() #print hbm.get_train_errors() # --- training with SGD --- sgd = StochasticSOSVM(model, tr_labels) #sgd.set_verbose(True) sgd.set_lambda(0.01) sgd.train() # evaluation #print('SGD: Average training error is %.4f' % SOSVMHelper.average_loss(sgd.get_w(), model)) #hp = sgd.get_helper() #print hp.get_primal_values() #print hp.get_eff_passes() #print hp.get_train_errors() # --- training with FW --- fw = FWSOSVM(model, tr_labels) #fw.set_verbose(True) fw.set_lambda(0.01) fw.set_gap_threshold(0.01) fw.train() # evaluation #print('FW: Average training error is %.4f' % SOSVMHelper.average_loss(fw.get_w(), model)) #hp = fw.get_helper() #print hp.get_primal_values() #print hp.get_dual_values() #print hp.get_eff_passes() #print hp.get_train_errors() if __name__ == '__main__': print("Factor Graph Model") structure_factor_graph_model(*parameter_list[0])
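# A short comparison sketch, reusing the SOSVMHelper.average_loss call that
# already appears in the commented lines above (bmrm, sgd, fw and model refer
# to the objects created inside the function, so this belongs at its end):
#for name, machine in [('BMRM', bmrm), ('SGD', sgd), ('FW', fw)]:
#    print('%s: average training error %.4f' % (name, SOSVMHelper.average_loss(machine.get_w(), model)))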
#!/usr/bin/env python import numpy as np import itertools from modshogun import Factor, TableFactorType, FactorGraph from modshogun import FactorGraphObservation, FactorGraphLabels, FactorGraphFeatures from modshogun import FactorGraphModel, GRAPH_CUT from modshogun import GraphCut from modshogun import StochasticSOSVM def generate_data(num_train_samples, len_label, len_feat): """ Generate synthetic dataset Generate random data following [1]: Each example has exactly one label on. Each label has 40 related binary features. For an example, if label i is on, 4*(i+1) randomly chosen features are set to 1 [1] Finley, Thomas, and Thorsten Joachims. "Training structural SVMs when exact inference is intractable." Proceedings of the 25th international conference on Machine learning. ACM, 2008. Args: num_train_samples: number of samples len_label: label length (10) len_feat: feature length (40) Returns: feats: generated feature matrix labels: generated label matrix """ labels = np.zeros((num_train_samples, len_label), np.int32) feats = np.zeros((num_train_samples, len_feat), np.int32) for k in range(num_train_samples): i = k % len_label labels[k, i] = 1 inds_one = np.random.permutation(range(len_feat)) inds_one = inds_one[:4*(i+1)] for j in inds_one: feats[k, j] = 1 return (labels, feats) def define_factor_types(num_vars, len_feat, edge_table): """ Define factor types Args: num_vars: number of variables in factor graph len_feat: length of the feature vector edge_table: edge table that defines pair-wise node indices Returns: v_factor_types: list of all unary and pair-wise factor types """ n_stats = 2 # for binary status v_factor_types = {} n_edges = edge_table.shape[0] # unary factors cards_u = np.array([n_stats], np.int32) w_u = np.zeros(n_stats*len_feat) for i in range(num_vars): v_factor_types[i] = TableFactorType(i, cards_u, w_u) # pair-wise factors cards_pw = np.array([n_stats, n_stats], np.int32) w_pw = np.zeros(n_stats*n_stats) for j in range(n_edges): v_factor_types[j + num_vars] = TableFactorType(j + num_vars, cards_pw, w_pw) return v_factor_types def build_factor_graph_model(labels, feats, factor_types, edge_table, infer_alg = GRAPH_CUT): """ Build factor graph model Args: labels: matrix of labels [num_train_samples*len_label] feats: matrix of features [num_train_samples*len_feat] factor_types: vectors of all factor types edge_table: matrix of pairwise edges, each row is a pair of node indices infer_alg: inference algorithm (GRAPH_CUT) Returns: labels_fg: matrix of labels in factor graph format feats_fg: matrix of features in factor graph format """ labels = labels.astype(np.int32) num_train_samples = labels.shape[0] num_vars = labels.shape[1] num_edges = edge_table.shape[0] n_stats = 2 feats_fg = FactorGraphFeatures(num_train_samples) labels_fg = FactorGraphLabels(num_train_samples) for i in range(num_train_samples): cardinalities = np.array([n_stats]*num_vars, np.int32) fg = FactorGraph(cardinalities) # add unary factors for u in range(num_vars): data_u = np.array(feats[i,:], np.float64) inds_u = np.array([u], np.int32) factor_u = Factor(factor_types[u], inds_u, data_u) fg.add_factor(factor_u) # add pairwise factors for v in range(num_edges): data_p = np.array([1.0]) inds_p = np.array(edge_table[v, :], np.int32) factor_p = Factor(factor_types[v + num_vars], inds_p, data_p) fg.add_factor(factor_p) # add factor graph feats_fg.add_sample(fg) # add corresponding label loss_weights = np.array([1.0/num_vars]*num_vars) fg_obs = FactorGraphObservation(labels[i,:], loss_weights) labels_fg.add_label(fg_obs)
return (labels_fg, feats_fg) def evaluation(labels_pr, labels_gt, model): """ Evaluation Args: labels_pr: predicted label labels_gt: ground truth label model: factor graph model Returns: ave_loss: average loss """ num_train_samples = labels_pr.get_num_labels() acc_loss = 0.0 ave_loss = 0.0 for i in range(num_train_samples): y_pred = labels_pr.get_label(i) y_truth = labels_gt.get_label(i) acc_loss = acc_loss + model.delta_loss(y_truth, y_pred) ave_loss = acc_loss / num_train_samples return ave_loss def graphcuts_sosvm(num_train_samples = 10, len_label = 5, len_feat = 20, num_test_samples = 5): """ Graph cuts as approximate inference in structured output SVM framework. Args: num_train_samples: number of training samples len_label: number of classes, i.e., size of label space len_feat: the dimension of the feature vector num_test_samples: number of testing samples """ import time # generate synthetic dataset (labels_train, feats_train) = generate_data(num_train_samples, len_label, len_feat) # compute fully-connected edge table full = np.vstack([x for x in itertools.combinations(range(len_label), 2)]) # define factor types factor_types = define_factor_types(len_label, len_feat, full) # create features and labels for the factor graph model (labels_fg, feats_fg) = build_factor_graph_model(labels_train, feats_train, factor_types, full, GRAPH_CUT) # create model and register factor types model = FactorGraphModel(feats_fg, labels_fg, GRAPH_CUT) for i in range(len(factor_types)): model.add_factor_type(factor_types[i]) # Training # the 3rd parameter is do_weighted_averaging; turning this on # may achieve a faster convergence rate # the 4th parameter controls the output of verbose training information sgd = StochasticSOSVM(model, labels_fg, True, True) sgd.set_num_iter(150) sgd.set_lambda(0.0001) # train t0 = time.time() sgd.train() t1 = time.time() w_sgd = sgd.get_w() #print "SGD took", t1 - t0, "seconds." # training error labels_pr = sgd.apply() ave_loss = evaluation(labels_pr, labels_fg, model) #print('SGD: Average training error is %.4f' % ave_loss) # testing error # generate synthetic testing dataset (labels_test, feats_test) = generate_data(num_test_samples, len_label, len_feat) # create features and labels for the factor graph model (labels_fg_test, feats_fg_test) = build_factor_graph_model(labels_test, feats_test, factor_types, full, GRAPH_CUT) # set features and labels to sgd sgd.set_features(feats_fg_test) sgd.set_labels(labels_fg_test) # test labels_pr = sgd.apply() ave_loss = evaluation(labels_pr, labels_fg_test, model) #print('SGD: Average testing error is %.4f' % ave_loss) def graphcuts_general(): """ Graph cuts for general s-t graph optimization. """ num_nodes = 5 num_edges = 6 g = GraphCut(num_nodes, num_edges) # add terminal-connected edges # i.e., SOURCE->node_i and node_i->SINK g.add_tweights(0, 4, 0) g.add_tweights(1, 2, 0) g.add_tweights(2, 8, 0) g.add_tweights(2, 0, 4) g.add_tweights(3, 0, 7) g.add_tweights(4, 0, 5) # add node to node edges g.add_edge(0, 2, 5, 0) g.add_edge(0, 3, 2, 0) g.add_edge(1, 2, 6, 0) g.add_edge(1, 4, 9, 0) g.add_edge(2, 3, 1, 0) g.add_edge(2, 4, 3, 0) # initialize max-flow algorithm g.init_maxflow() # compute max flow flow = g.compute_maxflow() #print("Flow = %f" % flow) # print assignment #for i in range(num_nodes): # print("\nNode %d = %d" % (i, g.get_assignment(i))) test_general = True test_sosvm = True parameter_list = [[test_general, test_sosvm]] def structure_graphcuts(test_general=True, test_sosvm=True): """ Test graph cuts.
Args: test_general: test graph cuts for general s-t graph optimization test_sosvm: test graph cuts for structured output svm """ if test_general: graphcuts_general() if test_sosvm: graphcuts_sosvm() if __name__ == '__main__': print("Graph cuts") structure_graphcuts(*parameter_list[0])
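# A standalone toy run of the GraphCut API used above (the graph and its
# capacities are illustrative; only calls already shown in the example appear):
from modshogun import GraphCut
g = GraphCut(2, 1)          # 2 nodes, 1 node-to-node edge
g.add_tweights(0, 3, 0)     # SOURCE->node0 with capacity 3
g.add_tweights(1, 0, 2)     # node1->SINK with capacity 2
g.add_edge(0, 1, 5, 0)      # node0->node1 with capacity 5, reverse capacity 0
g.init_maxflow()
flow = g.compute_maxflow()  # the bottleneck is node1->SINK, so flow == 2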
#!/usr/bin/env python """ This example shows how to use HierarchicalMultilabelModel for hierarchical multi-label classification. The data used: [1] Image CLEF 2007 competition for annotation of X-Ray images. http://kt.ijs.si/DragiKocev/PhD/resources/doku.php?id=hmc_classification#imageclef07d """ from modshogun import MultilabelSOLabels, HierarchicalMultilabelModel from modshogun import RealFeatures from modshogun import StochasticSOSVM from modshogun import StructuredAccuracy, LabelsFactory import numpy as np import time train_file_name = '../../../data/multilabel/image_clef_train.arff' test_file_name = '../../../data/multilabel/image_clef_test.arff' parameter_list = [[train_file_name, test_file_name]] def get_taxonomy(labels): """ Converting the labels to a shogun-compatible format (i.e. 0, 1, ... num_classes - 1) and getting the taxonomy of the labels """ labels = labels.split(',') num_labels = len(labels) # taking the root label into consideration num_labels += 1 shogun_labels = dict() taxonomy = np.zeros(num_labels, dtype=np.int32) # considering the root_label node index to be 0 taxonomy[0] = -1 for i, label in enumerate(labels): shogun_labels[label] = i + 1 try: parent_label = label[:-2] parent_idx = labels.index(parent_label) + 1 taxonomy[i + 1] = parent_idx except ValueError: taxonomy[i + 1] = 0 return shogun_labels, taxonomy def get_data_sample(data_sample, shogun_labels): """ Extracting features and labels from a single row of data """ data = data_sample.split(',') features = np.array(data[:-1], dtype=np.float64) labs = data[-1].split('@') # adding the root label labels = np.zeros(len(labs) + 1, dtype=np.int32) labels[0] = 0 for i, label in enumerate(labs): labels[i + 1] = shogun_labels[label] labels.sort() return features, labels def get_data(data, shogun_labels): """ Creating features and labels from the data samples """ num_samples = len(data) # considering the root label num_classes = len(shogun_labels) + 1 labels = MultilabelSOLabels(num_samples, num_classes) for i, data_sample in enumerate(data): feats, labs = get_data_sample(data_sample, shogun_labels) try: features = np.c_[features, feats] except NameError: features = feats labels.set_sparse_label(i, labs) return RealFeatures(features), labels def get_features_labels(input_file): """ Creating features and labels from the input file (train/test file) """ train_file_lines = list(map(lambda x: x.strip(), input_file.readlines())) all_labels = list(filter(lambda x: 'hierarchical' in x.strip(), train_file_lines))[0].split()[-1] shogun_labels, taxonomy = get_taxonomy(all_labels) data_index = train_file_lines.index('@DATA') features, labels = get_data(train_file_lines[data_index + 1:], shogun_labels) return features, labels, taxonomy def structure_hierarchical_multilabel_classification(train_file_name, test_file_name): train_file = open(train_file_name) test_file = open(test_file_name) train_features, train_labels, train_taxonomy = get_features_labels( train_file) model = HierarchicalMultilabelModel(train_features, train_labels, train_taxonomy) sgd = StochasticSOSVM(model, train_labels) t1 = time.time() sgd.train() print('>>> Took %f time for training' % (time.time() - t1)) test_features, test_labels, test_taxonomy = get_features_labels(test_file) assert (train_taxonomy == test_taxonomy).all() evaluator = StructuredAccuracy() outlabel = LabelsFactory.to_structured(sgd.apply(test_features)) print('>>> Accuracy of classification = %f' % evaluator.evaluate( outlabel, test_labels)) if __name__ == '__main__': print('Hierarchical
Multilabel Classification') structure_hierarchical_multilabel_classification(*parameter_list[0])
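# A toy illustration of get_taxonomy from the example above (the label string
# here is hypothetical; the real labels come from the ARFF files): '1' and '2'
# hang off the implicit root, '1.1' and '1.2' off '1'.
#shogun_labels, taxonomy = get_taxonomy('1,1.1,1.2,2')
#print(taxonomy)  # [-1, 0, 1, 1, 0]: root has no parent, '1.1'/'1.2' point at '1'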
#!/usr/bin/env python import numpy as np def gen_data(num_classes,num_samples,dim): np.random.seed(0) covs = np.array([[[0., -1. ], [2.5, .7]], [[3., -1.5], [1.2, .3]], [[ 2, 0 ], [ .0, 1.5 ]]]) X = np.r_[np.dot(np.random.randn(num_samples, dim), covs[0]) + np.array([0, 10]), np.dot(np.random.randn(num_samples, dim), covs[1]) + np.array([-10, -10]), np.dot(np.random.randn(num_samples, dim), covs[2]) + np.array([10, -10])]; Y = np.hstack((np.zeros(num_samples), np.ones(num_samples), 2*np.ones(num_samples))) return X, Y # Number of classes M = 3 # Number of samples of each class N = 50 # Dimension of the data dim = 2 traindat, label_traindat = gen_data(M,N,dim) parameter_list = [[traindat,label_traindat]] def structure_multiclass_bmrm(fm_train_real=traindat,label_train_multiclass=label_traindat): from modshogun import MulticlassSOLabels, LabelsFactory from modshogun import RealFeatures from modshogun import SOSVMHelper try: from modshogun import BMRM, PPBMRM, P3BMRM, DualLibQPBMSOSVM except ImportError: print("At least one of BMRM, PPBMRM, P3BMRM, DualLibQPBMSOSVM not available") exit(0) from modshogun import MulticlassModel, RealNumber labels = MulticlassSOLabels(label_train_multiclass) features = RealFeatures(fm_train_real.T) model = MulticlassModel(features, labels) sosvm = DualLibQPBMSOSVM(model, labels, 1.0) # BMRM sosvm.set_solver(BMRM) sosvm.set_verbose(True) sosvm.train() bmrm_out = LabelsFactory.to_multiclass_structured(sosvm.apply()) count = 0 for i in range(bmrm_out.get_num_labels()): yi_pred = RealNumber.obtain_from_generic(bmrm_out.get_label(i)) if yi_pred.value == label_train_multiclass[i]: count = count + 1 #print("BMRM: Correct classification rate: %0.2f" % ( 100.0*count/bmrm_out.get_num_labels() )) #hp = sosvm.get_helper() #print hp.get_primal_values() #print hp.get_train_errors() # PPBMRM w = np.zeros(model.get_dim()) sosvm.set_w(w) sosvm.set_solver(PPBMRM) sosvm.set_verbose(True) sosvm.train() ppbmrm_out = LabelsFactory.to_multiclass_structured(sosvm.apply()) count = 0 for i in range(ppbmrm_out.get_num_labels()): yi_pred = RealNumber.obtain_from_generic(ppbmrm_out.get_label(i)) if yi_pred.value == label_train_multiclass[i]: count = count + 1 #print("PPBMRM: Correct classification rate: %0.2f" % ( 100.0*count/ppbmrm_out.get_num_labels() )) # P3BMRM w = np.zeros(model.get_dim()) sosvm.set_w(w) sosvm.set_solver(P3BMRM) sosvm.set_verbose(True) sosvm.train() p3bmrm_out = LabelsFactory.to_multiclass_structured(sosvm.apply()) count = 0 for i in range(p3bmrm_out.get_num_labels()): yi_pred = RealNumber.obtain_from_generic(p3bmrm_out.get_label(i)) if yi_pred.value == label_train_multiclass[i]: count = count + 1 #print("P3BMRM: Correct classification rate: %0.2f" % ( 100.0*count/p3bmrm_out.get_num_labels() )) return bmrm_out, ppbmrm_out, p3bmrm_out if __name__=='__main__': print('SO multiclass model with bundle methods') a,b,c=structure_multiclass_bmrm(*parameter_list[0])
#!/usr/bin/env python parameter_list=[[50, 125, 10, 2]] def structure_plif_hmsvm_bmrm (num_examples, example_length, num_features, num_noise_features): from modshogun import RealMatrixFeatures, TwoStateModel, StructuredAccuracy try: from modshogun import DualLibQPBMSOSVM except ImportError: print("DualLibQPBMSOSVM not available") exit(0) model = TwoStateModel.simulate_data(num_examples, example_length, num_features, num_noise_features) sosvm = DualLibQPBMSOSVM(model, model.get_labels(), 5000.0) sosvm.set_store_train_info(False) sosvm.train() #print sosvm.get_w() predicted = sosvm.apply(model.get_features()) evaluator = StructuredAccuracy() acc = evaluator.evaluate(predicted, model.get_labels()) #print('Accuracy = %.4f' % acc) if __name__ == '__main__': print("PLiF HMSVM BMRM") structure_plif_hmsvm_bmrm(*parameter_list[0])
#!/usr/bin/env python parameter_list=[[100, 250, 10, 2]] def structure_plif_hmsvm_mosek (num_examples, example_length, num_features, num_noise_features): from modshogun import RealMatrixFeatures, TwoStateModel, StructuredAccuracy try: from modshogun import PrimalMosekSOSVM except ImportError: print("Mosek not available") return model = TwoStateModel.simulate_data(num_examples, example_length, num_features, num_noise_features) sosvm = PrimalMosekSOSVM(model, model.get_labels()) sosvm.train() #print(sosvm.get_w()) predicted = sosvm.apply(model.get_features()) evaluator = StructuredAccuracy() acc = evaluator.evaluate(predicted, model.get_labels()) #print('Accuracy = %.4f' % acc) if __name__ == '__main__': print("PLiF HMSVM Mosek") structure_plif_hmsvm_mosek(*parameter_list[0])
#!/usr/bin/env python parameter_list=[[10,7,0,False]] def tests_check_commwordkernel_memleak_modular (num, order, gap, reverse): import gc from modshogun import Alphabet,StringCharFeatures,StringWordFeatures,DNA from modshogun import SortWordString, MSG_DEBUG from modshogun import CommWordStringKernel, IdentityKernelNormalizer from numpy import mat POS=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT'] NEG=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', 
num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT'] for i in range(10): alpha=Alphabet(DNA) traindat=StringCharFeatures(alpha) traindat.set_features(POS+NEG) trainudat=StringWordFeatures(traindat.get_alphabet()); trainudat.obtain_from_char(traindat, order-1, order, gap, reverse) #trainudat.io.set_loglevel(MSG_DEBUG) pre = SortWordString() #pre.io.set_loglevel(MSG_DEBUG) pre.init(trainudat) trainudat.add_preprocessor(pre) trainudat.apply_preprocessor() spec = CommWordStringKernel(10, False) spec.set_normalizer(IdentityKernelNormalizer()) spec.init(trainudat, trainudat) K=spec.get_kernel_matrix() del POS del NEG del order del gap del reverse return K if __name__=='__main__': print('Leak Check Comm Word Kernel') tests_check_commwordkernel_memleak_modular(*parameter_list[0])
#!/usr/bin/env python from numpy import array,hstack,sin,cos from numpy.random import seed, rand from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') label_traindat = lm.load_labels('../data/label_train_twoclass.dat') parameter_list = [[traindat,testdat,label_traindat]] def transfer_multitask_clustered_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat): from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG try: from modshogun import MultitaskClusteredLogisticRegression except ImportError: print("MultitaskClusteredLogisticRegression not available") exit() features = RealFeatures(hstack((fm_train,sin(fm_train),cos(fm_train)))) labels = BinaryLabels(hstack((label_train,label_train,label_train))) n_vectors = features.get_num_vectors() task_one = Task(0,n_vectors//3) task_two = Task(n_vectors//3,2*n_vectors//3) task_three = Task(2*n_vectors//3,n_vectors) task_group = TaskGroup() task_group.append_task(task_one) task_group.append_task(task_two) task_group.append_task(task_three) mtlr = MultitaskClusteredLogisticRegression(1.0,100.0,features,labels,task_group,2) #mtlr.io.set_loglevel(MSG_DEBUG) mtlr.set_tolerance(1e-3) # use 1e-3 tolerance mtlr.set_max_iter(100) mtlr.train() mtlr.set_current_task(0) #print mtlr.get_w() out = mtlr.apply_regression().get_labels() return out if __name__=='__main__': print('TransferMultitaskClusteredLogisticRegression') transfer_multitask_clustered_logistic_regression(*parameter_list[0])
#!/usr/bin/env python from numpy import array,hstack from numpy.random import seed, rand from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') label_traindat = lm.load_labels('../data/label_train_twoclass.dat') parameter_list = [[traindat,testdat,label_traindat]] def transfer_multitask_l12_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat): from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup try: from modshogun import MultitaskL12LogisticRegression except ImportError: print("MultitaskL12LogisticRegression not available") exit(0) features = RealFeatures(hstack((traindat,traindat))) labels = BinaryLabels(hstack((label_train,label_train))) n_vectors = features.get_num_vectors() task_one = Task(0,n_vectors//2) task_two = Task(n_vectors//2,n_vectors) task_group = TaskGroup() task_group.append_task(task_one) task_group.append_task(task_two) mtlr = MultitaskL12LogisticRegression(0.1,0.1,features,labels,task_group) mtlr.set_tolerance(1e-2) # use 1e-2 tolerance mtlr.set_max_iter(10) mtlr.train() mtlr.set_current_task(0) out = mtlr.apply_regression().get_labels() return out if __name__=='__main__': print('TransferMultitaskL12LogisticRegression') transfer_multitask_l12_logistic_regression(*parameter_list[0])
#!/usr/bin/env python from numpy import array from numpy.random import seed, rand from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') label_traindat = lm.load_labels('../data/label_train_twoclass.dat') parameter_list = [[traindat,testdat,label_traindat]] def transfer_multitask_leastsquares_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat): from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup try: from modshogun import MultitaskLeastSquaresRegression except ImportError: print("MultitaskLeastSquaresRegression not available") exit(0) features = RealFeatures(traindat) labels = RegressionLabels(label_train) n_vectors = features.get_num_vectors() task_one = Task(0,n_vectors//2) task_two = Task(n_vectors//2,n_vectors) task_group = TaskGroup() task_group.append_task(task_one) task_group.append_task(task_two) mtlsr = MultitaskLeastSquaresRegression(0.1,features,labels,task_group) mtlsr.set_regularization(1) # use regularization ratio mtlsr.set_tolerance(1e-2) # use 1e-2 tolerance mtlsr.train() mtlsr.set_current_task(0) out = mtlsr.apply_regression().get_labels() return out if __name__=='__main__': print('TransferMultitaskLeastSquaresRegression') transfer_multitask_leastsquares_regression(*parameter_list[0])
#!/usr/bin/env python from numpy import array,hstack from numpy.random import seed, rand from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') label_traindat = lm.load_labels('../data/label_train_twoclass.dat') parameter_list = [[traindat,testdat,label_traindat]] def transfer_multitask_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat): from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup try: from modshogun import MultitaskLogisticRegression except ImportError: print("MultitaskLogisticRegression not available") exit() features = RealFeatures(hstack((traindat,traindat))) labels = BinaryLabels(hstack((label_train,label_train))) n_vectors = features.get_num_vectors() task_one = Task(0,n_vectors//2) task_two = Task(n_vectors//2,n_vectors) task_group = TaskGroup() task_group.append_task(task_one) task_group.append_task(task_two) mtlr = MultitaskLogisticRegression(0.1,features,labels,task_group) mtlr.set_regularization(1) # use regularization ratio mtlr.set_tolerance(1e-2) # use 1e-2 tolerance mtlr.train() mtlr.set_current_task(0) out = mtlr.apply().get_labels() return out if __name__=='__main__': print('TransferMultitaskLogisticRegression') transfer_multitask_logistic_regression(*parameter_list[0])
#!/usr/bin/env python from numpy import array,hstack from numpy.random import seed, rand from tools.load import LoadMatrix lm=LoadMatrix() traindat = lm.load_numbers('../data/fm_train_real.dat') testdat = lm.load_numbers('../data/fm_test_real.dat') label_traindat = lm.load_labels('../data/label_train_twoclass.dat') parameter_list = [[traindat,testdat,label_traindat]] def transfer_multitask_trace_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat): from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup try: from modshogun import MultitaskTraceLogisticRegression except ImportError: print("MultitaskTraceLogisticRegression not available") exit(0) features = RealFeatures(hstack((traindat,traindat))) labels = BinaryLabels(hstack((label_train,label_train))) n_vectors = features.get_num_vectors() task_one = Task(0,n_vectors//2) task_two = Task(n_vectors//2,n_vectors) task_group = TaskGroup() task_group.append_task(task_one) task_group.append_task(task_two) mtlr = MultitaskTraceLogisticRegression(0.1,features,labels,task_group) mtlr.set_tolerance(1e-2) # use 1e-2 tolerance mtlr.set_max_iter(10) mtlr.train() mtlr.set_current_task(0) out = mtlr.apply_regression().get_labels() return out if __name__=='__main__': print('TransferMultitaskTraceLogisticRegression') transfer_multitask_trace_logistic_regression(*parameter_list[0])
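# A small sketch of the Task/TaskGroup pattern shared by the multitask examples
# above: each Task(start, end) marks a contiguous block of training-vector
# indices (the examples above use adjoining halves or thirds of the data), and
# the TaskGroup ties the tasks together. The indices here are illustrative only.
from modshogun import Task, TaskGroup
group = TaskGroup()
group.append_task(Task(0, 50))    # first block of vectors forms task 0
group.append_task(Task(50, 100))  # second block of vectors forms task 1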
#!/usr/bin/env python # # Copyright (c) The Shogun Machine Learning Toolbox # Written (w) 2014 Wu Lin # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are those # of the authors and should not be interpreted as representing official policies, # either expressed or implied, of the Shogun Development Team. # # path='../data' traindat = '%s/fm_train_real.dat'%path testdat = '%s/fm_test_real.dat'%path label_binary_traindat = '%s/label_train_twoclass.dat'%path try: from modshogun import GaussianProcessClassification except ImportError: print("GaussianProcessClassification is not available") exit(0) from modshogun import * parameter_list=[ [KLCholeskyInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0], [KLCovarianceInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0], [KLDiagonalInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0], [KLDualInferenceMethod,traindat,testdat,label_binary_traindat,0,0,1e-5,1e-2,0], [SingleLaplaceInferenceMethod,traindat,testdat,label_binary_traindat,0,0], ] def variational_classifier_modular(kl_inference,train_fname=traindat,test_fname=testdat, label_fname=label_binary_traindat,kernel_log_sigma=0,kernel_log_scale=0,noise_factor=1e-5, min_coeff_kernel=1e-2,max_attempt=0): from math import exp features_train=RealFeatures(CSVFile(train_fname)) labels_train=BinaryLabels(CSVFile(label_fname)) likelihood=LogitDVGLikelihood() error_eval=ErrorRateMeasure() mean_func=ConstMean() kernel_sigma=2*exp(2*kernel_log_sigma); kernel_func=GaussianKernel(10, kernel_sigma) inf=kl_inference(kernel_func, features_train, mean_func, labels_train, likelihood) try: inf.set_noise_factor(noise_factor) inf.set_min_coeff_kernel(min_coeff_kernel) inf.set_max_attempt(max_attempt) except: pass inf.set_scale(exp(kernel_log_scale)) gp=GaussianProcessClassification(inf) gp.train() pred_labels_train=gp.apply_binary(features_train) error_train=error_eval.evaluate(pred_labels_train, labels_train) #print "\nInference name:%s"%inf.get_name(), #print "marginal likelihood:%.10f"%inf.get_negative_log_marginal_likelihood(), #print "Training error %.4f"%error_train return pred_labels_train, gp, pred_labels_train.get_labels() if __name__=="__main__": 
print("variational_classifier") for parameter in parameter_list: variational_classifier_modular(*parameter)