SHOGUN
4.2.0
This page lists ready-to-run Shogun examples for the Static Python interface.
To run an example, issue

python name_of_example.py
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. The training algorithm is the Gradient Projection Decomposition
# Technique (GPDT), used with the SVM regularization parameter C=1.2, a
# Gaussian kernel of width 2.1, and 10MB of kernel cache.
#
# For more details on the GPDT solver see http://dm.unife.it/gpdt

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
train_label=lm.load_labels('../data/label_train_twoclass.dat')
parameter_list=[[traindat,testdat,train_label,10,2.1,1.2,1e-5,False],
    [traindat,testdat,train_label,10,2.1,1.3,1e-4,False]]

def classifier_gpbtsvm (fm_train_real=traindat,fm_test_real=testdat,
    label_train_twoclass=train_label, size_cache=10,
    width=2.1,C=1.2, epsilon=1e-5,use_bias=False):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('set_labels', 'TRAIN', label_train_twoclass)
    sg('new_classifier', 'GPBTSVM')
    sg('svm_epsilon', epsilon)
    sg('c', C)
    sg('svm_use_bias', use_bias)
    sg('train_classifier')
    sg('set_features', 'TEST', fm_test_real)
    result=sg('classify')
    return result

if __name__=='__main__':
    print('GPBTSVM')
    classifier_gpbtsvm(*parameter_list[0])
# This example shows the usage of the k-nearest neighbor (KNN) classification
# rule on a toy data set. The number of nearest neighbors is set to k=3 and
# the distances are measured by the Euclidean metric. Finally, the KNN rule is
# applied to predict labels of test examples.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
train_label=lm.load_labels('../data/label_train_multiclass.dat')
parameter_list=[[traindat,testdat,train_label,3],
    [traindat,testdat,train_label,4]]

def classifier_knn (fm_train_real=traindat,fm_test_real=testdat,
    label_train_multiclass=train_label,k=3):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_labels', 'TRAIN', label_train_multiclass)
    sg('set_distance', 'EUCLIDEAN', 'REAL')
    sg('new_classifier', 'KNN')
    sg('train_classifier', k)
    sg('set_features', 'TEST', fm_test_real)
    result=sg('classify')
    return result

if __name__=='__main__':
    print('KNN')
    classifier_knn(*parameter_list[0])
# In this example a linear two-class classifier is trained based on Linear
# Discriminant Analysis (LDA) from toy 2-dimensional examples. The trained
# LDA classifier is used to predict test examples. Note that the LDA classifier
# is optimal under the assumption that both classes are Gaussian distributed
# with equal covariance. For more details on LDA see e.g.
# http://en.wikipedia.org/wiki/Linear_discriminant_analysis

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
train_label=lm.load_labels('../data/label_train_twoclass.dat')
parameter_list=[[traindat,testdat,train_label],
    [traindat,testdat,train_label]]

def classifier_lda (fm_train_real=traindat,fm_test_real=testdat,
    label_train_twoclass=train_label):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_labels', 'TRAIN', label_train_twoclass)
    sg('new_classifier', 'LDA')
    sg('train_classifier')
    sg('set_features', 'TEST', fm_test_real)
    result=sg('classify')
    return result

if __name__=='__main__':
    print('LDA')
    classifier_lda(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. The training algorithm is LIBSVM, used with the SVM regularization
# parameter C=1.2, a Gaussian kernel of width 2.1, 10MB of kernel cache, and
# the precision parameter epsilon=1e-5.
#
# For more details on the LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
train_label=lm.load_labels('../data/label_train_twoclass.dat')
parameter_list=[[traindat,testdat,train_label,10,2.1,1.2,1e-5,False],
    [traindat,testdat,train_label,10,2.1,1.3,1e-4,False]]

def classifier_libsvm (fm_train_real=traindat,fm_test_real=testdat,
    label_train_twoclass=train_label, size_cache=10,
    width=2.1,C=1.2, epsilon=1e-5,use_bias=False):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('set_labels', 'TRAIN', label_train_twoclass)
    sg('new_classifier', 'LIBSVM')
    sg('svm_epsilon', epsilon)
    sg('c', C)
    sg('svm_use_bias', use_bias)
    sg('train_classifier')
    sg('set_features', 'TEST', fm_test_real)
    result=sg('classify')
    kernel_matrix = sg('get_kernel_matrix', 'TEST')
    return result, kernel_matrix

if __name__=='__main__':
    print('LibSVM')
    classifier_libsvm(*parameter_list[0])
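A small follow-up, building on the example above. The value returned by 'classify' appears to be a numpy array of real-valued SVM outputs, one per test example; under that assumption, thresholding at zero yields the predicted two-class labels. A minimal sketch:

from numpy import sign

# run the LibSVM example above and threshold its real-valued outputs
result, kernel_matrix = classifier_libsvm(*parameter_list[0])
predicted_labels = sign(result)  # +1/-1 predictions (0 only for exact zeros)
print(predicted_labels)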
# In this example a one-class support vector machine classifier is trained on a
# toy data set. The training algorithm finds a hyperplane in the RKHS which
# separates the training data from the origin. The one-class classifier is
# typically used to estimate the support of a high-dimensional distribution.
# For more details see e.g.
# B. Schoelkopf et al. Estimating the support of a high-dimensional
# distribution. Neural Computation, 13, 2001, 1443-1471.
#
# In the example, the one-class SVM is trained by the LIBSVM solver with the
# regularization parameter C=10., a Gaussian kernel of width 2.1, the
# precision parameter epsilon=1e-5, and 10MB of kernel cache.
#
# For more details on the LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/ .

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,10,2.1,10.,1e-5,False],
    [traindat,testdat,10,2.1,11.,1e-4,False]]

def classifier_libsvm_oneclass (fm_train_real=traindat,fm_test_real=testdat,
    size_cache=10, width=2.1,C=10., epsilon=1e-5,use_bias=False):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('new_classifier', 'LIBSVM_ONECLASS')
    sg('svm_epsilon', epsilon)
    sg('c', C)
    sg('svm_use_bias', use_bias)
    sg('train_classifier')
    sg('set_features', 'TEST', fm_test_real)
    result=sg('classify')
    kernel_matrix = sg('get_kernel_matrix', 'TEST')
    return result, kernel_matrix

if __name__=='__main__':
    print('LibSVMOneClass')
    classifier_libsvm_oneclass(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Minimal Primal Dual SVM is used with SVM
# regularization parameter C=1.2, a Gaussian kernel of width 2.1, 10MB of
# kernel cache, and the precision parameter epsilon=1e-5.
#
# For more details on the MPD solver see
# Kienzle, W. and B. Schölkopf: Training Support Vector Machines with Multiple
# Equality Constraints. Machine Learning: ECML 2005, 182-193. (Eds.) Carbonell,
# J. G., J. Siekmann, Springer, Berlin, Germany (11 2005)

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
train_label=lm.load_labels('../data/label_train_twoclass.dat')
parameter_list=[[traindat,testdat,train_label,10,2.1,1.2,1e-5,False],
    [traindat,testdat,train_label,10,2.1,1.3,1e-4,False]]

def classifier_mpdsvm (fm_train_real=traindat,fm_test_real=testdat,
    label_train_twoclass=train_label, size_cache=10,
    width=2.1,C=1.2, epsilon=1e-5,use_bias=False):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('set_labels', 'TRAIN', label_train_twoclass)
    sg('new_classifier', 'MPDSVM')
    sg('svm_epsilon', epsilon)
    sg('c', C)
    sg('svm_use_bias', use_bias)
    sg('train_classifier')
    sg('set_features', 'TEST', fm_test_real)
    result=sg('classify')
    kernel_matrix = sg('get_kernel_matrix', 'TEST')
    return result, kernel_matrix

if __name__=='__main__':
    print('MPDSVM')
    classifier_mpdsvm(*parameter_list[0])
# This example shows how to use the Perceptron algorithm for training a
# two-class linear classifier, i.e. y = sign(<x,w>+b). The Perceptron algorithm
# works by iteratively passing through the training examples and applying the
# update rule to those examples which are misclassified by the current
# classifier. The Perceptron update rule reads
#
#   w(t+1) = w(t) + alpha * y_t * x_t
#   b(t+1) = b(t) + alpha * y_t
#
# where (x_t,y_t) is the feature vector and label (must be +1/-1) of the
# misclassified example,
#   (w(t),b(t)) are the current parameters of the linear classifier,
#   (w(t+1),b(t+1)) are the new parameters of the linear classifier, and
#   alpha is the learning rate.
#
# The Perceptron algorithm iterates until all training examples are correctly
# classified or the prescribed maximal number of iterations is reached.
#
# The learning rate and the maximal number of iterations can be set by
# sg('set_perceptron_parameters', alpha, max_iter)

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
train_label=lm.load_labels('../data/label_train_twoclass.dat')
parameter_list=[[traindat,testdat,train_label],
    [traindat,testdat,train_label]]

def classifier_perceptron (fm_train_real=traindat,fm_test_real=testdat,
    label_train_twoclass=train_label):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_labels', 'TRAIN', label_train_twoclass)
    sg('new_classifier', 'PERCEPTRON')
    # often does not converge, mind your data!
    sg('train_classifier')
    sg('set_features', 'TEST', fm_test_real)
    result=sg('classify')
    return result

if __name__=='__main__':
    print('Perceptron')
    classifier_perceptron(*parameter_list[0])
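The comment above mentions 'set_perceptron_parameters'. A minimal sketch of using it before training, with illustrative (not prescribed) values for the learning rate and the iteration cap:

from sg import sg

sg('set_features', 'TRAIN', traindat)
sg('set_labels', 'TRAIN', train_label)
sg('new_classifier', 'PERCEPTRON')
sg('set_perceptron_parameters', 1.6, 5000)  # alpha=1.6, max_iter=5000 (illustrative)
sg('train_classifier')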
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to
# predict labels on the test set. As training algorithm SVM^light is used with
# SVM regularization parameter C=1.2, the Weighted Degree kernel of degree 20,
# and the precision parameter epsilon=1e-5.
#
# For more details on SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel see
# G. Raetsch, S. Sonnenburg, and B. Schoelkopf. RASE: recognition of alternatively
# spliced exons in C. elegans. Bioinformatics, 21:369-377, June 2005.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
train_label=lm.load_labels('../data/label_train_dna.dat')
parameter_list=[[traindna,testdna,train_label,10,20,1.2,1e-5,False],
    [traindna,testdna,train_label,10,21,1.3,1e-4,False]]

def classifier_svmlight (fm_train_dna=traindna,fm_test_dna=testdna,
    label_train_dna=train_label, size_cache=10,
    degree=20,C=1.2, epsilon=1e-5,use_bias=False):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('set_kernel', 'WEIGHTEDDEGREE', 'CHAR', size_cache, degree)
    sg('set_labels', 'TRAIN', label_train_dna)
    try:
        sg('new_classifier', 'SVMLIGHT')
    except RuntimeError:
        return
    sg('svm_epsilon', epsilon)
    sg('c', C)
    sg('svm_use_bias', use_bias)
    sg('train_classifier')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    result=sg('classify')
    kernel_matrix = sg('get_kernel_matrix', 'TEST')
    return result, kernel_matrix

if __name__=='__main__':
    print('SVMLight')
    classifier_svmlight(*parameter_list[0])
# In this example an agglomerative hierarchical single linkage clustering
# method is used to cluster a given toy data set. Starting with each object
# assigned to its own cluster, clusters are iteratively merged. At each step,
# the two clusters whose closest elements have the minimum distance (measured
# here via the Euclidean distance object) are merged.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
parameter_list=[[traindat,10,3],[traindat,11,4]]

def clustering_hierarchical (fm_train=traindat, size_cache=10,merges=3):
    sg('set_features', 'TRAIN', fm_train)
    sg('set_distance', 'EUCLIDEAN', 'REAL')
    sg('new_clustering', 'HIERARCHICAL')
    sg('train_clustering', merges)
    [merge_distance, pairs]=sg('get_clustering')
    return [merge_distance, pairs]

if __name__=='__main__':
    print('Hierarchical')
    clustering_hierarchical(*parameter_list[0])
# In this example the k-means clustering method is used to cluster a given toy
# data set. In k-means clustering one tries to partition n observations into k
# clusters in which each observation belongs to the cluster with the nearest
# mean. The algorithm takes the number of clusters and a distance as input.
# The distance used in this example is the Euclidean distance. After training,
# one can fetch the result of clustering by obtaining the cluster centers and
# their radii.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
parameter_list=[[traindat,10,3,1000],[traindat,11,4,1500]]

def clustering_kmeans (fm_train=traindat, size_cache=10,k=3,iter=1000):
    sg('set_features', 'TRAIN', fm_train)
    sg('set_distance', 'EUCLIDEAN', 'REAL')
    sg('new_clustering', 'KMEANS')
    sg('train_clustering', k, iter)
    [radi, centers]=sg('get_clustering')
    return [radi, centers]

if __name__=='__main__':
    print('KMeans')
    clustering_kmeans(*parameter_list[0])
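A short sketch of putting the clustering result to use, assuming 'centers' holds one cluster center per column (matching the column-per-example convention used throughout these examples): assign each training point to its nearest center by Euclidean distance.

import numpy as np

radi, centers = clustering_kmeans(*parameter_list[0])
# pairwise distances: one row per training point, one column per center
dists = np.linalg.norm(traindat.T[:, None, :] - centers.T[None, :, :], axis=2)
assignments = dists.argmin(axis=1)  # index of the nearest center for each point
print(assignments)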
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'BRAYCURTIS'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two matrices is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see doc/classshogun_1_1CBrayCurtisDistance.html.
#
# Obviously, using the Bray-Curtis distance is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]

def distance_braycurtis (fm_train_real=traindat,fm_test_real=testdat):
    sg('set_distance', 'BRAYCURTIS', 'REAL')
    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('BrayCurtisDistance')
    distance_braycurtis(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'CANBERRA'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (dissimilarity ratio) matrix is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (dissimilarity
# ratio) matrix between these two data sets is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see doc/classshogun_1_1CCanberraMetric.html.
#
# Obviously, using the Canberra distance is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]

def distance_canberra (fm_train_real=traindat,fm_test_real=testdat):
    sg('set_distance', 'CANBERRA', 'REAL')
    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('CanberraMetric')
    distance_canberra(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored data sets in 'STRING' representation
# (feature type 'CHAR' with alphabet 'DNA') from different files and
# initializes the distance to 'CANBERRA' with feature type 'WORD'.
#
# Data points in this example are defined by the transformation function
# 'convert' and the preprocessing step applied afterwards (defined by
# 'add_preproc' and preprocessor 'SORTWORDSTRING').
#
# The target 'TRAIN' for 'set_features' controls the binding of the given
# data points. In order to compute a pairwise distance matrix by
# 'get_distance_matrix', we have to perform two preprocessing steps for
# input data 'TRAIN'. The method 'convert' transforms the input data to
# a string representation suitable for the selected distance. The individual
# strings are sorted in ascending order after the execution of
# 'attach_preproc'. A pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the binding of the given
# data points 'TRAIN' and 'TEST'. In order to compute a pairwise distance
# matrix between these two data sets by 'get_distance_matrix', we have to
# perform two preprocessing steps for input data 'TEST'. The method 'convert'
# transforms the input data 'TEST' to a string representation suitable for
# the selected distance. The individual strings are sorted in ascending order
# after the execution of 'attach_preproc'. A pairwise distance matrix between
# the data sets 'TRAIN' and 'TEST' is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see
# doc/classshogun_1_1CSortWordString.html,
# doc/classshogun_1_1CPreprocessor.html,
# doc/classshogun_1_1CStringFeatures.html (method obtain_from_char_features) and
# doc/classshogun_1_1CCanberraWordDistance.html.
#
# Obviously, using the Canberra word distance is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindna,testdna,3,0,'n'],[traindna,testdna,4,0,'n']]

def distance_canberraword (fm_train_dna=traindna,fm_test_dna=testdna,order=3,
    gap=0,reverse='n'):
    sg('set_distance', 'CANBERRA', 'WORD')
    sg('add_preproc', 'SORTWORDSTRING')
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TEST')
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('CanberraWordDistance')
    distance_canberraword(*parameter_list[0])
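A conceptual sketch of what the 'convert' step above does (illustrative only; Shogun's exact integer encoding may differ): each DNA string is mapped to the sequence of its overlapping order-3 substrings, here encoded base-4 as integers.

def dna_to_words(seq, order=3):
    # map each overlapping k-mer to an integer code (A=0, C=1, G=2, T=3)
    code = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    words = []
    for i in range(len(seq) - order + 1):
        w = 0
        for ch in seq[i:i + order]:
            w = 4 * w + code[ch]
        words.append(w)
    return words

print(dna_to_words('ACGTAC'))  # [6, 27, 44, 49] for 'ACG', 'CGT', 'GTA', 'TAC'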
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'CHEBYSHEW'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix (maximum of absolute feature
# dimension differences) is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix (maximum
# of absolute feature dimension differences) between these two data sets is
# computed.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see doc/classshogun_1_1CChebyshewMetric.html.
#
# Obviously, using the Chebyshev distance is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]

def distance_chebyshew (fm_train_real=traindat,fm_test_real=testdat):
    sg('set_distance', 'CHEBYSHEW', 'REAL')
    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('ChebyshewMetric')
    distance_chebyshew(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'CHISQUARE'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two matrices is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see doc/classshogun_1_1CChiSquareDistance.html.
#
# Obviously, using the ChiSquare distance is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]

def distance_chisquare (fm_train_real=traindat,fm_test_real=testdat):
    sg('set_distance', 'CHISQUARE', 'REAL')
    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('ChiSquareDistance')
    distance_chisquare(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'COSINE'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see doc/classshogun_1_1CCosineDistance.html.
#
# Obviously, using the Cosine distance is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]

def distance_cosine (fm_train_real=traindat,fm_test_real=testdat):
    sg('set_distance', 'COSINE', 'REAL')
    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('CosineDistance')
    distance_cosine(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'EUCLIDEAN'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see doc/classshogun_1_1CEuclidianDistance.html.
#
# Obviously, using the Euclidean distance is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]

def distance_euclidean (fm_train_real=traindat,fm_test_real=testdat):
    sg('set_distance', 'EUCLIDEAN', 'REAL')
    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('EuclideanDistance')
    distance_euclidean(*parameter_list[0])
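A quick sanity check, building on the example above: recompute a single entry of the 'TEST' distance matrix with numpy and compare it to Shogun's result (a sketch, assuming entry (0,0) pairs the first column of one matrix with the first column of the other).

import numpy as np

dm = distance_euclidean(traindat, testdat)
# Euclidean distance between the first train and first test point
manual = np.linalg.norm(traindat[:, 0] - testdat[:, 0])
print(dm[0, 0], manual)  # the two values should agree up to rounding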
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'GEODESIC'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (shortest path on a sphere) matrix is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (shortest path on
# a sphere) matrix between these two data sets is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see doc/classshogun_1_1CGeodesicMetric.html.
#
# Obviously, using the Geodesic distance is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]

def distance_geodesic (fm_train_real=traindat,fm_test_real=testdat):
    sg('set_distance', 'GEODESIC', 'REAL')
    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('GeodesicMetric')
    distance_geodesic(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored data sets in 'STRING' representation
# (feature type 'CHAR' with alphabet 'DNA') from different files and
# initializes the distance to 'HAMMING' with feature type 'WORD'.
#
# Data points in this example are defined by the transformation function
# 'convert' and the preprocessing step applied afterwards (defined by
# 'add_preproc' and preprocessor 'SORTWORDSTRING').
#
# The target 'TRAIN' for 'set_features' controls the binding of the given
# data points. In order to compute a pairwise distance matrix by
# 'get_distance_matrix', we have to perform two preprocessing steps for
# input data 'TRAIN'. The method 'convert' transforms the input data to
# a string representation suitable for the selected distance. The individual
# strings are sorted in ascending order after the execution of
# 'attach_preproc'. A pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the binding of the given
# data points 'TRAIN' and 'TEST'. In order to compute a pairwise distance
# matrix between these two data sets by 'get_distance_matrix', we have to
# perform two preprocessing steps for input data 'TEST'. The method 'convert'
# transforms the input data 'TEST' to a string representation suitable for
# the selected distance. The individual strings are sorted in ascending order
# after the execution of 'attach_preproc'. A pairwise distance matrix between
# the data sets 'TRAIN' and 'TEST' is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see
# doc/classshogun_1_1CSortWordString.html,
# doc/classshogun_1_1CPreprocessor.html,
# doc/classshogun_1_1CStringFeatures.html (method obtain_from_char_features) and
# doc/classshogun_1_1CHammingWordDistance.html.
#
# Obviously, using the Hamming word distance is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindna,testdna,3,0,'n'],[traindna,testdna,4,0,'n']]

def distance_hammingword (fm_train_dna=traindna,fm_test_dna=testdna,order=3,
    gap=0,reverse='n'):
    sg('set_distance', 'HAMMING', 'WORD')
    sg('add_preproc', 'SORTWORDSTRING')
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TEST')
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('HammingWordDistance')
    distance_hammingword(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'JENSEN'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (divergence measure based on the
# Kullback-Leibler divergence) matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (divergence
# measure based on the Kullback-Leibler divergence) matrix between these two
# data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see doc/classshogun_1_1CJensenMetric.html.
#
# Obviously, using the Jensen-Shannon distance/divergence is not limited to
# this showcase example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]

def distance_jensen (fm_train_real=traindat,fm_test_real=testdat):
    sg('set_distance', 'JENSEN', 'REAL')
    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('JensenMetric')
    distance_jensen(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'MANHATTAN'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (sum of absolute feature
# dimension differences) matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (sum of absolute
# feature dimension differences) matrix between these two data sets is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see doc/classshogun_1_1CManhattanMetric.html.
#
# Obviously, using the Manhattan distance is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]

def distance_manhatten (fm_train_real=traindat,fm_test_real=testdat):
    sg('set_distance', 'MANHATTAN', 'REAL')
    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('ManhattanMetric')
    distance_manhatten(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored data sets in 'STRING' representation
# (feature type 'CHAR' with alphabet 'DNA') from different files and
# initializes the distance to 'MANHATTAN' with feature type 'WORD'.
#
# Data points in this example are defined by the transformation function
# 'convert' and the preprocessing step applied afterwards (defined by
# 'add_preproc' and preprocessor 'SORTWORDSTRING').
#
# The target 'TRAIN' for 'set_features' controls the binding of the given
# data points. In order to compute a pairwise distance matrix by
# 'get_distance_matrix', we have to perform two preprocessing steps for
# input data 'TRAIN'. The method 'convert' transforms the input data to
# a string representation suitable for the selected distance. The individual
# strings are sorted in ascending order after the execution of
# 'attach_preproc'. A pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the binding of the given
# data points 'TRAIN' and 'TEST'. In order to compute a pairwise distance
# matrix between these two data sets by 'get_distance_matrix', we have to
# perform two preprocessing steps for input data 'TEST'. The method 'convert'
# transforms the input data 'TEST' to a string representation suitable for
# the selected distance. The individual strings are sorted in ascending order
# after the execution of 'attach_preproc'. A pairwise distance matrix between
# the data sets 'TRAIN' and 'TEST' is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see
# doc/classshogun_1_1CSortWordString.html,
# doc/classshogun_1_1CPreprocessor.html,
# doc/classshogun_1_1CStringFeatures.html (method obtain_from_char_features) and
# doc/classshogun_1_1CManhattanWordDistance.html.
#
# Obviously, using the Manhattan word distance is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindna,testdna,3,0,'n'],[traindna,testdna,4,0,'n']]

def distance_manhattenword (fm_train_dna=traindna,fm_test_dna=testdna,order=3,
    gap=0,reverse='n'):
    sg('set_distance', 'MANHATTAN', 'WORD')
    sg('add_preproc', 'SORTWORDSTRING')
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TEST')
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('ManhattanWordDistance')
    distance_manhattenword(*parameter_list[0])
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'MINKOWSKI' with
# norm 'k'. Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see doc/classshogun_1_1CMinkowskiMetric.html.
#
# Obviously, using the Minkowski metric is not limited to this showcase
# example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,3.],[traindat,testdat,4.]]

def distance_minkowski (fm_train_real=traindat,fm_test_real=testdat,k=3.):
    sg('set_distance', 'MINKOWSKI', 'REAL', k)
    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('MinkowskiMetric')
    distance_minkowski(*parameter_list[0])
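For reference, the Minkowski distance of order k is d(x, y) = (sum_i |x_i - y_i|^k)^(1/k). A minimal numpy sketch, compared against one entry of the Shogun result from the example above:

import numpy as np

def minkowski(x, y, k=3.0):
    # (sum_i |x_i - y_i|^k)^(1/k)
    return np.sum(np.abs(x - y) ** k) ** (1.0 / k)

dm = distance_minkowski(traindat, testdat, 3.)
print(dm[0, 0], minkowski(traindat[:, 0], testdat[:, 0]))  # should agree up to rounding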
# The approach applied below, which shows how to process input data loaded
# from a file, is a crucial building block for writing your own sample
# applications. It is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'TANIMOTO'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (extended Jaccard coefficient)
# matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (extended
# Jaccard coefficient) matrix between these two data sets is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix ceases to exist.
#
# For more details see doc/classshogun_1_1CTanimotoDistance.html.
#
# Obviously, using the Tanimoto distance/coefficient is not limited to
# this showcase example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat],[traindat,testdat]]

def distance_tanimoto (fm_train_real=traindat,fm_test_real=testdat):
    sg('set_distance', 'TANIMOTO', 'REAL')
    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')
    return dm

if __name__=='__main__':
    print('TanimotoDistance')
    distance_tanimoto(*parameter_list[0])
# In this example the Histogram algorithm object computes a histogram over all
# 16bit unsigned integers in the features.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
cubedna=lm.load_cubes('../data/fm_train_cube.dat')
parameter_list=[[traindna,cubedna,3,0,'n'],[traindna,cubedna,4,0,'n']]

def distribution_histogram(fm_train=traindna,fm_cube=cubedna,order=3,
    gap=0,reverse='n'):
#    sg('new_distribution', 'HISTOGRAM')
    sg('add_preproc', 'SORTWORDSTRING')
    sg('set_features', 'TRAIN', fm_train, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')
#    sg('train_distribution')
#    histo=sg('get_histogram')
#    num_examples=11
#    num_param=sg('get_histogram_num_model_parameters')
#    for i in xrange(num_examples):
#        for j in xrange(num_param):
#            sg('get_log_derivative %d %d' % (j, i))
#    sg('get_log_likelihood')
#    return sg('get_log_likelihood_sample')

if __name__=='__main__':
    print('Histogram')
    distribution_histogram(*parameter_list[0])
# In this example a hidden Markov model with 3 states and 6 observation
# symbols is trained on a string data set.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
cubedna=lm.load_cubes('../data/fm_train_cube.dat')
parameter_list=[[traindna,cubedna,3,6,1,list(),list()],
    [traindna,cubedna,3,6,1,list(),list()]]

def distribution_hmm(fm_train=traindna,fm_cube=cubedna,N=3,M=6,
    order=1,hmms=list(),links=list()):
    sg('new_hmm', N, M)
    sg('set_features', 'TRAIN', fm_cube, 'CUBE')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order)
    sg('bw')
    hmm=sg('get_hmm')

    sg('new_hmm', N, M)
    sg('set_hmm', hmm[0], hmm[1], hmm[2], hmm[3])
    likelihood=sg('hmm_likelihood')
    return likelihood

if __name__=='__main__':
    print('HMM')
    distribution_hmm(*parameter_list[0])
# Trains an inhomogeneous Markov chain of order 3 on a DNA string data set.
# Due to the structure of the Markov chain it is very similar to an HMM with
# just one chain of connected hidden states, which is why it is termed a
# linear HMM.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
cubedna=lm.load_cubes('../data/fm_train_cube.dat')
parameter_list=[[traindna,cubedna,3,0,'n'],
    [traindna,cubedna,3,0,'n']]

def distribution_linearhmm (fm_train=traindna,fm_cube=cubedna,
    order=3,gap=0,reverse='n'):
#    sg('new_distribution', 'LinearHMM')
    sg('add_preproc', 'SORTWORDSTRING')
    sg('set_features', 'TRAIN', fm_train, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')
#    sg('train_distribution')
#    histo=sg('get_histogram')
#    num_examples=11
#    num_param=sg('get_histogram_num_model_parameters')
#    for i in xrange(num_examples):
#        for j in xrange(num_param):
#            sg('get_log_derivative %d %d' % (j, i))
#    sg('get_log_likelihood_sample')

if __name__=='__main__':
    print('LinearHMM')
    distribution_linearhmm(*parameter_list[0])
# This is an example for the initialization of the chi2-kernel on real data,
# where each column of the matrices corresponds to one training/test example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,1.4,10],[traindat,testdat,1.5,11]]

def kernel_chi2 (fm_train_real=traindat,fm_test_real=testdat,
    width=1.4,size_cache=10):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_features', 'TEST', fm_test_real)
    sg('set_kernel', 'CHI2', 'REAL', size_cache, width)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('Chi2')
    kernel_chi2(*parameter_list[0])
# This is an example for the initialization of a combined kernel, which is,
# in this case, a weighted sum of three kernels on real-valued data. The
# sub-kernel weights are all set to 1.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,1.,10],[traindat,testdat,1.5,11]]

def kernel_combined(fm_train_real=traindat,fm_test_real=testdat,
    weight=1.,size_cache=10):
    sg('clean_kernel')
    sg('clean_features', 'TRAIN')
    sg('clean_features', 'TEST')
    sg('set_kernel', 'COMBINED', size_cache)
    sg('add_kernel', weight, 'LINEAR', 'REAL', size_cache)
    sg('add_features', 'TRAIN', fm_train_real)
    sg('add_features', 'TEST', fm_test_real)
    sg('add_kernel', weight, 'GAUSSIAN', 'REAL', size_cache, 1.)
    sg('add_features', 'TRAIN', fm_train_real)
    sg('add_features', 'TEST', fm_test_real)
    sg('add_kernel', weight, 'POLY', 'REAL', size_cache, 3, False)
    sg('add_features', 'TRAIN', fm_train_real)
    sg('add_features', 'TEST', fm_test_real)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('Combined')
    kernel_combined(*parameter_list[0])
# This is an example for the initialization of the CommUlongString kernel.
# This kernel sums over k-mer matches (k='order'). For efficient computation a
# preprocessor is used that extracts and sorts all k-mers. If 'use_sign' is
# set, each k-mer is counted only once.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindna,testdna,10,3,0,'n',False,'FULL'],
    [traindna,testdna,11,4,0,'n',False,'FULL']]

def kernel_commulongstring (fm_train_dna=traindna,fm_test_dna=testdna,
    size_cache=10, order=3,gap=0,reverse='n',
    use_sign=False,normalization='FULL'):
    sg('add_preproc', 'SORTULONGSTRING')
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'ULONG', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'ULONG', order, order-1, gap, reverse)
    sg('attach_preproc', 'TEST')
    sg('set_kernel', 'COMMSTRING', 'ULONG', size_cache, use_sign, normalization)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('CommUlongString')
    kernel_commulongstring(*parameter_list[0])
# This is an example for the initialization of the CommWordString kernel (aka
# Spectrum or n-gram kernel; its name is derived from the unix command comm).
# This kernel sums over k-mer matches (k='order'). For efficient computation a
# preprocessor is used that extracts and sorts all k-mers. If 'use_sign' is
# set, each k-mer is counted only once.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindna,testdna,10,3,0,'n',False,'FULL'],
    [traindna,testdna,11,4,0,'n',False,'FULL']]

def kernel_commwordstring (fm_train_dna=traindna,fm_test_dna=testdna,
    size_cache=10, order=3,gap=0,reverse='n',
    use_sign=False,normalization='FULL'):
    sg('add_preproc', 'SORTWORDSTRING')
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TEST')
    sg('set_kernel', 'COMMSTRING', 'WORD', size_cache, use_sign, normalization)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('CommWordString')
    kernel_commwordstring(*parameter_list[0])
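A toy illustration of the spectrum-kernel idea described above (a conceptual sketch, not Shogun's implementation): a single kernel entry counts the k-mers shared by two sequences, with multiplicities, or each shared k-mer only once when 'use_sign' is set.

def spectrum_entry(s, t, order=3, use_sign=False):
    def kmer_counts(seq):
        # histogram of overlapping substrings of length 'order'
        counts = {}
        for i in range(len(seq) - order + 1):
            kmer = seq[i:i + order]
            counts[kmer] = counts.get(kmer, 0) + 1
        return counts
    cs, ct = kmer_counts(s), kmer_counts(t)
    if use_sign:
        return len(set(cs) & set(ct))  # each shared k-mer counted once
    return sum(cs[k] * ct[k] for k in cs if k in ct)

print(spectrum_entry('ACGTACGT', 'ACGTTTTT'))  # weighted count of shared 3-mers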
# The constant kernel gives a trivial kernel matrix with all entries set to
# the same value defined by the argument 'c'.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,23.,10],[traindat,testdat,24.,11]]

def kernel_const (fm_train_real=traindat,fm_test_real=testdat,c=23.,size_cache=10):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_features', 'TEST', fm_test_real)
    sg('set_kernel', 'CONST', 'REAL', size_cache, c)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('Const')
    kernel_const(*parameter_list[0])
# This is an example for the initialization of the diag-kernel.
# The diag kernel has all kernel matrix entries but those on
# the main diagonal set to zero.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,23.,10],[traindat,testdat,24.,11]]

def kernel_diag (fm_train_real=traindat,fm_test_real=testdat,diag=23.,
    size_cache=10):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_features', 'TEST', fm_test_real)
    sg('set_kernel', 'DIAG', 'REAL', size_cache, diag)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('Diag')
    kernel_diag(*parameter_list[0])
# The FixedDegree String kernel takes as input two strings of the same size
# and counts the number of matches of length d.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
parameter_list=[[traindna,testdna,3,10],[traindna,testdna,4,11]]

def kernel_fixeddegreestring (fm_train_dna=traindna,fm_test_dna=testdna,
    degree=3, size_cache=10):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('set_kernel', 'FIXEDDEGREE', 'CHAR', size_cache, degree)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('FixedDegreeString')
    kernel_fixeddegreestring(*parameter_list[0])
# The well-known Gaussian kernel (the swiss army knife for SVMs) on dense
# real-valued features.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,1.4,10],[traindat,testdat,1.9,11]]

def kernel_gaussian (fm_train_real=traindat,fm_test_real=testdat,
    width=1.4,size_cache=10):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_features', 'TEST', fm_test_real)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('Gaussian')
    kernel_gaussian(*parameter_list[0])
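A quick check of the kernel formula, building on the example above (a sketch, assuming Shogun parameterizes the Gaussian kernel as k(x, y) = exp(-||x - y||^2 / width)):

import numpy as np

km = kernel_gaussian(traindat, testdat, width=1.4)
x, y = traindat[:, 0], testdat[:, 0]
manual = np.exp(-np.sum((x - y) ** 2) / 1.4)
print(km[0, 0], manual)  # should agree up to rounding if the assumption holds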
# An experimental kernel inspired by the WeightedDegreePositionStringKernel and
# the Gaussian kernel. The idea is to shift the dimensions of the input vectors
# against each other. 'shift_step' is the step size of the shifts and
# 'max_shift' is the maximal shift.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')

parameter_list=[[traindat,testdat,1.9,2,1,10],[traindat,testdat,1.5,2,1,11]]

def kernel_gaussianshift (fm_train_real=traindat,fm_test_real=testdat,
        width=1.4,max_shift=2,shift_step=1,size_cache=10):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_features', 'TEST', fm_test_real)
    sg('set_kernel', 'GAUSSIANSHIFT', 'REAL', size_cache, width, max_shift, shift_step)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('GaussianShift')
    kernel_gaussianshift(*parameter_list[0])
# This is an example for the initialization of a linear kernel on real-valued
# data using scaling factor 1.2.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')

parameter_list=[[traindat,testdat,1.2,10],[traindat,testdat,1.5,11]]

def kernel_linear (fm_train_real=traindat,fm_test_real=testdat,
        scale=1.2,size_cache=10):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_features', 'TEST', fm_test_real)
    sg('set_kernel', 'LINEAR', 'REAL', size_cache, scale)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('Linear')
    kernel_linear(*parameter_list[0])
# This is an example for the initialization of a linear kernel on string data. The
# strings are all of the same length and consist of the characters 'ACGT' corresponding
# to the DNA-alphabet. Each column of the matrices of type char corresponds to
# one training/test example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')

parameter_list=[[traindna,testdna,10],
    [traindna,testdna,11]]

def kernel_linearstring (fm_train_dna=traindna,fm_test_dna=testdna,
        size_cache=10):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('set_kernel', 'LINEAR', 'CHAR', size_cache)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('LinearString')
    kernel_linearstring(*parameter_list[0])
# This is an example for the initialization of a linear kernel on word (2byte)
# data.

from tools.load import LoadMatrix
from numpy import ushort
from sg import sg
lm=LoadMatrix()

# note: the same data file is used for the train and the test features here
trainword=ushort(lm.load_numbers('../data/fm_test_word.dat'))
testword=ushort(lm.load_numbers('../data/fm_test_word.dat'))

parameter_list=[[trainword,testword,10,1.4],
    [trainword,testword,11,1.5]]

def kernel_linearword (fm_train_word=trainword,fm_test_word=testword,
        size_cache=10,scale=1.4):
    sg('set_features', 'TRAIN', fm_train_word)
    sg('set_features', 'TEST', fm_test_word)
    sg('set_kernel', 'LINEAR', 'WORD', size_cache, scale)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('LinearWord')
    kernel_linearword(*parameter_list[0])
# This is an example for the initialization of the local alignment kernel on
# DNA sequences, where each column of the matrices of type char corresponds to
# one training/test example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')

parameter_list=[[traindna,testdna,10],
    [traindna,testdna,11]]

def kernel_localalignmentstring (fm_train_dna=traindna,fm_test_dna=testdna,
        size_cache=10):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('set_kernel', 'LOCALALIGNMENT', 'CHAR', size_cache)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('LocalAlignmentString')
    kernel_localalignmentstring(*parameter_list[0])
# This example initializes the locality improved string kernel. The locality
# improved string kernel is defined on sequences of the same length and
# inspects letters matching at corresponding positions in both sequences. The
# kernel sums over all matches in windows of length l (l='length') and takes
# this sum to the power of 'inner_degree'. The sum over all these terms along
# the sequence is taken to the power of 'outer_degree'.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
trainlabel=lm.load_labels('../data/label_train_dna.dat')

parameter_list=[[traindna,testdna,trainlabel,10,5,5,7],
    [traindna,testdna,trainlabel,11,6,6,8]]

def kernel_localityimprovedstring (fm_train_dna=traindna,fm_test_dna=testdna,
        label_train_dna=trainlabel,size_cache=10,
        length=5,inner_degree=5,outer_degree=7):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('set_kernel', 'LIK', 'CHAR', size_cache, length, inner_degree, outer_degree)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('LocalityImprovedString')
    kernel_localityimprovedstring(*parameter_list[0])
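The window/degree construction described above can be prototyped in a few lines. A naive pure-Python sketch of that description (Shogun's implementation may differ in details such as window placement and per-position weighting):

def lik(x, y, l, inner_degree, outer_degree):
    # sum of per-window match counts raised to inner_degree,
    # with the total raised to outer_degree
    total = 0.0
    for start in range(len(x) - l + 1):
        matches = sum(1.0 for i in range(start, start + l) if x[i] == y[i])
        total += matches ** inner_degree
    return total ** outer_degree

print(lik('ACGTACGT', 'ACGAACGA', l=5, inner_degree=5, outer_degree=7))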
# This is an example initializing the oligo string kernel which takes distances
# between matching oligos (k-mers) into account via a Gaussian. Variable 'k'
# defines the length of the oligo and variable 'width' the width of the
# Gaussian. The oligo string kernel is implemented for the DNA-alphabet 'ACGT'.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')

parameter_list=[[traindna,testdna,10,3,1.2],
    [traindna,testdna,11,4,1.3]]

def kernel_oligostring (fm_train_dna=traindna,fm_test_dna=testdna,
        size_cache=10,k=3,width=1.2):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('set_kernel', 'OLIGO', 'CHAR', size_cache, k, width)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('OligoString')
    kernel_oligostring(*parameter_list[0])
# This example trains Shogun's plugin estimator on word features obtained from
# DNA sequences and then computes the Histogram kernel, which is based on the
# probabilities of the trained estimator. Classification with the plugin
# estimate itself is not supported yet in this interface (see the
# commented-out call below).

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
trainlabel=lm.load_labels('../data/label_train_dna.dat')

parameter_list=[[traindna,testdna,trainlabel,10,3,0,'n'],
    [traindna,testdna,trainlabel,11,4,0,'n']]

def kernel_pluginestimatehistogram (fm_train_dna=traindna,fm_test_dna=testdna,
        label_train_dna=trainlabel,size_cache=10,
        order=3,gap=0,reverse='n'):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    pseudo_pos=1e-1
    pseudo_neg=1e-1
    sg('new_plugin_estimator', pseudo_pos, pseudo_neg)
    sg('set_labels', 'TRAIN', label_train_dna)
    sg('train_estimator')
    sg('set_kernel', 'HISTOGRAM', 'WORD', size_cache)
    km=sg('get_kernel_matrix', 'TRAIN')
    # not supported yet
    # lab=sg('plugin_estimate_classify')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('PluginEstimate w/ HistogramWord')
    kernel_pluginestimatehistogram(*parameter_list[0])
# This example initializes the polynomial kernel with real data.
# If variable 'inhomogene' is 'true', +1 is added to the scalar product
# before taking it to the power of 'degree'. If 'use_normalization' is
# set to 'true', the kernel matrix will be normalized by the square roots
# of the diagonal entries.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')

parameter_list=[[traindat,testdat,4,False,True,10],
    [traindat,testdat,5,False,True,11]]

def kernel_poly (fm_train_real=traindat,fm_test_real=testdat,
        degree=4,inhomogene=False,use_normalization=True,size_cache=10):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_features', 'TEST', fm_test_real)
    sg('set_kernel', 'POLY', 'REAL', size_cache, degree, inhomogene, use_normalization)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('Poly')
    kernel_poly(*parameter_list[0])
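The polynomial kernel is straightforward to reproduce for comparison. A minimal NumPy sketch, assuming k(x,x') = (x.x' + c0)^degree with c0=1 in the inhomogeneous case, and normalization by the square roots of the diagonal entries as described above:

import numpy as np

def poly_kernel_matrix(X, Y, degree, inhomogene=False, normalize=True):
    # X: (d, n), Y: (d, m), one example per column
    c0 = 1.0 if inhomogene else 0.0
    K = (X.T.dot(Y) + c0) ** degree
    if normalize:
        dx = (np.sum(X * X, axis=0) + c0) ** degree   # diagonal of the train Gram matrix
        dy = (np.sum(Y * Y, axis=0) + c0) ** degree   # diagonal of the test Gram matrix
        K = K / np.sqrt(np.outer(dx, dy))
    return K

X = np.random.randn(3, 4)
print(poly_kernel_matrix(X, X, degree=4))  # diagonal entries are 1 after normalization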
# This is an example for the initialization of the PolyMatchString kernel on
# string data. The PolyMatchString kernel sums over the matches of two strings
# of the same length and takes the sum to the power of 'degree'. The strings
# consist of the characters 'ACGT' corresponding to the DNA-alphabet. Each
# column of the matrices of type char corresponds to one training/test example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')

parameter_list=[[traindna,testdna,10,3,False],
    [traindna,testdna,11,4,False]]

def kernel_polymatchstring (fm_train_dna=traindna,fm_test_dna=testdna,
        size_cache=10,degree=3,inhomogene=False):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('set_kernel', 'POLYMATCH', 'CHAR', size_cache, degree, inhomogene)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('PolyMatchString')
    kernel_polymatchstring(*parameter_list[0])
# The PolyMatchWordString kernel is defined on strings of equal length.
# The kernel sums over the matches of two strings of the same length and
# takes the sum to the power of 'degree'. The strings in this example
# consist of the characters 'ACGT' corresponding to the DNA-alphabet. Each
# column of the matrices of type char corresponds to one training/test example.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
trainlabel=lm.load_labels('../data/label_train_dna.dat')

parameter_list=[[traindna,testdna,trainlabel,10,2,True,True,3,0,'n'],
    [traindna,testdna,trainlabel,11,3,True,True,4,0,'n']]

def kernel_polymatchword (fm_train_dna=traindna,fm_test_dna=testdna,
        label_train_dna=trainlabel,size_cache=10,
        degree=2,inhomogene=True,normalize=True,
        order=3,gap=0,reverse='n'):
    sg('add_preproc', 'SORTWORDSTRING')
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TEST')
    sg('set_kernel', 'POLYMATCH', 'WORD', size_cache, degree, inhomogene, normalize)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('PolyMatchWord')
    kernel_polymatchword(*parameter_list[0])
# The SalzbergWordString kernel implements the Salzberg kernel.
#
# It is described in
#
# Engineering Support Vector Machine Kernels That Recognize Translation
# Initiation Sites
# A. Zien, G. Raetsch, S. Mika, B. Schoelkopf, T. Lengauer, K.-R. Mueller

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
trainlabel=lm.load_labels('../data/label_train_dna.dat')

parameter_list=[[traindna,testdna,trainlabel,10,3,0,'n',False,'FULL'],
    [traindna,testdna,trainlabel,11,4,0,'n',False,'FULL']]

def kernel_salzbergstring (fm_train_dna=traindna,fm_test_dna=testdna,
        label_train_dna=trainlabel,size_cache=10,
        order=3,gap=0,reverse='n',use_sign=False,
        normalization='FULL'):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    pseudo_pos=1e-1
    pseudo_neg=1e-1
    sg('new_plugin_estimator', pseudo_pos, pseudo_neg)
    sg('set_labels', 'TRAIN', label_train_dna)
    sg('train_estimator')
    sg('set_kernel', 'SALZBERG', 'WORD', size_cache)
    #sg('set_prior_probs', 0.4, 0.6)
    sg('set_prior_probs_from_labels', label_train_dna)
    km=sg('get_kernel_matrix', 'TRAIN')
    # not supported yet
    # lab=sg('plugin_estimate_classify')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('PluginEstimate w/ SalzbergWord')
    kernel_salzbergstring(*parameter_list[0])
# The standard Sigmoid kernel computed on dense real-valued features.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')

parameter_list=[[traindat,testdat,11,1.2,1.3,10],[traindat,testdat,12,1.3,1.4,11]]

def kernel_sigmoid (fm_train_real=traindat,fm_test_real=testdat,
        num_feats=11,gamma=1.2,coef0=1.3,size_cache=10):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_features', 'TEST', fm_test_real)
    sg('set_kernel', 'SIGMOID', 'REAL', size_cache, gamma, coef0)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('Sigmoid')
    kernel_sigmoid(*parameter_list[0])
# The SimpleLocalityImprovedString kernel is a "simplified" and better
# performing version of the locality improved kernel.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
trainlabel=lm.load_labels('../data/label_train_dna.dat')

parameter_list=[[traindna,testdna,trainlabel,10,5,5,7],
    [traindna,testdna,trainlabel,11,6,6,8]]

def kernel_simplelocalityimprovedstring (fm_train_dna=traindna,fm_test_dna=testdna,
        label_train_dna=trainlabel,size_cache=10,
        length=5,inner_degree=5,outer_degree=7):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('set_kernel', 'SLIK', 'CHAR', size_cache, length, inner_degree, outer_degree)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('SimpleLocalityImprovedString')
    kernel_simplelocalityimprovedstring(*parameter_list[0])
# The WeightedCommWordString kernel may be used to compute the weighted
# spectrum kernel (i.e. a spectrum kernel for 1 to K-mers, where each k-mer
# length is weighted by some coefficient beta_k) from strings that have
# been mapped into unsigned 16bit integers.
#
# These 16bit integers correspond to k-mers. To be applicable in this kernel
# they need to be sorted (e.g. via the SortWordString preprocessor).
#
# It basically uses the algorithm in the unix "comm" command (hence the name)
# to compute:
#
#     k(x, x') = sum_{k=1}^K beta_k Phi_k(x) . Phi_k(x')
#
# where Phi_k maps a sequence x that consists of letters in Sigma to a feature
# vector of size |Sigma|^k. In this feature vector each entry denotes how
# often the k-mer appears in x.
#
# Note that this representation is especially tuned to small alphabets
# (like the 2-bit alphabet DNA), for which it enables spectrum kernels
# of order 8.
#
# For this kernel the linadd speedups are quite efficiently implemented using
# direct maps.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
trainlabel=lm.load_labels('../data/label_train_dna.dat')

parameter_list=[[traindna,testdna,trainlabel,10,3,0,'n',False,'FULL'],
    [traindna,testdna,trainlabel,11,4,0,'n',False,'FULL']]

def kernel_weightedcommwordstring (fm_train_dna=traindna,fm_test_dna=testdna,
        label_train_dna=trainlabel,size_cache=10,
        order=3,gap=0,reverse='n',use_sign=False,
        normalization='FULL'):
    sg('add_preproc', 'SORTWORDSTRING')
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TEST')
    sg('set_kernel', 'WEIGHTEDCOMMSTRING', 'WORD', size_cache, use_sign, normalization)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('WeightedCommWordString')
    kernel_weightedcommwordstring(*parameter_list[0])
# The Weighted Degree Position String kernel (Weighted Degree kernel with
# shifts).
#
# The WD-shift kernel of order d compares two sequences X and Y of length L by
# summing all contributions of k-mer matches of lengths k in 1...d, weighted
# by coefficients beta_k, allowing for a positional tolerance of up to shift s.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')

parameter_list=[[traindna,testdna,10,20],
    [traindna,testdna,11,21]]

def kernel_weighteddegreepositionstring (fm_train_dna=traindna,fm_test_dna=testdna,
        size_cache=10,degree=20):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('set_kernel', 'WEIGHTEDDEGREEPOS', 'CHAR', size_cache, degree)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('WeightedDegreePositionString')
    kernel_weighteddegreepositionstring(*parameter_list[0])
# The Weighted Degree String kernel.
#
# The WD kernel of order d compares two sequences X and Y of length L by
# summing all contributions of k-mer matches of lengths k in 1...d, weighted
# by coefficients beta_k. It is defined as
#
#     k(X, Y) = sum_{k=1}^d beta_k sum_{l=1}^{L-k+1} I(u_{k,l}(X) = u_{k,l}(Y)).
#
# Here, u_{k,l}(X) is the string of length k starting at position l of the
# sequence X and I(.) is the indicator function which evaluates to 1 when its
# argument is true and to 0 otherwise.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')

parameter_list=[[traindna,testdna,10,20],
    [traindna,testdna,11,21]]

def kernel_weighteddegreestring (fm_train_dna=traindna,fm_test_dna=testdna,
        size_cache=10,degree=20):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('set_kernel', 'WEIGHTEDDEGREE', 'CHAR', size_cache, degree)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('WeightedDegreeString')
    kernel_weighteddegreestring(*parameter_list[0])
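The formula translates directly into a short pure-Python sketch. The decreasing weighting beta_k = 2(d-k+1)/(d(d+1)) used below is a common default for this kernel family; treat it as an assumption rather than the exact weights of the example above.

def wd_kernel(X, Y, d, beta=None):
    # k(X, Y) = sum_k beta_k * sum_l I(u_{k,l}(X) == u_{k,l}(Y))
    L = len(X)
    if beta is None:
        beta = [2.0 * (d - k + 1) / (d * (d + 1)) for k in range(1, d + 1)]
    total = 0.0
    for k in range(1, d + 1):
        matches = sum(1 for l in range(L - k + 1) if X[l:l+k] == Y[l:l+k])
        total += beta[k - 1] * matches
    return total

print(wd_kernel('ACGTACGT', 'ACGAACGT', d=3))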
# In this example multiple kernel learning (MKL) is used to train a multiclass
# support vector machine on a combined kernel consisting of a linear, a
# Gaussian and a degree-2 polynomial kernel, all computed on the same
# real-valued features. The subkernel weights are learned during training.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
train_label=lm.load_labels('../data/label_train_multiclass.dat')

parameter_list=[[traindat,testdat,train_label,10,1.2,1.2,1e-5,0.001,1.5,1.0],
    [traindat,testdat,train_label,11,1.3,1.3,1e-5,0.002,1.6,1.1]]

def mkl_multiclass (fm_train_real=traindat,fm_test_real=testdat,
        label_train_multiclass=train_label,
        size_cache=10,width=1.2,C=1.2,epsilon=1e-5,
        mkl_eps=0.001,mkl_norm=1.5,weight=1.0):
    sg('clean_kernel')
    sg('clean_features', 'TRAIN')
    sg('clean_features', 'TEST')
    sg('set_kernel', 'COMBINED', size_cache)
    sg('add_kernel', weight, 'LINEAR', 'REAL', size_cache)
    sg('add_features', 'TRAIN', fm_train_real)
    sg('add_features', 'TEST', fm_test_real)
    sg('add_kernel', weight, 'GAUSSIAN', 'REAL', size_cache, width)
    sg('add_features', 'TRAIN', fm_train_real)
    sg('add_features', 'TEST', fm_test_real)
    sg('add_kernel', weight, 'POLY', 'REAL', size_cache, 2)
    sg('add_features', 'TRAIN', fm_train_real)
    sg('add_features', 'TEST', fm_test_real)
    sg('set_labels', 'TRAIN', label_train_multiclass)
    sg('new_classifier', 'MKL_MULTICLASS')
    sg('svm_epsilon', epsilon)
    sg('c', C)
    sg('mkl_parameters', mkl_eps, 0.0, mkl_norm)
    sg('train_classifier')
    #sg('set_features', 'TEST', fm_test_real)
    result=sg('classify')
    return result

if __name__=='__main__':
    print('mkl_multiclass')
    mkl_multiclass(*parameter_list[0])
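Under the hood a combined kernel is simply a weighted sum of subkernel matrices; MKL learns those weights. A minimal NumPy sketch of that idea on random data (the weights below are illustrative, not learned):

import numpy as np

X = np.random.randn(2, 6)                       # one example per column
K_lin = X.T.dot(X)                              # linear kernel
d2 = ((X[:, :, None] - X[:, None, :]) ** 2).sum(axis=0)
K_gauss = np.exp(-d2 / 1.2)                     # Gaussian kernel of width 1.2
K_poly = X.T.dot(X) ** 2                        # homogeneous polynomial, degree 2
weights = [0.2, 0.5, 0.3]                       # subkernel weights (learned in MKL)
K_combined = weights[0]*K_lin + weights[1]*K_gauss + weights[2]*K_poly
print(K_combined.shape)                         # (6, 6)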
# In this example a multiple kernel learning regressor is trained on a
# randomly generated toy data set (two Gaussian clouds), using a combined
# kernel made of three Gaussian kernels with different widths.

from tools.load import LoadMatrix
from sg import sg
from numpy import *
num=100

labelstrain=concatenate((-ones([1,num]), ones([1,num])),1)[0]
featuretrain=concatenate((random.normal(size=(2,num))-1,random.normal(size=(2,num))+1),1)

parameter_list=[[1.,labelstrain,featuretrain,1e-2],
    [1.,labelstrain,featuretrain,1e-2]]

def mkl_regression (weight=1., labels=labelstrain,features=featuretrain,
        tube_epsilon=1e-2):
    sg('new_classifier', 'MKL_REGRESSION')
    sg('c', 1.)
    sg('svr_tube_epsilon', tube_epsilon)
    sg('set_labels', 'TRAIN', labels)
    sg('add_features', 'TRAIN', features)
    sg('add_features', 'TRAIN', features)
    sg('add_features', 'TRAIN', features)
    sg('set_kernel', 'COMBINED', 100)
    sg('add_kernel', weight, 'GAUSSIAN', 'REAL', 100, 100.)
    sg('add_kernel', weight, 'GAUSSIAN', 'REAL', 100, 10.)
    sg('add_kernel', weight, 'GAUSSIAN', 'REAL', 100, 1.)
    sg('train_classifier')
    [bias, alphas]=sg('get_svm')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('MKL_REGRESSION')
    mkl_regression(*parameter_list[0])
# In this example a multiple kernel learning two-class classifier is trained
# on a randomly generated toy data set (two Gaussian clouds), using a combined
# kernel made of three Gaussian kernels with different widths.

from tools.load import LoadMatrix
from sg import sg
from numpy import *
num=100

labelstrain=concatenate((-ones([1,num]), ones([1,num])),1)[0]
featuretrain=concatenate((random.normal(size=(2,num))-1,random.normal(size=(2,num))+1),1)

parameter_list=[[1.,labelstrain,featuretrain],
    [1.,labelstrain,featuretrain]]

def mkl_twoclass (weight=1., labels=labelstrain,features=featuretrain):
    sg('c', 10.)
    sg('new_classifier', 'MKL_CLASSIFICATION')
    sg('set_labels', 'TRAIN', labels)
    sg('add_features', 'TRAIN', features)
    sg('add_features', 'TRAIN', features)
    sg('add_features', 'TRAIN', features)
    sg('set_kernel', 'COMBINED', 100)
    sg('add_kernel', weight, 'GAUSSIAN', 'REAL', 100, 100.)
    sg('add_kernel', weight, 'GAUSSIAN', 'REAL', 100, 10.)
    sg('add_kernel', weight, 'GAUSSIAN', 'REAL', 100, 1.)
    sg('train_classifier')
    [bias, alphas]=sg('get_svm')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('MKL_TWOCLASS')
    mkl_twoclass(*parameter_list[0])
# In this example a multiclass support vector machine classifier is trained on
# a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm GMNPSVM is used with SVM regularization
# parameter C=1.2 and a Gaussian kernel of width 2.1. The kernel matrix on the
# test data is returned along with the predictions.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
train_label=lm.load_labels('../data/label_train_multiclass.dat')

parameter_list=[[traindat,testdat,train_label,10,2.1,1.2,1e-5,False],
    [traindat,testdat,train_label,10,2.1,1.3,1e-4,False]]

def classifier_gmnpsvm (fm_train_real=traindat,fm_test_real=testdat,
        label_train_multiclass=train_label,
        size_cache=10,width=2.1,C=1.2,
        epsilon=1e-5,use_bias=False):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('set_labels', 'TRAIN', label_train_multiclass)
    sg('new_classifier', 'GMNPSVM')
    sg('svm_epsilon', epsilon)
    sg('c', C)
    sg('svm_use_bias', use_bias)
    sg('train_classifier')
    sg('set_features', 'TEST', fm_test_real)
    result=sg('classify')
    kernel_matrix = sg('get_kernel_matrix', 'TEST')
    return result, kernel_matrix

if __name__=='__main__':
    print('GMNPSVM')
    classifier_gmnpsvm(*parameter_list[0])
# In this example a multiclass support vector machine classifier is trained on
# a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the multiclass variant of LIBSVM is used
# with SVM regularization parameter C=10 and a Gaussian kernel of width 2.1.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
train_label=lm.load_labels('../data/label_train_multiclass.dat')

parameter_list=[[traindat,testdat,train_label,10,2.1,10.,1e-5,False],
    [traindat,testdat,train_label,10,2.1,11.,1e-4,False]]

def classifier_libsvm_multiclass (fm_train_real=traindat,fm_test_real=testdat,
        label_train_multiclass=train_label,
        size_cache=10,width=2.1,C=10.,
        epsilon=1e-5,use_bias=False):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('set_labels', 'TRAIN', label_train_multiclass)
    sg('new_classifier', 'LIBSVM_MULTICLASS')
    sg('svm_epsilon', epsilon)
    sg('c', C)
    sg('svm_use_bias', use_bias)
    sg('train_classifier')
    sg('set_features', 'TEST', fm_test_real)
    result=sg('classify')
    kernel_matrix = sg('get_kernel_matrix', 'TEST')
    return result, kernel_matrix

if __name__=='__main__':
    print('LibSVMMulticlass')
    classifier_libsvm_multiclass(*parameter_list[0])
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The
# preprocessor LogPlusOne adds one to a dense real-valued vector and takes the
# logarithm of each component of it. It is most useful in situations where the
# inputs are counts: when one compares differences of small counts any
# difference may matter a lot, while small differences in large counts don't.
# This is what this log transformation controls for.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')

parameter_list=[[traindat,testdat,1.4,10],[traindat,testdat,1.5,11]]

def preproc_logplusone (fm_train_real=traindat,fm_test_real=testdat,
        width=1.4,size_cache=10):
    sg('add_preproc', 'LOGPLUSONE')
    sg('set_kernel', 'CHI2', 'REAL', size_cache, width)
    sg('set_features', 'TRAIN', fm_train_real)
    sg('attach_preproc', 'TRAIN')
    km=sg('get_kernel_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    sg('attach_preproc', 'TEST')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('LogPlusOne')
    preproc_logplusone(*parameter_list[0])
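The LogPlusOne transform itself is a one-liner in NumPy. A minimal sketch (np.log1p computes log(1 + x) accurately even for small entries):

import numpy as np

counts = np.array([[0., 1., 2.], [10., 100., 1000.]])
transformed = np.log1p(counts)  # what the LOGPLUSONE preprocessor applies per entry
print(transformed)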
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The
# preprocessor NormOne normalizes vectors to have norm 1.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')

parameter_list=[[traindat,testdat,1.4,10],[traindat,testdat,1.5,11]]

def preproc_normone (fm_train_real=traindat,fm_test_real=testdat,
        width=1.4,size_cache=10):
    sg('add_preproc', 'NORMONE')
    sg('set_kernel', 'CHI2', 'REAL', size_cache, width)
    sg('set_features', 'TRAIN', fm_train_real)
    sg('attach_preproc', 'TRAIN')
    km=sg('get_kernel_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    sg('attach_preproc', 'TEST')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('NormOne')
    preproc_normone(*parameter_list[0])
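What NormOne does to each feature vector can be sketched in NumPy as follows (one example per column, as in the examples above):

import numpy as np

X = np.random.randn(3, 5)                  # one example per column
X_normed = X / np.linalg.norm(X, axis=0)   # each column now has unit Euclidean norm
print(np.linalg.norm(X_normed, axis=0))    # all ones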
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The
# preprocessor PruneVarSubMean subtracts the mean from each feature and removes
# features that have zero variance; if 'divide_by_std' is true, each remaining
# feature is additionally divided by its standard deviation.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')

parameter_list=[[traindat,testdat,1.4,10,True],[traindat,testdat,1.5,11,True]]

def preproc_prunevarsubmean (fm_train_real=traindat,fm_test_real=testdat,
        width=1.4,size_cache=10,divide_by_std=True):
    sg('add_preproc', 'PRUNEVARSUBMEAN', divide_by_std)
    sg('set_kernel', 'CHI2', 'REAL', size_cache, width)
    sg('set_features', 'TRAIN', fm_train_real)
    sg('attach_preproc', 'TRAIN')
    km=sg('get_kernel_matrix', 'TRAIN')
    sg('set_features', 'TEST', fm_test_real)
    sg('attach_preproc', 'TEST')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('PruneVarSubMean')
    preproc_prunevarsubmean(*parameter_list[0])
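A rough NumPy equivalent of the PruneVarSubMean step, assuming the behaviour described above:

import numpy as np

X = np.random.randn(4, 10)                 # one example per column
X[2, :] = 7.0                              # a constant (zero-variance) feature
mean, std = X.mean(axis=1), X.std(axis=1)
keep = std > 0                             # prune zero-variance features
X_pruned = (X[keep] - mean[keep][:, None]) / std[keep][:, None]  # divide_by_std=True
print(X_pruned.shape)                      # (3, 10)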
# In this example a kernel matrix is computed for a given string data set. The
# CommUlongString kernel is used to compute the spectrum kernel from strings
# that have been mapped into unsigned 64bit integers. These 64bit integers
# correspond to k-mers. To be applicable in this kernel the mapped k-mers have
# to be sorted. This is done using the SortUlongString preprocessor, which
# sorts the individual strings in ascending order. The kernel function
# basically uses the algorithm in the unix "comm" command (hence the name).
# Note that this representation enables spectrum kernels of order 8 for 8bit
# alphabets (like binaries) and order 32 for 2-bit alphabets like DNA. For this
# kernel the linadd speedups are implemented (though there is room for
# improvement here when a whole set of sequences is ADDed) using sorted lists.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')

parameter_list=[[traindna,testdna,10,3,0,'n',False,'FULL'],
    [traindna,testdna,11,4,0,'n',False,'FULL']]

def preproc_sortulongstring (fm_train_dna=traindna,fm_test_dna=testdna,
        size_cache=10,order=3,gap=0,reverse='n',
        use_sign=False,normalization='FULL'):
    sg('add_preproc', 'SORTULONGSTRING')
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'ULONG', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'ULONG', order, order-1, gap, reverse)
    sg('attach_preproc', 'TEST')
    sg('set_kernel', 'COMMSTRING', 'ULONG', size_cache, use_sign, normalization)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('CommUlongString')
    preproc_sortulongstring(*parameter_list[0])
# In this example a kernel matrix is computed for a given string data set. The
# CommWordString kernel is used to compute the spectrum kernel from strings
# that have been mapped into unsigned 16bit integers. These 16bit integers
# correspond to k-mers. To be applicable in this kernel the mapped k-mers have
# to be sorted. This is done using the SortWordString preprocessor, which sorts
# the individual strings in ascending order. The kernel function basically uses
# the algorithm in the unix "comm" command (hence the name). Note that this
# representation is especially tuned to small alphabets (like the 2-bit
# alphabet DNA), for which it enables spectrum kernels of order up to 8. For
# this kernel the linadd speedups are quite efficiently implemented using
# direct maps.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')

parameter_list=[[traindna,testdna,10,3,0,'n',False,'FULL'],
    [traindna,testdna,11,4,0,'n',False,'FULL']]

def preproc_sortwordstring (fm_train_dna=traindna,fm_test_dna=testdna,
        size_cache=10,order=3,gap=0,reverse='n',
        use_sign=False,normalization='FULL'):
    sg('add_preproc', 'SORTWORDSTRING')
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TEST')
    sg('set_kernel', 'COMMSTRING', 'WORD', size_cache, use_sign, normalization)
    km=sg('get_kernel_matrix', 'TRAIN')
    km=sg('get_kernel_matrix', 'TEST')
    return km

if __name__=='__main__':
    print('CommWordString')
    preproc_sortwordstring(*parameter_list[0])
# In this example a kernelized version of ridge regression (KRR) is trained on
# a real-valued data set. The KRR is trained with regularization parameter
# tau=1e-6 and a Gaussian kernel of width 2.1.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
trainlabel=lm.load_labels('../data/label_train_regression.dat')

parameter_list=[[traindat,testdat,trainlabel,10,2.1,1.2,1e-6],
    [traindat,testdat,trainlabel,11,2.3,1.3,1e-6]]

def regression_krr (fm_train=traindat,fm_test=testdat,
        label_train=trainlabel,size_cache=10,width=2.1,
        C=1.2,tau=1e-6):
    sg('set_features', 'TRAIN', fm_train)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('set_labels', 'TRAIN', label_train)
    sg('new_regression', 'KERNELRIDGEREGRESSION')
    sg('krr_tau', tau)
    sg('c', C)
    sg('train_regression')
    sg('set_features', 'TEST', fm_test)
    result=sg('classify')
    return result

if __name__=='__main__':
    print('KRR')
    regression_krr(*parameter_list[0])
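KRR has a closed-form solution, which makes it a convenient cross-check. A minimal sketch, assuming the standard formulation alpha = (K + tau*I)^{-1} y with predictions K_test^T alpha (random data, not the files above):

import numpy as np

def krr_fit_predict(K_train, y, K_test, tau):
    # K_train: (n, n) kernel matrix on training data, K_test: (n, m) train-vs-test
    alpha = np.linalg.solve(K_train + tau * np.eye(len(y)), y)
    return K_test.T.dot(alpha)

n = 20
X = np.random.randn(1, n)
y = np.sin(X[0]) + 0.1 * np.random.randn(n)
d2 = ((X[:, :, None] - X[:, None, :]) ** 2).sum(axis=0)
K = np.exp(-d2 / 2.1)                          # Gaussian kernel of width 2.1
print(krr_fit_predict(K, y, K, tau=1e-6)[:5])  # fitted values on the training points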
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# LIBSVM. The SVR is trained with regularization parameter C=1.2 and a Gaussian
# kernel of width 2.1.
#
# For more details on the LIBSVM solver see
# http://www.csie.ntu.edu.tw/~cjlin/libsvm/ .

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
trainlabel=lm.load_labels('../data/label_train_regression.dat')

parameter_list=[[traindat,testdat,trainlabel,10,2.1,1.2,1e-5,1e-2],
    [traindat,testdat,trainlabel,11,2.3,1.3,1e-6,1e-3]]

def regression_libsvr (fm_train=traindat,fm_test=testdat,
        label_train=trainlabel,size_cache=10,width=2.1,
        C=1.2,epsilon=1e-5,tube_epsilon=1e-2):
    sg('set_features', 'TRAIN', fm_train)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('set_labels', 'TRAIN', label_train)
    sg('new_regression', 'LIBSVR')
    sg('svr_tube_epsilon', tube_epsilon)
    sg('c', C)
    sg('train_regression')
    sg('set_features', 'TEST', fm_test)
    result=sg('classify')
    return result

if __name__=='__main__':
    print('LibSVR')
    regression_libsvr(*parameter_list[0])
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# SVM^light. The SVR is trained with regularization parameter C=1.2 and a
# Gaussian kernel of width 2.1.
#
# For more details on SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA
# USA, 1999.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
trainlabel=lm.load_labels('../data/label_train_twoclass.dat')

parameter_list=[[traindat,testdat,trainlabel,10,2.1,1.2,1e-5,1e-2],
    [traindat,testdat,trainlabel,11,2.3,1.3,1e-6,1e-3]]

def regression_svrlight (fm_train=traindat,fm_test=testdat,
        label_train=trainlabel,size_cache=10,width=2.1,
        C=1.2,epsilon=1e-5,tube_epsilon=1e-2):
    sg('set_features', 'TRAIN', fm_train)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('set_labels', 'TRAIN', label_train)
    # SVRLight is only available when shogun has been compiled with
    # SVM^light support, hence the try/except
    try:
        sg('new_regression', 'SVRLIGHT')
    except RuntimeError:
        return
    sg('svr_tube_epsilon', tube_epsilon)
    sg('c', C)
    sg('train_regression')
    sg('set_features', 'TEST', fm_test)
    result=sg('classify')
    return result

if __name__=='__main__':
    print('SVRLight')
    regression_svrlight(*parameter_list[0])