SHOGUN
4.2.0
|
This page lists ready-to-run Shogun examples for the static R interface.
To run an example, issue
R -f name_of_example.R
or start R and then type
source('name_of_example.R')
# Multi-class SVM (GMNPSVM) on a toy data set.
#
# A multi-class support vector machine is trained on a toy data set and the
# trained classifier is then used to predict the labels of the test examples.
# Training is based on the BSVM formulation (L2-soft margin with the bias
# added to the objective function), which is solved by the Improved
# Mitchell-Demyanov-Malozemov algorithm.
#
# Settings used below: Gaussian kernel of width 2.1, regularization constant
# C = 10, svm_use_bias = TRUE, svm_epsilon = 1e-5 (the relative duality gap
# below which the solver stops) and 10MB of kernel cache.
#
# For more details on the SVM solver see
#   V. Franc: Optimization Algorithms for Kernel Methods. Research report
#   CTU-CMP-2005-22. CTU FEL Prague. 2005.
#   ftp://cmp.felk.cvut.cz/pub/cmp/articles/franc/Franc-PhD.pdf

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# Kernel and solver settings.
width <- 2.1
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE

# Feature matrices are transposed after loading; labels become a plain vector.
fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))
label_train_multiclass <- as.double(read_data("label_train_multiclass.dat"))

print("GMNPSVM")

# Training.
invisible(sg("set_features", "TRAIN", fm_train_real))
invisible(sg("set_kernel", "GAUSSIAN", "REAL", size_cache, width))
invisible(sg("set_labels", "TRAIN", label_train_multiclass))
invisible(sg("new_classifier", "GMNPSVM"))
invisible(sg("svm_epsilon", epsilon))
invisible(sg("c", C))
invisible(sg("svm_use_bias", use_bias))
invisible(sg("train_classifier"))

# Prediction on the test set.
invisible(sg("set_features", "TEST", fm_test_real))
result <- sg("classify")
# Two-class SVM trained with GPDT (GPBTSVM) on a toy data set.
#
# A two-class support vector machine classifier is trained on a toy data set
# and the trained classifier is then used to predict the labels of the test
# examples. The Gradient Projection Decomposition Technique (GPDT) serves as
# the training algorithm.
#
# Settings used below: SVM regularization parameter C = 10, Gaussian kernel
# of width 2.1, svm_epsilon = 1e-5, svm_use_bias = TRUE and 10MB of kernel
# cache.
#
# For more details on the GPDT solver see http://dm.unife.it/gpdt

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# Kernel and solver settings.
width <- 2.1
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE

# Feature matrices are transposed after loading; labels become a plain vector.
fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))
label_train_twoclass <- as.double(read_data("label_train_twoclass.dat"))

print("GPBTSVM")

# Training.
invisible(sg("set_features", "TRAIN", fm_train_real))
invisible(sg("set_kernel", "GAUSSIAN", "REAL", size_cache, width))
invisible(sg("set_labels", "TRAIN", label_train_twoclass))
invisible(sg("new_classifier", "GPBTSVM"))
invisible(sg("svm_epsilon", epsilon))
invisible(sg("c", C))
invisible(sg("svm_use_bias", use_bias))
invisible(sg("train_classifier"))

# Prediction on the test set.
invisible(sg("set_features", "TEST", fm_test_real))
result <- sg("classify")
# k-nearest neighbor (KNN) classification on a toy data set.
#
# The number of nearest neighbors is set to k = 3 and distances are measured
# with the Euclidean metric. After training, the KNN rule is applied to
# predict the labels of the test examples.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# Feature matrices are transposed after loading; labels become a plain vector.
fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))
label_train_multiclass <- as.double(read_data("label_train_multiclass.dat"))

print("KNN")

# Number of neighbors; handed to 'train_classifier' below.
k <- 3

# Training.
invisible(sg("set_features", "TRAIN", fm_train_real))
invisible(sg("set_labels", "TRAIN", label_train_multiclass))
invisible(sg("set_distance", "EUCLIDEAN", "REAL"))
invisible(sg("new_classifier", "KNN"))
invisible(sg("train_classifier", k))

# Prediction on the test set.
invisible(sg("set_features", "TEST", fm_test_real))
result <- sg("classify")
# Linear Discriminant Analysis (LDA) on a toy data set.
#
# A linear two-class classifier is trained with Linear Discriminant Analysis
# on toy 2-dimensional examples, and the trained classifier is then used to
# predict the test examples. Note that the LDA classifier is optimal under
# the assumption that both classes are Gaussian distributed with equal
# covariance. For more details on LDA see e.g.
# http://en.wikipedia.org/wiki/Linear_discriminant_analysis

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# Feature matrices are transposed after loading; labels become a plain vector.
fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))
label_train_twoclass <- as.double(read_data("label_train_twoclass.dat"))

print("LDA")

# Training.
invisible(sg("set_features", "TRAIN", fm_train_real))
invisible(sg("set_labels", "TRAIN", label_train_twoclass))
invisible(sg("new_classifier", "LDA"))
invisible(sg("train_classifier"))

# Prediction on the test set.
invisible(sg("set_features", "TEST", fm_test_real))
result <- sg("classify")
# Two-class SVM trained with LIBSVM on a toy data set.
#
# A two-class support vector machine classifier is trained on a toy data set
# and the trained classifier is then used to predict the labels of the test
# examples.
#
# Settings used below: SVM regularization parameter C = 10, Gaussian kernel
# of width 2.1, precision parameter epsilon = 1e-5, svm_use_bias = TRUE and
# 10MB of kernel cache.
#
# For more details on the LIBSVM solver see
# http://www.csie.ntu.edu.tw/~cjlin/libsvm/

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# Solver settings.
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE

# Feature matrices are transposed after loading; labels become a plain vector.
fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))
label_train_twoclass <- as.double(read_data("label_train_twoclass.dat"))

print("LibSVM")

# Width of the Gaussian kernel.
width <- 2.1

# Training.
invisible(sg("set_features", "TRAIN", fm_train_real))
invisible(sg("set_kernel", "GAUSSIAN", "REAL", size_cache, width))
invisible(sg("set_labels", "TRAIN", label_train_twoclass))
invisible(sg("new_classifier", "LIBSVM"))
invisible(sg("svm_epsilon", epsilon))
invisible(sg("c", C))
invisible(sg("svm_use_bias", use_bias))
invisible(sg("train_classifier"))

# Prediction on the test set.
invisible(sg("set_features", "TEST", fm_test_real))
result <- sg("classify")
# Multi-class SVM trained with LIBSVM on a toy data set.
#
# A multi-class support vector machine classifier is trained on a toy data
# set and the trained classifier is then used to predict the labels of the
# test examples.
#
# Settings used below: SVM regularization parameter C = 10, svm_use_bias =
# TRUE, Gaussian kernel of width 2.1, precision parameter epsilon = 1e-5 and
# 10MB of kernel cache.
#
# For more details on the LIBSVM solver see
# http://www.csie.ntu.edu.tw/~cjlin/libsvm/

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# Kernel and solver settings.
width <- 2.1
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE

# Feature matrices are transposed after loading; labels become a plain vector.
fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))
label_train_multiclass <- as.double(read_data("label_train_multiclass.dat"))

print("LibSVMMulticlass")

# Training.
invisible(sg("set_features", "TRAIN", fm_train_real))
invisible(sg("set_kernel", "GAUSSIAN", "REAL", size_cache, width))
invisible(sg("set_labels", "TRAIN", label_train_multiclass))
invisible(sg("new_classifier", "LIBSVM_MULTICLASS"))
invisible(sg("svm_epsilon", epsilon))
invisible(sg("c", C))
invisible(sg("svm_use_bias", use_bias))
invisible(sg("train_classifier"))

# Prediction on the test set.
invisible(sg("set_features", "TEST", fm_test_real))
result <- sg("classify")
# One-class SVM trained with LIBSVM on a toy data set.
#
# The training algorithm finds a hyperplane in the RKHS which separates the
# training data from the origin. A one-class classifier is typically used to
# estimate the support of a high-dimensional distribution. For more details
# see e.g.
#   B. Schoelkopf et al. Estimating the support of a high-dimensional
#   distribution. Neural Computation, 13, 2001, 1443-1471.
#
# Settings used below: svm_nu = 0.1, Gaussian kernel of width 2.1, precision
# parameter epsilon = 1e-5, svm_use_bias = TRUE and 10MB of kernel cache.
# No labels are set; only the training features are given to the solver.
#
# For more details on the LIBSVM solver see
# http://www.csie.ntu.edu.tw/~cjlin/libsvm/

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# Kernel and solver settings.
width <- 2.1
size_cache <- 10
svm_nu <- 0.1
epsilon <- 1e-5
use_bias <- TRUE

# Feature matrices are transposed after loading.
fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))

print("LibSVMOneClass")

# Training.
invisible(sg("set_features", "TRAIN", fm_train_real))
invisible(sg("set_kernel", "GAUSSIAN", "REAL", size_cache, width))
invisible(sg("new_classifier", "LIBSVM_ONECLASS"))
invisible(sg("svm_epsilon", epsilon))
invisible(sg("svm_nu", svm_nu))
invisible(sg("svm_use_bias", use_bias))
invisible(sg("train_classifier"))

# Prediction on the test set.
invisible(sg("set_features", "TEST", fm_test_real))
result <- sg("classify")
# Two-class SVM trained with the Minimal Primal Dual SVM (MPDSVM) solver.
#
# A two-class support vector machine classifier is trained on a toy data set
# and the trained classifier is then used to predict the labels of the test
# examples.
#
# Settings used below: SVM regularization parameter C = 10, Gaussian kernel
# of width 2.1, precision parameter epsilon = 1e-5, svm_use_bias = TRUE and
# 10MB of kernel cache.
#
# For more details on the MPD solver see
#   Kienzle, W. and B. Schölkopf: Training Support Vector Machines with
#   Multiple Equality Constraints. Machine Learning: ECML 2005, 182-193.
#   (Eds.) Carbonell, J. G., J. Siekmann, Springer, Berlin, Germany (11 2005)

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# Kernel and solver settings.
width <- 2.1
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE

# Feature matrices are transposed after loading; labels become a plain vector.
fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))
label_train_twoclass <- as.double(read_data("label_train_twoclass.dat"))

print("MPDSVM")

# Training.
invisible(sg("set_features", "TRAIN", fm_train_real))
invisible(sg("set_kernel", "GAUSSIAN", "REAL", size_cache, width))
invisible(sg("set_labels", "TRAIN", label_train_twoclass))
invisible(sg("new_classifier", "MPDSVM"))
invisible(sg("svm_epsilon", epsilon))
invisible(sg("c", C))
invisible(sg("svm_use_bias", use_bias))
invisible(sg("train_classifier"))

# Prediction on the test set.
invisible(sg("set_features", "TEST", fm_test_real))
result <- sg("classify")
# Perceptron training of a two-class linear classifier.
#
# This example shows how to set up the Perceptron algorithm for training a
# two-class linear classifier, i.e. y = sign(<x,w> + b). The Perceptron
# algorithm works by iteratively passing through the training examples and
# applying the update rule to those examples which are misclassified by the
# current classifier. The Perceptron update rule reads
#
#   w(t+1) = w(t) + alpha * y_t * x_t
#   b(t+1) = b(t) + alpha * y_t
#
# where (x_t, y_t) are the feature vector and label (must be +1/-1) of the
#                  misclassified example,
#       (w(t), b(t)) are the current parameters of the linear classifier,
#       (w(t+1), b(t+1)) are the new parameters of the linear classifier,
#       alpha is the learning rate.
#
# The Perceptron algorithm iterates until all training examples are correctly
# classified or the prescribed maximal number of iterations is reached.
#
# The learning rate and the maximal number of iterations can be set by
#   sg('set_perceptron_parameters', alpha, max_iter)

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# Transposed so that each column is one data point, which is the layout the
# other real-valued shogun R examples hand to 'set_features' together with
# these same label files.
# NOTE(review): confirm against the layout of the .dat files.
fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))
label_train_twoclass <- as.double(read_data("label_train_twoclass.dat"))

print("Perceptron")

invisible(sg("set_features", "TRAIN", fm_train_real))
invisible(sg("set_labels", "TRAIN", label_train_twoclass))
invisible(sg("new_classifier", "PERCEPTRON"))

# Training often does not converge, so training and prediction stay disabled.
# invisible(sg("train_classifier"))
# invisible(sg("set_features", "TEST", fm_test_real))
# result <- sg("classify")
# Two-class SVM trained with SVM^light on DNA splice-site data.
#
# A two-class support vector machine classifier is trained on a DNA
# splice-site detection data set and the trained classifier is then used to
# predict labels on the test set.
#
# Settings used below: SVM regularization parameter C = 10, Weighted Degree
# kernel of degree 20, precision parameter epsilon = 1e-5, svm_use_bias =
# TRUE and a kernel cache size of 10.
#
# For more details on SVM^light see
#   T. Joachims. Making large-scale SVM learning practical. In Advances in
#   Kernel Methods -- Support Vector Learning, pages 169-184. MIT Press,
#   Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel see
#   G. Raetsch, S. Sonnenburg, and B. Schoelkopf. RASE: recognition of
#   alternatively spliced exons in C. elegans. Bioinformatics, 21:369-377,
#   June 2005.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# Kernel and solver settings.
degree <- 20
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE

# DNA data is passed on as read (no transpose); labels become a plain vector.
fm_train_dna <- read_data("fm_train_dna.dat")
fm_test_dna <- read_data("fm_test_dna.dat")
label_train_dna <- as.double(read_data("label_train_dna.dat"))

# Trains an SVMLIGHT classifier on the DNA training data and classifies the
# test data. Reads the settings and data defined above and returns whatever
# sg("classify") yields.
dosvmlight <- function() {
  print("SVMLight")

  # Training.
  sg("set_features", "TRAIN", fm_train_dna, "DNA")
  sg("set_kernel", "WEIGHTEDDEGREE", "CHAR", size_cache, degree)
  sg("set_labels", "TRAIN", label_train_dna)
  sg("new_classifier", "SVMLIGHT")
  sg("svm_epsilon", epsilon)
  sg("c", C)
  sg("svm_use_bias", use_bias)
  sg("train_classifier")

  # Prediction on the test set.
  sg("set_features", "TEST", fm_test_dna, "DNA")
  sg("classify")
}

# The run is best effort: a failure is reported and the script carries on.
# Presumably SVMLight is not available in every shogun build -- TODO confirm.
# `result` holds the predictions, or NULL when the run failed.
result <- tryCatch(
  dosvmlight(),
  error = function(e) {
    message("SVMLight example failed: ", conditionMessage(e))
    NULL
  }
)
# Agglomerative hierarchical single-linkage clustering of a toy data set.
#
# Starting with each object assigned to its own cluster, clusters are merged
# iteratively. At each step the two clusters whose closest elements have the
# minimum distance are merged; the distance is set here to the Euclidean
# distance.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# The feature matrix is transposed after loading.
fm_train <- t(read_data("fm_train_real.dat"))

print("Hierarchical")

# Value handed to 'train_clustering' below.
merges <- 3

invisible(sg("set_features", "TRAIN", fm_train))
invisible(sg("set_distance", "EUCLIDEAN", "REAL"))
invisible(sg("new_clustering", "HIERARCHICAL"))
invisible(sg("train_clustering", merges))

# 'get_clustering' returns a list: merge distances first, merged pairs second.
result <- sg("get_clustering")
merge_distances <- result[[1]]
pairs <- result[[2]]
# k-means clustering of a toy data set.
#
# In k-means clustering one tries to partition n observations into k clusters
# in which each observation belongs to the cluster with the nearest mean.
# The algorithm takes the number of clusters and a distance as input; the
# distance used in this example is the Euclidean distance. After training,
# the result of the clustering can be fetched as the cluster centers and
# their radii.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

# Transposed so that each column is one data point, which is the layout the
# other real-valued shogun R examples hand to 'set_features'.
# NOTE(review): confirm against the layout of the .dat file.
fm_train <- t(read_data("fm_train_real.dat"))

print("KMeans")

# Number of clusters and the iteration value handed to 'train_clustering'.
k <- 3
iter <- 1000

invisible(sg("set_distance", "EUCLIDEAN", "REAL"))
invisible(sg("set_features", "TRAIN", fm_train))
invisible(sg("new_clustering", "KMEANS"))
invisible(sg("train_clustering", k, iter))

# 'get_clustering' returns a list: radii first, cluster centers second.
result <- sg("get_clustering")
radi <- result[[1]]
centers <- result[[2]]
# Bray-Curtis distance between real-valued data sets.
#
# How input data is read from a file and handed to shogun is a crucial part
# of writing your own applications; this script is just one example of what
# can be done with the distance functions shogun provides. First decide what
# type your data has, because the type determines which distance functions
# you can use.
#
# Two stored matrices of real values (feature type 'REAL') are loaded from
# different files and the distance is initialized to 'BRAYCURTIS'. Each
# column of the matrices corresponds to one data point.
#
# - After 'set_features' with target 'TRAIN', 'get_distance_matrix' with
#   target 'TRAIN' yields the pairwise distance matrix of those data points.
# - After 'set_features' with target 'TEST', 'get_distance_matrix' with
#   target 'TEST' yields the pairwise distance matrix between the 'TRAIN'
#   and 'TEST' data points. The 'TRAIN' distance matrix has then ceased to
#   exist.
#
# For more details see doc/classshogun_1_1CBrayCurtisDistance.html.
# Using the Bray-Curtis distance is of course not limited to this showcase.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))

print("BrayCurtisDistance")

invisible(sg("set_distance", "BRAYCURTIS", "REAL"))

# Pairwise distances among the training points.
invisible(sg("set_features", "TRAIN", fm_train_real))
dm <- sg("get_distance_matrix", "TRAIN")

# Distances between training and test points; replaces the previous dm.
invisible(sg("set_features", "TEST", fm_test_real))
dm <- sg("get_distance_matrix", "TEST")
# Canberra metric between real-valued data sets.
#
# How input data is read from a file and handed to shogun is a crucial part
# of writing your own applications; this script is just one example of what
# can be done with the distance functions shogun provides. First decide what
# type your data has, because the type determines which distance functions
# you can use.
#
# Two stored matrices of real values (feature type 'REAL') are loaded from
# different files and the distance is initialized to 'CANBERRA'. Each column
# of the matrices corresponds to one data point.
#
# - After 'set_features' with target 'TRAIN', 'get_distance_matrix' with
#   target 'TRAIN' yields the pairwise distance (dissimilarity ratio) matrix
#   of those data points.
# - After 'set_features' with target 'TEST', 'get_distance_matrix' with
#   target 'TEST' yields the pairwise distance (dissimilarity ratio) matrix
#   between the 'TRAIN' and 'TEST' data sets. The 'TRAIN' distance matrix
#   has then ceased to exist.
#
# For more details see doc/classshogun_1_1CCanberraMetric.html.
# Using the Canberra distance is of course not limited to this showcase.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))

print("CanberraMetric")

invisible(sg("set_distance", "CANBERRA", "REAL"))

# Pairwise distances among the training points.
invisible(sg("set_features", "TRAIN", fm_train_real))
dm <- sg("get_distance_matrix", "TRAIN")

# Distances between training and test points; replaces the previous dm.
invisible(sg("set_features", "TEST", fm_test_real))
dm <- sg("get_distance_matrix", "TEST")
# Canberra word distance between DNA string data sets.
#
# How input data is read from a file and handed to shogun is a crucial part
# of writing your own applications; this script is just one example of what
# can be done with the distance functions shogun provides. First decide what
# type your data has, because the type determines which distance functions
# you can use.
#
# Two stored data sets in 'STRING' representation (feature type 'CHAR' with
# alphabet 'DNA') are loaded from different files and the distance is
# initialized to 'CANBERRA' with feature type 'WORD'.
#
# The data points are defined by the transformation function 'convert' and
# the preprocessing step applied afterwards (defined by 'add_preproc' and
# the preprocessor 'SORTWORDSTRING').
#
# - 'set_features' with target 'TRAIN' binds the given data points. Two
#   preprocessing steps are needed before 'get_distance_matrix' can compute
#   a pairwise distance matrix: 'convert' transforms the input data to a
#   string representation suitable for the selected distance, and the
#   individual strings are sorted in ascending order after the execution of
#   'attach_preproc'. The matrix is then obtained by 'get_distance_matrix'
#   with target 'TRAIN'.
# - 'set_features' with target 'TEST' binds the 'TEST' data points next to
#   'TRAIN'. The same two steps ('convert', then 'attach_preproc') are
#   applied to 'TEST', after which 'get_distance_matrix' with target 'TEST'
#   computes the pairwise distance matrix between the 'TRAIN' and 'TEST'
#   data sets. The 'TRAIN' distance matrix has then ceased to exist.
#
# For more details see
#   doc/classshogun_1_1CSortWordString.html,
#   doc/classshogun_1_1CPreprocessor.html,
#   doc/classshogun_1_1CStringFeatures.html (method
#   obtain_from_char_features) and
#   doc/classshogun_1_1CCanberraWordDistance.html.
#
# Using the Canberra word distance is of course not limited to this showcase.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

fm_train_dna <- read_data("fm_train_dna.dat")
fm_test_dna <- read_data("fm_test_dna.dat")

# Arguments forwarded to 'convert' below.
order <- 3
gap <- 0
reverse <- "n"

print("CanberraWordDistance")

invisible(sg("set_distance", "CANBERRA", "WORD"))
invisible(sg("add_preproc", "SORTWORDSTRING"))

# Training data: bind, convert to word strings, attach the preprocessor.
invisible(sg("set_features", "TRAIN", fm_train_dna, "DNA"))
invisible(sg(
  "convert", "TRAIN", "STRING", "CHAR", "STRING", "WORD",
  order, order - 1, gap, reverse
))
invisible(sg("attach_preproc", "TRAIN"))
dm <- sg("get_distance_matrix", "TRAIN")

# Test data: same steps; the resulting matrix replaces the previous dm.
invisible(sg("set_features", "TEST", fm_test_dna, "DNA"))
invisible(sg(
  "convert", "TEST", "STRING", "CHAR", "STRING", "WORD",
  order, order - 1, gap, reverse
))
invisible(sg("attach_preproc", "TEST"))
dm <- sg("get_distance_matrix", "TEST")
# Chebyshew metric between real-valued data sets.
#
# How input data is read from a file and handed to shogun is a crucial part
# of writing your own applications; this script is just one example of what
# can be done with the distance functions shogun provides. First decide what
# type your data has, because the type determines which distance functions
# you can use.
#
# Two stored matrices of real values (feature type 'REAL') are loaded from
# different files and the distance is initialized to 'CHEBYSHEW'. Each
# column of the matrices corresponds to one data point.
#
# - After 'set_features' with target 'TRAIN', 'get_distance_matrix' with
#   target 'TRAIN' yields the pairwise distance matrix (maximum of absolute
#   feature dimension differences) of those data points.
# - After 'set_features' with target 'TEST', 'get_distance_matrix' with
#   target 'TEST' yields the pairwise distance matrix (maximum of absolute
#   feature dimension differences) between the 'TRAIN' and 'TEST' data
#   sets. The 'TRAIN' distance matrix has then ceased to exist.
#
# For more details see doc/classshogun_1_1CChebyshewMetric.html.
# Using the Chebyshew distance is of course not limited to this showcase.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))

print("ChebyshewMetric")

invisible(sg("set_distance", "CHEBYSHEW", "REAL"))

# Pairwise distances among the training points.
invisible(sg("set_features", "TRAIN", fm_train_real))
dm <- sg("get_distance_matrix", "TRAIN")

# Distances between training and test points; replaces the previous dm.
invisible(sg("set_features", "TEST", fm_test_real))
dm <- sg("get_distance_matrix", "TEST")
# Chi-square distance between real-valued data sets.
#
# How input data is read from a file and handed to shogun is a crucial part
# of writing your own applications; this script is just one example of what
# can be done with the distance functions shogun provides. First decide what
# type your data has, because the type determines which distance functions
# you can use.
#
# Two stored matrices of real values (feature type 'REAL') are loaded from
# different files and the distance is initialized to 'CHISQUARE'. Each
# column of the matrices corresponds to one data point.
#
# - After 'set_features' with target 'TRAIN', 'get_distance_matrix' with
#   target 'TRAIN' yields the pairwise distance matrix of those data points.
# - After 'set_features' with target 'TEST', 'get_distance_matrix' with
#   target 'TEST' yields the pairwise distance matrix between the 'TRAIN'
#   and 'TEST' matrices. The 'TRAIN' distance matrix has then ceased to
#   exist.
#
# For more details see doc/classshogun_1_1CChiSquareDistance.html.
# Using the ChiSquare distance is of course not limited to this showcase.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))

print("ChiSquareDistance")

invisible(sg("set_distance", "CHISQUARE", "REAL"))

# Pairwise distances among the training points.
invisible(sg("set_features", "TRAIN", fm_train_real))
dm <- sg("get_distance_matrix", "TRAIN")

# Distances between training and test points; replaces the previous dm.
invisible(sg("set_features", "TEST", fm_test_real))
dm <- sg("get_distance_matrix", "TEST")
# Cosine distance between real-valued data sets.
#
# How input data is read from a file and handed to shogun is a crucial part
# of writing your own applications; this script is just one example of what
# can be done with the distance functions shogun provides. First decide what
# type your data has, because the type determines which distance functions
# you can use.
#
# Two stored matrices of real values (feature type 'REAL') are loaded from
# different files and the distance is initialized to 'COSINE'. Each column
# of the matrices corresponds to one data point.
#
# - After 'set_features' with target 'TRAIN', 'get_distance_matrix' with
#   target 'TRAIN' yields the pairwise distance matrix of those data points.
# - After 'set_features' with target 'TEST', 'get_distance_matrix' with
#   target 'TEST' yields the pairwise distance matrix between the 'TRAIN'
#   and 'TEST' data sets. The 'TRAIN' distance matrix has then ceased to
#   exist.
#
# For more details see doc/classshogun_1_1CCosineDistance.html.
# Using the Cosine distance is of course not limited to this showcase.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))

print("CosineDistance")

invisible(sg("set_distance", "COSINE", "REAL"))

# Pairwise distances among the training points.
invisible(sg("set_features", "TRAIN", fm_train_real))
dm <- sg("get_distance_matrix", "TRAIN")

# Distances between training and test points; replaces the previous dm.
invisible(sg("set_features", "TEST", fm_test_real))
dm <- sg("get_distance_matrix", "TEST")
# Euclidean distance between real-valued data sets.
#
# How input data is read from a file and handed to shogun is a crucial part
# of writing your own applications; this script is just one example of what
# can be done with the distance functions shogun provides. First decide what
# type your data has, because the type determines which distance functions
# you can use.
#
# Two stored matrices of real values (feature type 'REAL') are loaded from
# different files and the distance is initialized to 'EUCLIDEAN'. Each
# column of the matrices corresponds to one data point.
#
# - After 'set_features' with target 'TRAIN', 'get_distance_matrix' with
#   target 'TRAIN' yields the pairwise distance matrix of those data points.
# - After 'set_features' with target 'TEST', 'get_distance_matrix' with
#   target 'TEST' yields the pairwise distance matrix between the 'TRAIN'
#   and 'TEST' data sets. The 'TRAIN' distance matrix has then ceased to
#   exist.
#
# For more details see doc/classshogun_1_1CEuclidianDistance.html.
# Using the Euclidean distance is of course not limited to this showcase.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))

print("EuclideanDistance")

invisible(sg("set_distance", "EUCLIDEAN", "REAL"))

# Pairwise distances among the training points.
invisible(sg("set_features", "TRAIN", fm_train_real))
dm <- sg("get_distance_matrix", "TRAIN")

# Distances between training and test points; replaces the previous dm.
invisible(sg("set_features", "TEST", fm_test_real))
dm <- sg("get_distance_matrix", "TEST")
# Geodesic metric between real-valued data sets.
#
# How input data is read from a file and handed to shogun is a crucial part
# of writing your own applications; this script is just one example of what
# can be done with the distance functions shogun provides. First decide what
# type your data has, because the type determines which distance functions
# you can use.
#
# Two stored matrices of real values (feature type 'REAL') are loaded from
# different files and the distance is initialized to 'GEODESIC'. Each column
# of the matrices corresponds to one data point.
#
# - After 'set_features' with target 'TRAIN', 'get_distance_matrix' with
#   target 'TRAIN' yields the pairwise distance (shortest path on a sphere)
#   matrix of those data points.
# - After 'set_features' with target 'TEST', 'get_distance_matrix' with
#   target 'TEST' yields the pairwise distance (shortest path on a sphere)
#   matrix between the 'TRAIN' and 'TEST' data sets. The 'TRAIN' distance
#   matrix has then ceased to exist.
#
# For more details see doc/classshogun_1_1CGeodesicMetric.html.
# Using the Geodesic distance is of course not limited to this showcase.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

fm_train_real <- t(read_data("fm_train_real.dat"))
fm_test_real <- t(read_data("fm_test_real.dat"))

print("GeodesicMetric")

invisible(sg("set_distance", "GEODESIC", "REAL"))

# Pairwise distances among the training points.
invisible(sg("set_features", "TRAIN", fm_train_real))
dm <- sg("get_distance_matrix", "TRAIN")

# Distances between training and test points; replaces the previous dm.
invisible(sg("set_features", "TEST", fm_test_real))
dm <- sg("get_distance_matrix", "TEST")
# Hamming word distance between DNA string data sets.
#
# How input data is read from a file and handed to shogun is a crucial part
# of writing your own applications; this script is just one example of what
# can be done with the distance functions shogun provides. First decide what
# type your data has, because the type determines which distance functions
# you can use.
#
# Two stored data sets in 'STRING' representation (feature type 'CHAR' with
# alphabet 'DNA') are loaded from different files and the distance is
# initialized to 'HAMMING' with feature type 'WORD'.
#
# The data points are defined by the transformation function 'convert' and
# the preprocessing step applied afterwards (defined by 'add_preproc' and
# the preprocessor 'SORTWORDSTRING').
#
# - 'set_features' with target 'TRAIN' binds the given data points. Two
#   preprocessing steps are needed before 'get_distance_matrix' can compute
#   a pairwise distance matrix: 'convert' transforms the input data to a
#   string representation suitable for the selected distance, and the
#   individual strings are sorted in ascending order after the execution of
#   'attach_preproc'. The matrix is then obtained by 'get_distance_matrix'
#   with target 'TRAIN'.
# - 'set_features' with target 'TEST' binds the 'TEST' data points next to
#   'TRAIN'. The same two steps ('convert', then 'attach_preproc') are
#   applied to 'TEST', after which 'get_distance_matrix' with target 'TEST'
#   computes the pairwise distance matrix between the 'TRAIN' and 'TEST'
#   data sets. The 'TRAIN' distance matrix has then ceased to exist.
#
# For more details see
#   doc/classshogun_1_1CSortWordString.html,
#   doc/classshogun_1_1CPreprocessor.html,
#   doc/classshogun_1_1CStringFeatures.html (method
#   obtain_from_char_features) and
#   doc/classshogun_1_1CHammingWordDistance.html.
#
# Using the Hamming word distance is of course not limited to this showcase.

library("sg")

# Reads a table from ../data and returns it as a matrix.
read_data <- function(name) {
  as.matrix(read.table(file.path("..", "data", name)))
}

fm_train_dna <- read_data("fm_train_dna.dat")
fm_test_dna <- read_data("fm_test_dna.dat")

# Arguments forwarded to 'convert' below.
order <- 3
gap <- 0
reverse <- "n"

print("HammingWordDistance")

invisible(sg("set_distance", "HAMMING", "WORD"))
invisible(sg("add_preproc", "SORTWORDSTRING"))

# Training data: bind, convert to word strings, attach the preprocessor.
invisible(sg("set_features", "TRAIN", fm_train_dna, "DNA"))
invisible(sg(
  "convert", "TRAIN", "STRING", "CHAR", "STRING", "WORD",
  order, order - 1, gap, reverse
))
invisible(sg("attach_preproc", "TRAIN"))
dm <- sg("get_distance_matrix", "TRAIN")

# Test data: same steps; the resulting matrix replaces the previous dm.
invisible(sg("set_features", "TEST", fm_test_dna, "DNA"))
invisible(sg(
  "convert", "TEST", "STRING", "CHAR", "STRING", "WORD",
  order, order - 1, gap, reverse
))
invisible(sg("attach_preproc", "TEST"))
dm <- sg("get_distance_matrix", "TEST")
# Jensen-Shannon distance/divergence on real-valued features.
#
# This example shows how input data is read from files and handed to one of
# the distance functions provided by shogun; it is just one example of what
# can be done with them.
#
# First determine the type of your data, because it decides which distance
# function you can use. Here two stored matrices of real values (feature type
# 'REAL') are loaded from different files and the distance is set to
# 'JENSEN'. Each column of the matrices corresponds to one data point.
#
# 'set_features' with target 'TRAIN' binds the training points;
# 'get_distance_matrix' with target 'TRAIN' then returns their pairwise
# distance (divergence measure based on the Kullback-Leibler divergence)
# matrix.
#
# 'set_features' with target 'TEST' binds the test points;
# 'get_distance_matrix' with target 'TEST' then returns the distance matrix
# between the 'TRAIN' and 'TEST' data sets. At that point the 'TRAIN'
# distance matrix has ceased to exist.
#
# For more details see doc/classshogun_1_1CJensenMetric.html.
library("sg")

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# Jensen Metric
print("JensenMetric")
dump <- sg("set_distance", "JENSEN", "REAL")

# Distances among the training points.
dump <- sg("set_features", "TRAIN", fm_train_real)
dm <- sg("get_distance_matrix", "TRAIN")

# Distances between training and test points (dm is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
dm <- sg("get_distance_matrix", "TEST")
# Manhattan distance on real-valued features.
#
# This example shows how input data is read from files and handed to one of
# the distance functions provided by shogun; it is just one example of what
# can be done with them.
#
# First determine the type of your data, because it decides which distance
# function you can use. Here two stored matrices of real values (feature type
# 'REAL') are loaded from different files and the distance is set to
# 'MANHATTAN'. Each column of the matrices corresponds to one data point.
#
# 'set_features' with target 'TRAIN' binds the training points;
# 'get_distance_matrix' with target 'TRAIN' then returns their pairwise
# distance (sum of absolute feature dimension differences) matrix.
#
# 'set_features' with target 'TEST' binds the test points;
# 'get_distance_matrix' with target 'TEST' then returns the distance matrix
# between the 'TRAIN' and 'TEST' data sets. At that point the 'TRAIN'
# distance matrix has ceased to exist.
#
# For more details see doc/classshogun_1_1CManhattanMetric.html.
library("sg")

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# Manhattan Metric
print("ManhattanMetric")
dump <- sg("set_distance", "MANHATTAN", "REAL")

# Distances among the training points.
dump <- sg("set_features", "TRAIN", fm_train_real)
dm <- sg("get_distance_matrix", "TRAIN")

# Distances between training and test points (dm is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
dm <- sg("get_distance_matrix", "TEST")
# Manhattan word distance on DNA strings.
#
# This example shows how input data is read from files and handed to one of
# the distance functions provided by shogun; it is just one example of what
# can be done with them.
#
# First determine the type of your data, because it decides which distance
# function you can use. Here two stored data sets in 'STRING' representation
# (feature type 'CHAR' with alphabet 'DNA') are loaded from different files
# and the distance is set to 'MANHATTAN' with feature type 'WORD'.
#
# The data points are defined by the transformation function 'convert' and
# the preprocessing step applied afterwards (preprocessor 'SORTWORDSTRING',
# registered with 'add_preproc').
#
# For target 'TRAIN', 'set_features' binds the data points. Two preprocessing
# steps are needed before 'get_distance_matrix' can be called: 'convert'
# transforms the input to a string representation suitable for the selected
# distance, and 'attach_preproc' sorts the individual strings in ascending
# order. 'get_distance_matrix' with target 'TRAIN' then returns the pairwise
# distance matrix.
#
# For target 'TEST' the same two steps are applied to the test data, and
# 'get_distance_matrix' with target 'TEST' returns the distance matrix
# between the 'TRAIN' and 'TEST' data sets. At that point the 'TRAIN'
# distance matrix has ceased to exist.
#
# For more details see
# doc/classshogun_1_1CSortWordString.html,
# doc/classshogun_1_1CPreprocessor.html,
# doc/classshogun_1_1CStringFeatures.html (method obtain_from_char_features)
# and doc/classshogun_1_1CManhattanWordDistance.html.
library("sg")

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Arguments forwarded to 'convert'.
order <- 3
gap <- 0
reverse <- "n"

# Manhattan Word Distance
print("ManhattanWordDistance")
dump <- sg("set_distance", "MANHATTAN", "WORD")
dump <- sg("add_preproc", "SORTWORDSTRING")

# Training data: bind, convert CHAR strings to WORD strings, sort.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
dump <- sg("convert", "TRAIN", "STRING", "CHAR", "STRING", "WORD",
           order, order - 1, gap, reverse)
dump <- sg("attach_preproc", "TRAIN")
dm <- sg("get_distance_matrix", "TRAIN")

# Test data: same steps, then distances between TRAIN and TEST.
dump <- sg("set_features", "TEST", fm_test_dna, "DNA")
dump <- sg("convert", "TEST", "STRING", "CHAR", "STRING", "WORD",
           order, order - 1, gap, reverse)
dump <- sg("attach_preproc", "TEST")
dm <- sg("get_distance_matrix", "TEST")
# Minkowski metric on real-valued features.
#
# This example shows how input data is read from files and handed to one of
# the distance functions provided by shogun; it is just one example of what
# can be done with them.
#
# First determine the type of your data, because it decides which distance
# function you can use. Here two stored matrices of real values (feature type
# 'REAL') are loaded from different files and the distance is set to
# 'MINKOWSKI' with norm 'k'. Each column of the matrices corresponds to one
# data point.
#
# 'set_features' with target 'TRAIN' binds the training points;
# 'get_distance_matrix' with target 'TRAIN' then returns their pairwise
# distance matrix.
#
# 'set_features' with target 'TEST' binds the test points;
# 'get_distance_matrix' with target 'TEST' then returns the distance matrix
# between the 'TRAIN' and 'TEST' data sets. At that point the 'TRAIN'
# distance matrix has ceased to exist.
#
# For more details see doc/classshogun_1_1CMinkowskiMetric.html.
library("sg")

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# Norm passed to the Minkowski distance.
k <- 3

# Minkowski Metric
print("MinkowskiMetric")
dump <- sg("set_distance", "MINKOWSKI", "REAL", k)

# Distances among the training points.
dump <- sg("set_features", "TRAIN", fm_train_real)
dm <- sg("get_distance_matrix", "TRAIN")

# Distances between training and test points (dm is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
dm <- sg("get_distance_matrix", "TEST")
# Tanimoto distance/coefficient on real-valued features.
#
# This example shows how input data is read from files and handed to one of
# the distance functions provided by shogun; it is just one example of what
# can be done with them.
#
# First determine the type of your data, because it decides which distance
# function you can use. Here two stored matrices of real values (feature type
# 'REAL') are loaded from different files and the distance is set to
# 'TANIMOTO'. Each column of the matrices corresponds to one data point.
#
# 'set_features' with target 'TRAIN' binds the training points;
# 'get_distance_matrix' with target 'TRAIN' then returns their pairwise
# distance (extended Jaccard coefficient) matrix.
#
# 'set_features' with target 'TEST' binds the test points;
# 'get_distance_matrix' with target 'TEST' then returns the distance matrix
# between the 'TRAIN' and 'TEST' data sets. At that point the 'TRAIN'
# distance matrix has ceased to exist.
#
# For more details see doc/classshogun_1_1CTanimotoDistance.html.
library("sg")

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# Tanimoto Distance
print("TanimotoDistance")
dump <- sg("set_distance", "TANIMOTO", "REAL")

# Distances among the training points.
dump <- sg("set_features", "TRAIN", fm_train_real)
dm <- sg("get_distance_matrix", "TRAIN")

# Distances between training and test points (dm is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
dm <- sg("get_distance_matrix", "TEST")
# In this example the Histogram algorithm object computes a histogram over all
# 16bit unsigned integers in the features.
#
# NOTE: the distribution calls themselves are commented out below, so as
# written the script only prepares the sorted WORD string features.
library("sg")

# Arguments forwarded to 'convert'.
order <- 3
gap <- 0
reverse <- "n"

# Only the DNA data set is used by this example.
fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))

#
# distributions
#

# Histogram
print("Histogram")

# sg('new_distribution', 'HISTOGRAM')
dump <- sg("add_preproc", "SORTWORDSTRING")

# Bind the training strings, convert CHAR strings to WORD strings, sort them.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
dump <- sg("convert", "TRAIN", "STRING", "CHAR", "STRING", "WORD",
           order, order - 1, gap, reverse)
dump <- sg("attach_preproc", "TRAIN")

# Disabled steps, kept for reference (not valid R as written):
# sg('train_distribution')
# histo=sg('get_histogram')
# num_examples=11
# num_param=sg('get_histogram_num_model_parameters')
# for i in xrange(num_examples):
#   for j in xrange(num_param):
#     sg('get_log_derivative %d %d' % (j, i))
# sg('get_log_likelihood')
# sg('get_log_likelihood_sample')
# In this example a hidden markov model is trained on a string data set
# (alphabet 'CUBE') and its likelihood is evaluated.
#
# NOTE(review): the original description speaks of "3 states and 6
# transitions"; in the code N = 3 and M = 6 are simply the two arguments of
# 'new_hmm' - confirm what M stands for against the shogun documentation.
library("sg")

# The file is read with all columns as character strings.
fm_train_cube <- as.matrix(
  read.table("../data/fm_train_cube.dat", colClasses = c("character"))
)

# HMM
print("HMM")

N <- 3
M <- 6
order <- 1

# Bind the training strings and convert CHAR strings to WORD strings.
dump <- sg("set_features", "TRAIN", fm_train_cube, "CUBE")
dump <- sg("convert", "TRAIN", "STRING", "CHAR", "STRING", "WORD", order)

# Create a model, run 'bw' on it and fetch the resulting model.
dump <- sg("new_hmm", N, M)
dump <- sg("bw")
hmm <- sg("get_hmm")

# Create a fresh model, load the four fetched components into it and
# evaluate its likelihood.
dump <- sg("new_hmm", N, M)
dump <- sg("set_hmm", hmm[[1]], hmm[[2]], hmm[[3]], hmm[[4]])
likelihood <- sg("hmm_likelihood")
# Trains an inhomogeneous Markov chain of order 3 on a DNA string data set.
# Due to the structure of the Markov chain it is very similar to a HMM with
# just one chain of connected hidden states - that is why we termed this
# linear HMM.
#
# NOTE: the distribution calls themselves are commented out below, so as
# written the script only prepares the sorted WORD string features.
library("sg")

# Arguments forwarded to 'convert'.
order <- 3
gap <- 0
reverse <- "n"

# Only the DNA data set is used by this example.
fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))

# Linear HMM
print("LinearHMM")

# sg('new_distribution', 'LinearHMM')
dump <- sg("add_preproc", "SORTWORDSTRING")

# Bind the training strings, convert CHAR strings to WORD strings, sort them.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
dump <- sg("convert", "TRAIN", "STRING", "CHAR", "STRING", "WORD",
           order, order - 1, gap, reverse)
dump <- sg("attach_preproc", "TRAIN")

# Disabled steps, kept for reference (not valid R as written):
# sg('train_distribution')
# histo=sg('get_histogram')
# num_examples=11
# num_param=sg('get_histogram_num_model_parameters')
# for i in xrange(num_examples):
#   for j in xrange(num_param):
#     sg('get_log_derivative %d %d' % (j, i))
# sg('get_log_likelihood')
# sg('get_log_likelihood_sample')
# This is an example for the initialization of the chi2-kernel on real data,
# where each column of the matrices corresponds to one training/test example.
library("sg")

size_cache <- 10
width <- 1.4

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# CHI2
print("Chi2")
dump <- sg("set_kernel", "CHI2", "REAL", size_cache, width)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_real)
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
km <- sg("get_kernel_matrix", "TEST")
# This is an example for the initialization of a combined kernel, which is a
# weighted sum of (in this case three) kernels on real valued data. The
# sub-kernel weights are all set to 1.
library("sg")

size_cache <- 10

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# Combined
print("Combined")

# Start from empty feature sets before adding per-kernel features.
dump <- sg("clean_features", "TRAIN")
dump <- sg("clean_features", "TEST")
dump <- sg("set_kernel", "COMBINED", size_cache)

# Sub-kernel 1: linear, weight 1.
dump <- sg("add_kernel", 1, "LINEAR", "REAL", size_cache)
dump <- sg("add_features", "TRAIN", fm_train_real)
dump <- sg("add_features", "TEST", fm_test_real)

# Sub-kernel 2: Gaussian, weight 1.
dump <- sg("add_kernel", 1, "GAUSSIAN", "REAL", size_cache, 1)
dump <- sg("add_features", "TRAIN", fm_train_real)
dump <- sg("add_features", "TEST", fm_test_real)

# Sub-kernel 3: polynomial, weight 1.
dump <- sg("add_kernel", 1, "POLY", "REAL", size_cache, 3, FALSE)
dump <- sg("add_features", "TRAIN", fm_train_real)
dump <- sg("add_features", "TEST", fm_test_real)

# Kernel matrices for both targets (km is overwritten by the second call).
km <- sg("get_kernel_matrix", "TRAIN")
km <- sg("get_kernel_matrix", "TEST")
# This is an example for the initialization of the CommUlongString-kernel.
# This kernel sums over k-mer matches (k = 'order'). For efficient computing
# a preprocessor is used that extracts and sorts all k-mers. If 'use_sign' is
# set, each k-mer is counted only once.
library("sg")

size_cache <- 10
use_sign <- FALSE
normalization <- "FULL"

# Arguments forwarded to 'convert'.
order <- 3
gap <- 0
reverse <- "n"

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Comm Ulong String
print("CommUlongString")
dump <- sg("add_preproc", "SORTULONGSTRING")
dump <- sg("set_kernel", "COMMSTRING", "ULONG", size_cache, use_sign,
           normalization)

# Training data: bind, convert CHAR strings to ULONG strings, sort.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
dump <- sg("convert", "TRAIN", "STRING", "CHAR", "STRING", "ULONG",
           order, order - 1, gap, reverse)
dump <- sg("attach_preproc", "TRAIN")
km <- sg("get_kernel_matrix", "TRAIN")

# Test data: same steps (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_dna, "DNA")
dump <- sg("convert", "TEST", "STRING", "CHAR", "STRING", "ULONG",
           order, order - 1, gap, reverse)
dump <- sg("attach_preproc", "TEST")
km <- sg("get_kernel_matrix", "TEST")
# This is an example for the initialization of the CommWordString-kernel (aka
# Spectrum or n-gram kernel; its name is derived from the unix command comm).
# This kernel sums over k-mer matches (k = 'order'). For efficient computing
# a preprocessor is used that extracts and sorts all k-mers. If 'use_sign' is
# set, each k-mer is counted only once.
library("sg")

size_cache <- 10
use_sign <- FALSE
normalization <- "FULL"

# Arguments forwarded to 'convert'.
order <- 3
gap <- 0
reverse <- "n"

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Comm Word String
print("CommWordString")
dump <- sg("add_preproc", "SORTWORDSTRING")
dump <- sg("set_kernel", "COMMSTRING", "WORD", size_cache, use_sign,
           normalization)

# Training data: bind, convert CHAR strings to WORD strings, sort.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
dump <- sg("convert", "TRAIN", "STRING", "CHAR", "STRING", "WORD",
           order, order - 1, gap, reverse)
dump <- sg("attach_preproc", "TRAIN")
km <- sg("get_kernel_matrix", "TRAIN")

# Test data: same steps (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_dna, "DNA")
dump <- sg("convert", "TEST", "STRING", "CHAR", "STRING", "WORD",
           order, order - 1, gap, reverse)
dump <- sg("attach_preproc", "TEST")
km <- sg("get_kernel_matrix", "TEST")
# The constant kernel gives a trivial kernel matrix with all entries set to
# the same value, defined by the kernel argument 'c' (held in 'const_value'
# below so that the variable does not mask base::c()).
library("sg")

size_cache <- 10

# NOTE(review): unlike the other real-valued examples, the matrices are not
# transposed with t() here - confirm that this is intended.
fm_train_real <- as.matrix(read.table("../data/fm_train_real.dat"))
fm_test_real <- as.matrix(read.table("../data/fm_test_real.dat"))

# Const
print("Const")
const_value <- 23
dump <- sg("set_kernel", "CONST", "REAL", size_cache, const_value)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_real)
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
km <- sg("get_kernel_matrix", "TEST")
# This is an example for the initialization of the diag-kernel.
# The diag kernel has all kernel matrix entries but those on
# the main diagonal set to zero.
library("sg")

size_cache <- 10

# NOTE(review): unlike the other real-valued examples, the matrices are not
# transposed with t() here - confirm that this is intended.
fm_train_real <- as.matrix(read.table("../data/fm_train_real.dat"))
fm_test_real <- as.matrix(read.table("../data/fm_test_real.dat"))

# Diag
print("Diag")
# Kernel parameter; named 'diag_value' so it does not mask base::diag().
diag_value <- 23
dump <- sg("set_kernel", "DIAG", "REAL", size_cache, diag_value)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_real)
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
km <- sg("get_kernel_matrix", "TEST")
# With the distance kernel one can use any of the following distance metrics:
# MINKOWSKI MANHATTAN HAMMING CANBERRA CHEBYSHEW GEODESIC JENSEN CHISQUARE
# TANIMOTO COSINE BRAYCURTIS EUCLIDEAN
#
# This example uses 'EUCLIDEAN' on real-valued features.
library("sg")

size_cache <- 10
width <- 1.7

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# Distance
print("Distance")

# The distance is selected first, then the kernel that is built on it.
dump <- sg("set_distance", "EUCLIDEAN", "REAL")
dump <- sg("set_kernel", "DISTANCE", size_cache, width)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_real)
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
km <- sg("get_kernel_matrix", "TEST")
# The FixedDegree String kernel takes as input two strings of same size and
# counts the number of matches of length d.
library("sg")

size_cache <- 10
degree <- 3

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Fixed Degree String
print("FixedDegreeString")
dump <- sg("set_kernel", "FIXEDDEGREE", "CHAR", size_cache, degree)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_dna, "DNA")
km <- sg("get_kernel_matrix", "TEST")
# The well known Gaussian kernel (swiss army knife for SVMs) on dense real
# valued features.
library("sg")

size_cache <- 10
width <- 1.9

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# Gaussian
print("Gaussian")
dump <- sg("set_kernel", "GAUSSIAN", "REAL", size_cache, width)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_real)
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
km <- sg("get_kernel_matrix", "TEST")
# An experimental kernel inspired by the WeightedDegreePositionStringKernel
# and the Gaussian kernel. The idea is to shift the dimensions of the input
# vectors against each other. 'shift_step' is the step size of the shifts and
# 'max_shift' is the maximal shift.
library("sg")

size_cache <- 10
width <- 1.8
max_shift <- 2
shift_step <- 1

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# GaussianShift
print("GaussianShift")
dump <- sg("set_kernel", "GAUSSIANSHIFT", "REAL", size_cache, width,
           max_shift, shift_step)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_real)
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
km <- sg("get_kernel_matrix", "TEST")
# The HistogramWordString computes the TOP kernel on inhomogeneous Markov
# Chains.
library("sg")

size_cache <- 10

# Arguments forwarded to 'convert'.
order <- 3
gap <- 0
reverse <- "n"

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))
label_train_dna <- as.double(
  as.matrix(read.table("../data/label_train_dna.dat"))
)

# PluginEstimate
print("PluginEstimate w/ HistogramWord")

# Bind both data sets and convert CHAR strings to WORD strings.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
dump <- sg("convert", "TRAIN", "STRING", "CHAR", "STRING", "WORD",
           order, order - 1, gap, reverse)
dump <- sg("set_features", "TEST", fm_test_dna, "DNA")
dump <- sg("convert", "TEST", "STRING", "CHAR", "STRING", "WORD",
           order, order - 1, gap, reverse)

# Train the plugin estimator on the labelled training data.
pseudo_pos <- 1e-1
pseudo_neg <- 1e-1
dump <- sg("new_plugin_estimator", pseudo_pos, pseudo_neg)
dump <- sg("set_labels", "TRAIN", label_train_dna)
dump <- sg("train_estimator")

# The kernel is set only after the estimator has been trained.
dump <- sg("set_kernel", "HISTOGRAM", "WORD", size_cache)
km <- sg("get_kernel_matrix", "TRAIN")

# not supported yet
# lab=sg('plugin_estimate_classify')

km <- sg("get_kernel_matrix", "TEST")
# This is an example for the initialization of a linear kernel on real valued
# data. The 'TRAIN' kernel matrix is computed twice, once with kernel
# normalization 'SQRTDIAG' (km1) and once with 'AVGDIAG' (km2).
#
# NOTE: no scaling factor is set anywhere in this script, and the 'TEST'
# part is commented out.
library("sg")

size_cache <- 10

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
# Only referenced by the disabled 'TEST' steps at the end.
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# Linear
print("Linear")
dump <- sg("set_kernel", "LINEAR", "REAL", size_cache)
dump <- sg("set_features", "TRAIN", fm_train_real)

# Same kernel and features, two different normalizations.
dump <- sg("set_kernel_normalization", "SQRTDIAG")
km1 <- sg("get_kernel_matrix", "TRAIN")

dump <- sg("set_kernel_normalization", "AVGDIAG")
km2 <- sg("get_kernel_matrix", "TRAIN")

#dump <- sg('set_features', 'TEST', fm_test_real)
#km <- sg('get_kernel_matrix', 'TEST')
# This is an example for the initialization of a linear kernel on string
# data. The strings are all of the same length and consist of the characters
# 'ACGT' corresponding to the DNA-alphabet. Each column of the matrices of
# type char corresponds to one training/test example.
library("sg")

size_cache <- 10

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Linear String
print("LinearString")
dump <- sg("set_kernel", "LINEAR", "CHAR", size_cache)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_dna, "DNA")
km <- sg("get_kernel_matrix", "TEST")
# This is an example for the initialization of the local alignment kernel on
# DNA sequences, where each column of the matrices of type char corresponds
# to one training/test example.
library("sg")

size_cache <- 10

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Local Alignment String
print("LocalAlignmentString")
dump <- sg("set_kernel", "LOCALALIGNMENT", "CHAR", size_cache)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_dna, "DNA")
km <- sg("get_kernel_matrix", "TEST")
# This example initializes the locality improved string kernel. The locality
# improved string kernel is defined on sequences of the same length and
# inspects letters matching at corresponding positions in both sequences. The
# kernel sums over all matches in windows of length l ('length' below) and
# takes this sum to the power of 'inner_degree'. The sum over all these terms
# along the sequence is taken to the power of 'outer_degree'.
library("sg")

size_cache <- 10
length <- 5
inner_degree <- 5
outer_degree <- inner_degree + 2

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Locality Improved String
print("LocalityImprovedString")
dump <- sg("set_kernel", "LIK", "CHAR", size_cache, length, inner_degree,
           outer_degree)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_dna, "DNA")
km <- sg("get_kernel_matrix", "TEST")
# This is an example initializing the oligo string kernel which takes
# distances between matching oligos (k-mers) into account via a gaussian.
# Variable 'k' defines the length of the oligo and variable 'width' the width
# of the gaussian. The oligo string kernel is implemented for the
# DNA-alphabet 'ACGT'.
library("sg")

size_cache <- 10
k <- 3
width <- 1.2

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Oligo String
print("OligoString")
dump <- sg("set_kernel", "OLIGO", "CHAR", size_cache, k, width)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_dna, "DNA")
km <- sg("get_kernel_matrix", "TEST")
# This example initializes the polynomial kernel with real data.
# If variable 'inhomogene' is TRUE, +1 is added to the scalar product
# before taking it to the power of 'degree'. If 'use_normalization' is
# TRUE, the kernel matrix will be normalized by the square roots of the
# diagonal entries.
library("sg")

size_cache <- 10
degree <- 4
inhomogene <- FALSE
use_normalization <- TRUE

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# Poly
print("Poly")
dump <- sg("set_kernel", "POLY", "REAL", size_cache, degree, inhomogene,
           use_normalization)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_real)
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
km <- sg("get_kernel_matrix", "TEST")
# This is an example for the initialization of the PolyMatchString kernel on
# string data. The PolyMatchString kernel sums over the matches of two
# strings of the same length and takes the sum to the power of 'degree'. The
# strings consist of the characters 'ACGT' corresponding to the DNA-alphabet.
# Each column of the matrices of type char corresponds to one training/test
# example.
library("sg")

size_cache <- 10
degree <- 3
inhomogene <- FALSE

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Poly Match String
print("PolyMatchString")
dump <- sg("set_kernel", "POLYMATCH", "CHAR", size_cache, degree, inhomogene)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_dna, "DNA")
km <- sg("get_kernel_matrix", "TEST")
# The standard Sigmoid kernel computed on dense real valued features.
library("sg")

size_cache <- 10
gamma <- 1.2
coef0 <- 1.3

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# Sigmoid
print("Sigmoid")
dump <- sg("set_kernel", "SIGMOID", "REAL", size_cache, gamma, coef0)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_real)
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_real)
km <- sg("get_kernel_matrix", "TEST")
# The SimpleLocalityImprovedString kernel is a "simplified" and better
# performing version of the Locality improved kernel.
library("sg")

size_cache <- 10
length <- 5
inner_degree <- 5
outer_degree <- inner_degree + 2

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Simple Locality Improved String
print("SimpleLocalityImprovedString")
dump <- sg("set_kernel", "SLIK", "CHAR", size_cache, length, inner_degree,
           outer_degree)

# Kernel matrix for target 'TRAIN'.
dump <- sg("set_features", "TRAIN", fm_train_dna, "DNA")
km <- sg("get_kernel_matrix", "TRAIN")

# Kernel matrix for target 'TEST' (km is overwritten).
dump <- sg("set_features", "TEST", fm_test_dna, "DNA")
km <- sg("get_kernel_matrix", "TEST")
# The CommUlongString kernel may be used to compute the spectrum kernel from
# strings that have been mapped into unsigned 64bit integers. These 64bit
# integers correspond to k-mers. To be applicable in this kernel they need to
# be sorted (e.g. via the SortUlongString pre-processor).
#
# It basically uses the algorithm in the unix "comm" command (hence the name)
# to compute the kernel function. In the feature vector each entry denotes
# how often the k-mer appears in that sequence. Note that this representation
# enables spectrum kernels of order 8 for 8bit alphabets (like binaries) and
# order 32 for 2-bit alphabets like DNA. For this kernel the linadd speedups
# are implemented (though there is room for improvement here when a whole set
# of sequences is ADDed) using sorted lists.
#
# NOTE(review): the code below converts to 'WORD' features and uses the
# 'SORTWORDSTRING' preprocessor with the 'COMMSTRING' 'WORD' kernel, not the
# ULONG variant described above - confirm which one is intended.
#
# The sg() calls are deliberately not assigned, so their return values are
# shown when the script is run with autoprinting (e.g. R -f).
library(sg)

traindat <- c("AGTAA", "CGCCC", "GGCGG", "TGTCT")
trainlab <- c(1, -1, -1, 1)
testdat <- c("AGCAA", "CCCCC", "GGGGG", "TGCTT")

order <- 2
C <- 1.0

# Global settings.
sg("loglevel", "ALL")
sg("use_linadd", TRUE)
sg("mkl_parameters", 1e-5, 0)
sg("svm_epsilon", 1e-4)
sg("clean_features", "TRAIN")
sg("clean_kernel")

# Training data: bind, convert CHAR strings to WORD strings, sort, label.
sg("set_features", "TRAIN", traindat, "DNA")
sg("convert", "TRAIN", "STRING", "CHAR", "STRING", "WORD", order, order - 1)
sg("add_preproc", "SORTWORDSTRING")
sg("attach_preproc", "TRAIN")
sg("set_labels", "TRAIN", trainlab)

# Classifier and kernel.
sg("new_classifier", "SVMLIGHT")
sg("set_kernel", "COMMSTRING", "WORD", 10, TRUE, "FULL")
sg("c", C)
km <- sg("get_kernel_matrix", "TRAIN")
sg("train_classifier")
svmAsList <- sg("get_svm")

# Test data: same conversion and preprocessing, then classification.
sg("set_features", "TEST", testdat, "DNA")
sg("convert", "TEST", "STRING", "CHAR", "STRING", "WORD", order, order - 1)
sg("attach_preproc", "TEST")
sg("init_kernel_optimization")
valout <- sg("classify")
# The WeightedCommWordString kernel may be used to compute the weighted
# spectrum kernel (i.e. a spectrum kernel for 1 to K-mers, where each k-mer
# length is weighted by some coefficient \beta_k) from strings that have
# been mapped into unsigned 16bit integers.
#
# These 16bit integers correspond to k-mers. To be applicable in this kernel
# they need to be sorted (e.g. via the SortWordString pre-processor).
#
# It basically uses the algorithm in the unix "comm" command (hence the name)
# to compute:
#
#   k({\bf x},{\bf x'}) = \sum_{k=1}^K \beta_k \Phi_k({\bf x}) \cdot \Phi_k({\bf x'})
#
# where \Phi_k maps a sequence {\bf x} that consists of letters in \Sigma to a
# feature vector of size |\Sigma|^k. In this feature vector each entry denotes
# how often the k-mer appears in that {\bf x}.
#
# Note that this representation is especially tuned to small alphabets
# (like the 2-bit alphabet DNA), for which it enables spectrum kernels
# of order 8.
#
# For this kernel the linadd speedups are quite efficiently implemented using
# direct maps.
library("sg")

size_cache <- 10

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Parameters of the string -> WORD conversion and of the kernel.
order <- 3
gap <- 0
reverse <- "n"
use_sign <- FALSE
normalization <- "FULL"

# Weighted Comm Word String
print("WeightedCommWordString")

invisible(sg("add_preproc", "SORTWORDSTRING"))
invisible(sg(
  "set_kernel", "WEIGHTEDCOMMSTRING", "WORD",
  size_cache, use_sign, normalization
))

# Training data: load, convert to WORD k-mers, attach the preprocessor.
invisible(sg("set_features", "TRAIN", fm_train_dna, "DNA"))
invisible(sg(
  "convert", "TRAIN", "STRING", "CHAR", "STRING", "WORD",
  order, order - 1, gap, reverse
))
invisible(sg("attach_preproc", "TRAIN"))
km <- sg("get_kernel_matrix", "TRAIN")

# Test data: identical pipeline; `km` is overwritten with the test matrix.
invisible(sg("set_features", "TEST", fm_test_dna, "DNA"))
invisible(sg(
  "convert", "TEST", "STRING", "CHAR", "STRING", "WORD",
  order, order - 1, gap, reverse
))
invisible(sg("attach_preproc", "TEST"))
km <- sg("get_kernel_matrix", "TEST")
# The Weighted Degree Position String kernel (Weighted Degree kernel with
# shifts).
#
# The WD-shift kernel of order d compares two sequences X and Y of length L by
# summing all contributions of k-mer matches of lengths k in 1...d, weighted
# by coefficients beta_k, allowing for a positional tolerance of up to shift s.
library("sg")

size_cache <- 10

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Weighted Degree Position String
print("WeightedDegreePositionString")

degree <- 20
invisible(sg("set_kernel", "WEIGHTEDDEGREEPOS", "CHAR", size_cache, degree))

invisible(sg("set_features", "TRAIN", fm_train_dna, "DNA"))
km <- sg("get_kernel_matrix", "TRAIN")

# `km` is reused for the test kernel matrix.
invisible(sg("set_features", "TEST", fm_test_dna, "DNA"))
km <- sg("get_kernel_matrix", "TEST")
# The Weighted Degree String kernel.
#
# The WD kernel of order d compares two sequences X and Y of length L by
# summing all contributions of k-mer matches of lengths k in 1...d, weighted
# by coefficients beta_k. It is defined as
#
#   k(X, Y) = \sum_{k=1}^d \beta_k \sum_{l=1}^{L-k+1} I(u_{k,l}(X) = u_{k,l}(Y)).
#
# Here, u_{k,l}(X) is the string of length k starting at position l of the
# sequence X and I(.) is the indicator function which evaluates to 1 when its
# argument is true and to 0 otherwise.
library("sg")

size_cache <- 10

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Weighted Degree String
print("WeightedDegreeString")

degree <- 20
invisible(sg("set_kernel", "WEIGHTEDDEGREE", "CHAR", size_cache, degree))

invisible(sg("set_features", "TRAIN", fm_train_dna, "DNA"))
km <- sg("get_kernel_matrix", "TRAIN")

# `km` is reused for the test kernel matrix.
invisible(sg("set_features", "TEST", fm_test_dna, "DNA"))
km <- sg("get_kernel_matrix", "TEST")
# This script should enable you to rerun the experiment in the
# paper that we labeled with "christmas star".
#
# The task is to classify two star-shaped classes that share the
# midpoint. The difficulty of the learning problem depends on the
# distance between the classes, which is varied.
#
# Our model selection leads to a choice of C <- 0.5. The model
# selection is not repeated inside this script.
library(sg)

# Preliminary settings:
C <- 0.5          # SVM parameter
cache_size <- 50  # cache per kernel in MB
svm_eps <- 1e-3   # svm epsilon
mkl_eps <- 1e-3   # mkl epsilon

no_obs <- 20      # number of observations / data points
                  # (sum for train and test and both classes)
k_star <- 20      # number of "leaves" of the stars
alpha <- 0.3      # noise level of the data

# Column 1: increasing radius of the first class.
# Column 2: fixed radius of the second class.
# The distance between the classes is radius_star[, 1] - radius_star[, 2].
radius_first <- seq(4.1, 10, 0.2)
radius_star <- matrix(0, length(radius_first), 2)
radius_star[, 1] <- radius_first
radius_star[, 2] <- 4

# Different widths for the five RBF kernels used.
rbf_width <- c(0.01, 0.1, 1, 10, 1000)

n_problems <- nrow(radius_star)
n_kernels <- length(rbf_width)
n_points <- 4 * no_obs
first_class <- 1:(2 * no_obs)
second_class <- (2 * no_obs + 1):n_points

####
#### Great loop: train MKL for every data set (the different distances
#### between the stars)
####
sg("loglevel", "ERROR")
sg("echo", "OFF")

# One row per radius. The error vectors are sized by the number of radii
# (they are indexed by kk below), not by the number of kernels.
w <- matrix(0, n_problems, n_kernels)
result.trainout <- matrix(0, n_problems, 2 * no_obs)
result.testout <- matrix(0, n_problems, 2 * no_obs)
result.trainerr <- matrix(0, n_problems, 1)
result.testerr <- matrix(0, n_problems, 1)

for (kk in seq_len(n_problems)) {
  # data generation
  print(sprintf("MKL for radius %+02.2f ", radius_star[kk, 1]))

  dummy <- matrix(0, 2, n_points)
  dummy[1, ] <- runif(n_points)
  noise <- alpha * rnorm(n_points)
  dummy[2, ] <- sin(k_star * pi * dummy[1, ]) + noise  # sine
  # distance shift: first class
  dummy[2, first_class] <- dummy[2, first_class] + radius_star[kk, 1]
  # distance shift: second class
  dummy[2, second_class] <- dummy[2, second_class] + radius_star[kk, 2]
  dummy[1, ] <- 2 * pi * dummy[1, ]

  # Polar -> cartesian coordinates.
  x <- matrix(0, nrow(dummy), ncol(dummy))
  x[1, ] <- dummy[2, ] * sin(dummy[1, ])
  x[2, ] <- dummy[2, ] * cos(dummy[1, ])

  train_y <- c(rep(-1, no_obs), rep(1, no_obs))
  test_y <- c(rep(-1, no_obs), rep(1, no_obs))

  # Odd columns are used for training, even columns for testing.
  train_x <- x[, seq(1, ncol(x), 2)]
  test_x <- x[, seq(2, ncol(x), 2)]
  rm("dummy", "x")

  # train MKL
  sg("clean_kernel")
  sg("clean_features", "TRAIN")
  # one copy of the training set per sub-kernel
  for (i in seq_len(n_kernels)) {
    sg("add_features", "TRAIN", train_x)
  }
  sg("set_labels", "TRAIN", train_y)
  sg("new_classifier", "MKL_CLASSIFICATION")
  sg("mkl_parameters", mkl_eps, 0)
  sg("svm_epsilon", svm_eps)
  sg("set_kernel", "COMBINED", 0)
  for (i in seq_len(n_kernels)) {
    sg("add_kernel", 1, "GAUSSIAN", "REAL", cache_size, rbf_width[i])
  }
  sg("c", C)
  sg("train_classifier")
  alphas <- sg("get_svm")[2]
  w[kk, ] <- sg("get_subkernel_weights")

  # calculate train error
  sg("clean_features", "TEST")
  for (i in seq_len(n_kernels)) {
    sg("add_features", "TEST", train_x)
  }
  sg("set_labels", "TEST", train_y)
  sg("set_threshold", 0)
  result.trainout[kk, ] <- sg("classify")
  result.trainerr[kk] <- mean(train_y != sign(result.trainout[kk, ]))

  # calculate test error
  sg("clean_features", "TEST")
  for (i in seq_len(n_kernels)) {
    sg("add_features", "TEST", test_x)
  }
  sg("set_labels", "TEST", test_y)
  sg("set_threshold", 0)
  result.testout[kk, ] <- sg("classify")
  result.testerr[kk] <- mean(test_y != sign(result.testout[kk, ]))
}

cat('done. now w contains the kernel weightings and result test/train outputs and errors')
# Multiclass MKL example: a combined kernel made of a linear, a Gaussian and
# a polynomial sub-kernel is trained on the multiclass toy labels and then
# applied to the test features.
library("sg")

size_cache <- 10
C <- 1.2
epsilon <- 1e-5
mkl_eps <- 0.01
mkl_norm <- 1.5
width <- 1.2

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))
label_train_multiclass <- as.double(
  as.matrix(read.table("../data/label_train_multiclass.dat"))
)

# MKL_MULTICLASS
print("MKL_MULTICLASS")

invisible(sg("clean_features", "TRAIN"))
invisible(sg("clean_features", "TEST"))
invisible(sg("set_kernel", "COMBINED", size_cache))

# Each sub-kernel is followed by its own copy of the train/test features.
invisible(sg("add_kernel", 1, "LINEAR", "REAL", size_cache))
invisible(sg("add_features", "TRAIN", fm_train_real))
invisible(sg("add_features", "TEST", fm_test_real))

invisible(sg("add_kernel", 1, "GAUSSIAN", "REAL", size_cache, width))
invisible(sg("add_features", "TRAIN", fm_train_real))
invisible(sg("add_features", "TEST", fm_test_real))

invisible(sg("add_kernel", 1, "POLY", "REAL", size_cache, 2))
invisible(sg("add_features", "TRAIN", fm_train_real))
invisible(sg("add_features", "TEST", fm_test_real))

invisible(sg("set_labels", "TRAIN", label_train_multiclass))
invisible(sg("new_classifier", "MKL_MULTICLASS"))
invisible(sg("svm_epsilon", epsilon))
invisible(sg("c", C))
invisible(sg("mkl_parameters", mkl_eps, 0, mkl_norm))
invisible(sg("train_classifier"))

result <- sg("classify")
# This script should enable you to rerun the experiment in the
# paper that we labeled "mixture linear and sine".
#
# The task is to learn a regression function where the true function
# is given by a mixture of 2 sine waves in addition to a linear trend.
# We vary the frequency of the second higher frequency sine wave.
#
# Setup: MKL on 10 RBF kernels of different widths.
# NOTE(review): the original description says "on 1000 examples", but this
# script sets no_obs to 10 -- confirm the intended sample size.

# load shogun
library(sg)

# kernel width for 10 basic SVMs (kept as a 1 x 10 array)
rbf_width <- array(
  c(0.001, 0.005, 0.01, 0.05, 0.1, 1, 10, 50, 100, 1000),
  dim = c(1, 10)
)

# SVM parameter
C <- 1
cache_size <- 50
mkl_eps <- 1e-4
svm_eps <- 1e-4
svm_tube <- 0.01
debug <- 0

# data
f <- 0:20  # parameter that varies the frequency of the second sine wave

# sg('loglevel', 'ALL')
# sg('echo', 'ON')

no_obs <- 10  # number of observations
stepsize <- (4 * pi) / (no_obs - 1)

# Equally spaced inputs on [0, 4*pi].
train_x <- (0:(no_obs - 1)) * stepsize
trend <- 2 * train_x * ((pi) / (max(train_x) - min(train_x)))
train_x <- matrix(train_x, 1, length(train_x))

n_kernels <- length(rbf_width)

# One row of sub-kernel weights per frequency.
weights <- matrix(0, length(f), n_kernels)

for (kk in seq_along(f)) {  # Big loop
  # data generation
  wave1 <- sin(train_x)
  wave2 <- sin(f[kk] * train_x)
  train_y <- trend + wave1 + wave2

  # MK Learning
  sg("new_classifier", "MKL_REGRESSION")
  sg("mkl_parameters", mkl_eps, 0)
  sg("c", C)
  sg("svm_epsilon", svm_eps)
  sg("svr_tube_epsilon", svm_tube)
  sg("clean_features", "TRAIN")
  sg("clean_kernel")

  sg("set_labels", "TRAIN", train_y)  # set labels
  # add features for every basic SVM
  for (i in seq_len(n_kernels)) {
    sg("add_features", "TRAIN", train_x)
  }

  sg("set_kernel", "COMBINED", 0)
  for (i in seq_len(n_kernels)) {
    sg("add_kernel", 1, "GAUSSIAN", "REAL", cache_size, rbf_width[i])
  }

  sg("train_classifier")
  weights[kk, ] <- sg("get_subkernel_weights")
  cat("frequency:", f[kk], " rbf-kernel-weights: ", weights[kk, ], "\n")
}
# This script should enable you to rerun the experiment in the
# paper that we labeled "sine".
#
# In this regression task a sine wave is to be learned.
# We vary the frequency of the wave.

# Preliminary settings:
library(sg)

# Parameter for the SVMs.
C <- 10  # obtained via model selection (not included in the script)
cache_size <- 10
mkl_norm <- 2
mkl_eps <- 1e-3  # threshold for precision
svm_eps <- 1e-3
svr_tube_eps <- 1e-2
debug <- 0

# Kernel width for the 5 "basic" SVMs
rbf_width <- c(0.005, 0.05, 0.5, 1, 10)

# data
# Values for the different frequencies: 0.1, 0.3, ..., 4.9.
# (The MATLAB-style `0.1:0.2:5` does not mean this in R; it evaluates to
# 0.1 1.1 2.1 3.1 4.1, so seq() is used instead.)
f <- seq(0.1, 5, by = 0.2)
no_obs <- 10  # number of observations

if (debug) {
  sg("loglevel", "ALL")
  sg("echo", "ON")
} else {
  sg("loglevel", "ERROR")
  sg("echo", "OFF")
}

n_kernels <- length(rbf_width)

# One row of sub-kernel weights per frequency.
weights <- matrix(0, length(f), n_kernels)

for (kk in seq_along(f)) {  # big loop for the different learning problems
  # data generation
  train_x <- seq(1, 10 * 2 * pi, (((10 * 2 * pi) - 1) / (no_obs - 1)))
  train_y <- sin(f[kk] * train_x)
  train_x <- matrix(train_x, 1, length(train_x))

  # initialize MKL-SVR
  sg("new_classifier", "MKL_REGRESSION")
  sg("mkl_parameters", mkl_eps, 0, mkl_norm)
  sg("c", C)
  sg("svm_epsilon", svm_eps)
  sg("svr_tube_epsilon", svr_tube_eps)
  sg("clean_features", "TRAIN")
  sg("clean_kernel")

  sg("set_labels", "TRAIN", train_y)  # set labels
  # add features for every SVR
  for (i in seq_len(n_kernels)) {
    sg("add_features", "TRAIN", train_x)
  }

  sg("set_kernel", "COMBINED", 0)
  for (i in seq_len(n_kernels)) {
    sg("add_kernel", 1, "GAUSSIAN", "REAL", cache_size, rbf_width[i])
  }

  sg("svm_train")
  weights[kk, ] <- sg("get_subkernel_weights")

  # The format string is written for exactly five sub-kernel weights.
  dummy <- print(sprintf(
    "frequency: %02.2f rbf-kernel-weights: %02.2f %02.2f %02.2f %02.2f %02.2f",
    f[kk], weights[kk, 1], weights[kk, 2], weights[kk, 3],
    weights[kk, 4], weights[kk, 5]
  ))
}
# MKL classification with the weighted degree kernel on random DNA.
#
# 1000 random DNA sequences of length 100 are generated together with random
# +1/-1 labels. For positively labelled sequences, positions 30, 60 and 61 are
# set to "A". 200 randomly chosen sequences are held out for testing, the
# remaining 800 are used for training.
library(sg)

acgt <- c("A", "C", "G", "T")
n_sequences <- 1000
sequence_length <- 100
n_test <- 200

LT <- sign(rnorm(n_sequences))

# Draw all letters in one vectorised call instead of one random draw per
# matrix cell.
XT <- array(
  sample(acgt, sequence_length * n_sequences, replace = TRUE),
  dim = c(sequence_length, n_sequences)
)

# Plant the signal in the positively labelled sequences (columns).
XT[c(30, 60, 61), LT == 1] <- "A"

# Random train/test split.
idx <- sample(seq_len(n_sequences))
test_idx <- idx[seq_len(n_test)]
train_idx <- idx[(n_test + 1):n_sequences]

XTE <- XT[, test_idx]
LTE <- LT[test_idx]
XT <- XT[, train_idx]
LT <- LT[train_idx]

center_idx <- 50
degree <- 3
mismatch <- 0
C <- 1

# sg('loglevel', 'ALL')
sg("use_linadd", TRUE)
sg("mkl_parameters", 1e-5, 1)
sg("svm_epsilon", 1e-6)
sg("clean_features", "TRAIN")
sg("clean_kernel")

sg("new_classifier", "MKL_CLASSIFICATION")
sg("set_labels", "TRAIN", LT)
sg("set_features", "TRAIN", XT, "DNA")
sg("set_kernel", "WEIGHTEDDEGREE", "CHAR", 10, degree, mismatch, FALSE, 1)
sg("c", C)
sg("svm_train")

svmAsList <- sg("get_svm")
beta <- sg("get_subkernel_weights")

# Classify the held-out sequences.
sg("init_kernel_optimization")
sg("clean_features", "TEST")
sg("set_features", "TEST", XTE, "DNA")
output_xte <- sg("classify")
w <- sg("get_subkernel_weights")

# Fraction of test sequences whose predicted sign differs from the label.
err <- mean(sign(output_xte) != LTE)
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects).
#
# The preprocessor LogPlusOne adds one to a dense real-valued vector and takes
# the logarithm of each component of it. It is most useful in situations where
# the inputs are counts: when one compares differences of small counts any
# difference may matter a lot, while small differences in large counts don't.
# This is what this log transformation controls for.
library("sg")

size_cache <- 10

fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

width <- 1.4

# LogPlusOne
print("LogPlusOne")

invisible(sg("add_preproc", "LOGPLUSONE"))
invisible(sg("set_kernel", "CHI2", "REAL", size_cache, width))

invisible(sg("set_features", "TRAIN", fm_train_real))
invisible(sg("attach_preproc", "TRAIN"))
km <- sg("get_kernel_matrix", "TRAIN")

# `km` is reused for the test kernel matrix.
invisible(sg("set_features", "TEST", fm_test_real))
invisible(sg("attach_preproc", "TEST"))
km <- sg("get_kernel_matrix", "TEST")
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects).
#
# The preprocessor NormOne normalizes vectors to have norm 1.
library("sg")

size_cache <- 10
width <- 2.1

# Only the real-valued data is needed here; the DNA files that used to be
# loaded as well were never used by this example.
fm_train_real <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test_real <- t(as.matrix(read.table("../data/fm_test_real.dat")))

# NormOne
print("NormOne")

dump <- sg("add_preproc", "NORMONE")
dump <- sg("set_kernel", "CHI2", "REAL", size_cache, width)

dump <- sg("set_features", "TRAIN", fm_train_real)
dump <- sg("attach_preproc", "TRAIN")
km <- sg("get_kernel_matrix", "TRAIN")

# `km` is reused for the test kernel matrix.
dump <- sg("set_features", "TEST", fm_test_real)
dump <- sg("attach_preproc", "TEST")
km <- sg("get_kernel_matrix", "TEST")
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects).
#
# The preprocessor PruneVarSubMean subtracts the mean from each feature and
# removes features that have zero variance.
library("sg")

size_cache <- 10
width <- 2.1

# NOTE(review): unlike the neighbouring real-valued examples, the matrices are
# not transposed with t() here -- confirm that this is intended.
fm_train_real <- as.matrix(read.table("../data/fm_train_real.dat"))
fm_test_real <- as.matrix(read.table("../data/fm_test_real.dat"))

# PruneVarSubMean
print("PruneVarSubMean")

divide_by_std <- TRUE
invisible(sg("add_preproc", "PRUNEVARSUBMEAN", divide_by_std))
invisible(sg("set_kernel", "CHI2", "REAL", size_cache, width))

invisible(sg("set_features", "TRAIN", fm_train_real))
invisible(sg("attach_preproc", "TRAIN"))
km <- sg("get_kernel_matrix", "TRAIN")

# `km` is reused for the test kernel matrix.
invisible(sg("set_features", "TEST", fm_test_real))
invisible(sg("attach_preproc", "TEST"))
km <- sg("get_kernel_matrix", "TEST")
# In this example a kernel matrix is computed for a given string data set. The
# CommUlongString kernel is used to compute the spectrum kernel from strings
# that have been mapped into unsigned 64bit integers. These 64bit integers
# correspond to k-mers. To be applicable in this kernel the mapped k-mers have
# to be sorted. This is done using the SortUlongString preprocessor, which
# sorts the individual strings in ascending order. The kernel function
# basically uses the algorithm in the unix "comm" command (hence the name).
# Note that this representation enables spectrum kernels of order 8 for 8bit
# alphabets (like binaries) and order 32 for 2-bit alphabets like DNA. For this
# kernel the linadd speedups are implemented (though there is room for
# improvement here when a whole set of sequences is ADDed) using sorted lists.
library("sg")

size_cache <- 10

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Parameters of the string -> ULONG conversion and of the kernel.
order <- 3
gap <- 0
reverse <- "n"
use_sign <- FALSE
normalization <- "FULL"

# Comm Ulong String
print("CommUlongString")

invisible(sg("add_preproc", "SORTULONGSTRING"))

# Training data: load, convert to ULONG k-mers, attach the preprocessor.
invisible(sg("set_features", "TRAIN", fm_train_dna, "DNA"))
invisible(sg(
  "convert", "TRAIN", "STRING", "CHAR", "STRING", "ULONG",
  order, order - 1, gap, reverse
))
invisible(sg("attach_preproc", "TRAIN"))

# Test data: identical pipeline.
invisible(sg("set_features", "TEST", fm_test_dna, "DNA"))
invisible(sg(
  "convert", "TEST", "STRING", "CHAR", "STRING", "ULONG",
  order, order - 1, gap, reverse
))
invisible(sg("attach_preproc", "TEST"))

invisible(sg(
  "set_kernel", "COMMSTRING", "ULONG", size_cache, use_sign, normalization
))

# `km` is reused, so it ends up holding the test kernel matrix.
km <- sg("get_kernel_matrix", "TRAIN")
km <- sg("get_kernel_matrix", "TEST")
# In this example a kernel matrix is computed for a given string data set. The
# CommWordString kernel is used to compute the spectrum kernel from strings
# that have been mapped into unsigned 16bit integers. These 16bit integers
# correspond to k-mers. To be applicable in this kernel the mapped k-mers have
# to be sorted. This is done using the SortWordString preprocessor, which sorts
# the individual strings in ascending order. The kernel function basically uses
# the algorithm in the unix "comm" command (hence the name). Note that this
# representation is especially tuned to small alphabets (like the 2-bit
# alphabet DNA), for which it enables spectrum kernels of order up to 8. For
# this kernel the linadd speedups are quite efficiently implemented using
# direct maps.
library("sg")

size_cache <- 10

fm_train_dna <- as.matrix(read.table("../data/fm_train_dna.dat"))
fm_test_dna <- as.matrix(read.table("../data/fm_test_dna.dat"))

# Parameters of the string -> WORD conversion and of the kernel.
order <- 3
gap <- 0
reverse <- "n"
use_sign <- FALSE
normalization <- "FULL"

# Comm Word String
print("CommWordString")

invisible(sg("add_preproc", "SORTWORDSTRING"))

# Training data: load, convert to WORD k-mers, attach the preprocessor.
invisible(sg("set_features", "TRAIN", fm_train_dna, "DNA"))
invisible(sg(
  "convert", "TRAIN", "STRING", "CHAR", "STRING", "WORD",
  order, order - 1, gap, reverse
))
invisible(sg("attach_preproc", "TRAIN"))

# Test data: identical pipeline.
invisible(sg("set_features", "TEST", fm_test_dna, "DNA"))
invisible(sg(
  "convert", "TEST", "STRING", "CHAR", "STRING", "WORD",
  order, order - 1, gap, reverse
))
invisible(sg("attach_preproc", "TEST"))

invisible(sg(
  "set_kernel", "COMMSTRING", "WORD", size_cache, use_sign, normalization
))

# `km` is reused, so it ends up holding the test kernel matrix.
km <- sg("get_kernel_matrix", "TRAIN")
km <- sg("get_kernel_matrix", "TEST")
# In this example a kernelized version of ridge regression (KRR) is trained on
# a real-valued data set. The KRR is trained with regularization parameter
# tau=1e-6 and a gaussian kernel with width=2.1.
library("sg")

size_cache <- 10
C <- 10
tube_epsilon <- 1e-2
width <- 2.1

fm_train <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test <- t(as.matrix(read.table("../data/fm_test_real.dat")))
label_train <- as.double(
  as.matrix(read.table("../data/label_train_regression.dat"))
)

# KRR
print("KRR")

tau <- 1e-6

invisible(sg("set_features", "TRAIN", fm_train))
invisible(sg("set_kernel", "GAUSSIAN", "REAL", size_cache, width))
invisible(sg("set_labels", "TRAIN", label_train))

invisible(sg("new_regression", "KERNELRIDGEREGRESSION"))
invisible(sg("krr_tau", tau))
invisible(sg("c", C))
invisible(sg("train_regression"))

# Apply the trained regressor to the test features.
invisible(sg("set_features", "TEST", fm_test))
result <- sg("classify")
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# LIBSVM. The SVR is trained with regularization parameter C=10, tube epsilon
# 1e-2 and a gaussian kernel with width=2.1.
#
# For more details on LIBSVM solver see
# http://www.csie.ntu.edu.tw/~cjlin/libsvm/ .
library("sg")

size_cache <- 10
C <- 10
tube_epsilon <- 1e-2
width <- 2.1

fm_train <- t(as.matrix(read.table("../data/fm_train_real.dat")))
fm_test <- t(as.matrix(read.table("../data/fm_test_real.dat")))
label_train <- as.double(
  as.matrix(read.table("../data/label_train_regression.dat"))
)

# LibSVR
print("LibSVR")

invisible(sg("set_features", "TRAIN", fm_train))
invisible(sg("set_kernel", "GAUSSIAN", "REAL", size_cache, width))
invisible(sg("set_labels", "TRAIN", label_train))

invisible(sg("new_regression", "LIBSVR"))
invisible(sg("svr_tube_epsilon", tube_epsilon))
invisible(sg("c", C))
invisible(sg("train_regression"))

# Apply the trained regressor to the test features.
invisible(sg("set_features", "TEST", fm_test))
result <- sg("classify")
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# SVM^light. The SVR is trained with regularization parameter C=10, tube
# epsilon 1e-2 and a gaussian kernel with width=2.1.
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA
# USA, 1999.
library("sg")

size_cache <- 10
C <- 10
tube_epsilon <- 1e-2
width <- 2.1

# NOTE(review): unlike the other regression examples, the data is not
# transposed with t() and the two-class labels are used as regression
# targets -- confirm that both are intended.
fm_train <- as.matrix(read.table("../data/fm_train_real.dat"))
fm_test <- as.matrix(read.table("../data/fm_test_real.dat"))
label_train <- as.double(
  as.matrix(read.table("../data/label_train_twoclass.dat"))
)

# SVR Light
# Trains an SVRLIGHT regressor on the data loaded above and applies it to the
# test features. Reads its settings from the script-level variables; the
# outputs of the final "classify" call are the function's (invisible) value.
dosvrlight <- function() {
  print("SVRLight")

  ignored <- sg("set_features", "TRAIN", fm_train)
  ignored <- sg("set_kernel", "GAUSSIAN", "REAL", size_cache, width)
  ignored <- sg("set_labels", "TRAIN", label_train)

  ignored <- sg("new_regression", "SVRLIGHT")
  ignored <- sg("svr_tube_epsilon", tube_epsilon)
  ignored <- sg("c", C)
  ignored <- sg("train_regression")

  ignored <- sg("set_features", "TEST", fm_test)
  result <- sg("classify")
}

# Wrapped in try() so that an error raised inside dosvrlight() does not abort
# the script.
try(dosvrlight())