This page lists ready-to-run shogun examples for the Static R interface.
To run an example, issue
R -f name_of_example.R
or start R and then type
source('name_of_example.R')
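To run every example in this directory in one go, a minimal sketch (assuming the example files and the ../data directory are reachable from the current working directory) is:
for (f in list.files(pattern='\\.R$')) source(f)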
# In this example a multi-class support vector machine is trained on a toy data
# set and the trained classifier is used to predict labels of test examples.
# The training algorithm is based on the BSVM formulation (L2-soft margin
# with the bias added to the objective function), which is solved by the Improved
# Mitchell-Demyanov-Malozemov algorithm. The training uses a Gaussian kernel
# of width 2.1 and the regularization constant C=10, and a bias term is used
# in the classification rule. The solver stops once the relative duality gap
# falls below epsilon=1e-5, and it uses a 10MB kernel cache.
#
# For more details on the used SVM solver see
# V. Franc: Optimization Algorithms for Kernel Methods. Research report.
# CTU-CMP-2005-22. CTU FEL Prague. 2005.
# ftp://cmp.felk.cvut.cz/pub/cmp/articles/franc/Franc-PhD.pdf .
#
library("sg")
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE
width <- 2.1
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
label_train_multiclass <- as.numeric(as.matrix(read.table('../data/label_train_multiclass.dat')))
# GMNPSVM
print('GMNPSVM')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
dump <- sg('set_labels', 'TRAIN', label_train_multiclass)
dump <- sg('new_classifier', 'GMNPSVM')
dump <- sg('svm_epsilon', epsilon)
dump <- sg('c', C)
dump <- sg('svm_use_bias', use_bias)
dump <- sg('train_classifier')
dump <- sg('set_features', 'TEST', fm_test_real)
result <- sg('classify')
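# A hedged sanity check, not part of the original example: classify the
# training data itself and compare against the training labels. For
# multi-class machines the outputs of 'classify' are assumed to be the
# predicted class indices.
dump <- sg('set_features', 'TEST', fm_train_real)
pred <- sg('classify')
print(mean(pred == label_train_multiclass))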
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Gradient Projection Decomposition Technique
# (GPDT) is used with SVM regularization parameter C=10, a Gaussian kernel of
# width 2.1 and a 10MB kernel cache.
#
# For more details on the GPDT solver see http://dm.unife.it/gpdt
#
library("sg")
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE
width <- 2.1
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
label_train_twoclass <- as.numeric(as.matrix(read.table('../data/label_train_twoclass.dat')))
# GPBTSVM
print('GPBTSVM')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
dump <- sg('set_labels', 'TRAIN', label_train_twoclass)
dump <- sg('new_classifier', 'GPBTSVM')
dump <- sg('svm_epsilon', epsilon)
dump <- sg('c', C)
dump <- sg('svm_use_bias', use_bias)
dump <- sg('train_classifier')
dump <- sg('set_features', 'TEST', fm_test_real)
result <- sg('classify')
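# A hedged post-processing sketch: for two-class SVMs 'classify' returns
# real-valued outputs, so predicted labels are obtained by taking the sign.
pred_labels <- sign(result)
print(table(pred_labels))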
# This example shows usage of the k-nearest neighbor (KNN) classification rule on
# a toy data set. The number of nearest neighbors is set to k=3 and distances
# are measured by the Euclidean metric. Finally, the KNN rule is applied to
# predict labels of the test examples.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
label_train_multiclass <- as.numeric(as.matrix(read.table('../data/label_train_multiclass.dat')))
# KNN
print('KNN')
k <- 3
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('set_labels', 'TRAIN', label_train_multiclass)
dump <- sg('set_distance', 'EUCLIDIAN', 'REAL')
dump <- sg('new_classifier', 'KNN')
dump <- sg('train_classifier', k)
dump <- sg('set_features', 'TEST', fm_test_real)
result <- sg('classify')
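# A hedged sketch: tabulate how often each class is predicted; for KNN the
# outputs of 'classify' are assumed to be the predicted class labels.
print(table(result))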
# In this example a linear two-class classifier is trained based on Linear
# Discriminant Analysis (LDA) from toy 2-dimensional examples. The trained
# LDA classifier is used to predict test examples. Note that the LDA classifier
# is optimal under the assumption that both classes are Gaussian distributed
# with equal covariance. For more details on LDA see e.g.
# http://en.wikipedia.org/wiki/Linear_discriminant_analysis
#
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
label_train_twoclass <- as.numeric(as.matrix(read.table('../data/label_train_twoclass.dat')))
# LDA
print('LDA')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('set_labels', 'TRAIN', label_train_twoclass)
dump <- sg('new_classifier', 'LDA')
dump <- sg('train_classifier')
dump <- sg('set_features', 'TEST', fm_test_real)
result <- sg('classify')
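# A hedged sketch: for linear machines the trained parameters can be
# retrieved; 'get_classifier' is assumed to return a list holding the bias
# and the weight vector.
params <- sg('get_classifier')
bias <- params[[1]]
w <- params[[2]]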
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm LIBSVM is used with SVM regularization
# parameter C=10, a Gaussian kernel of width 2.1, a 10MB kernel cache and
# the precision parameter epsilon=1e-5.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
library("sg")
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
label_train_twoclass <- as.numeric(as.matrix(read.table('../data/label_train_twoclass.dat')))
# LibSVM
print('LibSVM')
width <- 2.1
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
dump <- sg('set_labels', 'TRAIN', label_train_twoclass)
dump <- sg('new_classifier', 'LIBSVM')
dump <- sg('svm_epsilon', epsilon)
dump <- sg('c', C)
dump <- sg('svm_use_bias', use_bias)
dump <- sg('train_classifier')
dump <- sg('set_features', 'TEST', fm_test_real)
result <- sg('classify')
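# A hedged sketch: inspect the trained SVM; 'get_svm' is assumed to return a
# list with the bias and a matrix pairing each support-vector coefficient
# with its training-example index.
svm_model <- sg('get_svm')
bias <- svm_model[[1]]
alphas <- svm_model[[2]]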
# In this example a multi-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm LIBSVM is used with SVM regularization
# parameter C=10, a bias term in the classification rule, a Gaussian kernel of
# width 2.1, a 10MB kernel cache and the precision parameter epsilon=1e-5.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/
library("sg")
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE
width <- 2.1
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
label_train_multiclass <- as.numeric(as.matrix(read.table('../data/label_train_multiclass.dat')))
# LibSVM MultiClass
print('LibSVMMultiClass')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
dump <- sg('set_labels', 'TRAIN', label_train_multiclass)
dump <- sg('new_classifier', 'LIBSVM_MULTICLASS')
dump <- sg('svm_epsilon', epsilon)
dump <- sg('c', C)
dump <- sg('svm_use_bias', use_bias)
dump <- sg('train_classifier')
dump <- sg('set_features', 'TEST', fm_test_real)
result <- sg('classify')
# In this example a one-class support vector machine classifier is trained on a
# toy data set. The training algorithm finds a hyperplane in the RKHS which
# separates the training data from the origin. The one-class classifier is
# typically used to estimate the support of a high-dimensional distribution.
# For more details see e.g.
# B. Schoelkopf et al. Estimating the support of a high-dimensional
# distribution. Neural Computation, 13, 2001, 1443-1471.
#
# In the example, the one-class SVM is trained by the LIBSVM solver with
# nu=0.1, a Gaussian kernel of width 2.1, the precision parameter
# epsilon=1e-5 and a 10MB kernel cache.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/ .
#
library("sg")
size_cache <- 10
svm_nu <- 0.1
epsilon <- 1e-5
use_bias <- TRUE
width <- 2.1
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# LibSVMOneClass
print('LibSVMOneClass')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
dump <- sg('new_classifier', 'LIBSVM_ONECLASS')
dump <- sg('svm_epsilon', epsilon)
dump <- sg('svm_nu', svm_nu)
dump <- sg('svm_use_bias', use_bias)
dump <- sg('train_classifier')
dump <- sg('set_features', 'TEST', fm_test_real)
result <- sg('classify')
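# A hedged post-processing sketch: for one-class SVMs, outputs above zero
# are commonly treated as inliers and outputs below zero as outliers.
inlier <- result > 0
print(sum(inlier))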
# In this example a two-class support vector machine classifier is trained on a
# toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Minimal Primal Dual SVM is used with SVM
# regularization parameter C=10, a Gaussian kernel of width 2.1, a 10MB
# kernel cache and the precision parameter epsilon=1e-5.
#
# For more details on the MPD solver see
# Kienzle, W. and B. Schölkopf: Training Support Vector Machines with Multiple
# Equality Constraints. Machine Learning: ECML 2005, 182-193. (Eds.) Carbonell,
# J. G., J. Siekmann, Springer, Berlin, Germany (November 2005)
library("sg")
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE
width <- 2.1
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
label_train_twoclass <- as.numeric(as.matrix(read.table('../data/label_train_twoclass.dat')))
# MPDSVM
print('MPDSVM')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
dump <- sg('set_labels', 'TRAIN', label_train_twoclass)
dump <- sg('new_classifier', 'MPDSVM')
dump <- sg('svm_epsilon', epsilon)
dump <- sg('c', C)
dump <- sg('svm_use_bias', use_bias)
dump <- sg('train_classifier')
dump <- sg('set_features', 'TEST', fm_test_real)
result <- sg('classify')
# This example shows how to use the Perceptron algorithm for training a
# two-class linear classifier, i.e. y = sign(<x,w> + b). The Perceptron
# algorithm works by iteratively passing through the training examples and
# applying the update rule to those examples which are misclassified by the
# current classifier. The Perceptron update rule reads
#
#   w(t+1) = w(t) + alpha * y_t * x_t
#   b(t+1) = b(t) + alpha * y_t
#
# where (x_t, y_t) are the feature vector and label (must be +1/-1) of a misclassified example,
#       (w(t), b(t)) are the current parameters of the linear classifier,
#       (w(t+1), b(t+1)) are the updated parameters and
#       alpha is the learning rate.
#
# The Perceptron algorithm iterates until all training examples are correctly
# classified or the prescribed maximal number of iterations is reached.
#
# The learning rate and the maximal number of iterations can be set by
# sg('set_perceptron_parameters', alpha, max_iter)
#
library("sg")
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE
fm_train_real <- as.matrix(read.table('../data/fm_train_real.dat'))
fm_test_real <- as.matrix(read.table('../data/fm_test_real.dat'))
label_train_twoclass <- as.numeric(as.matrix(read.table('../data/label_train_twoclass.dat')))
# Perceptron
print('Perceptron')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('set_labels', 'TRAIN', label_train_twoclass)
dump <- sg('new_classifier', 'PERCEPTRON')
# The Perceptron often does not converge on this data, hence training and
# prediction are commented out.
#dump <- sg('train_classifier')
#dump <- sg('set_features', 'TEST', fm_test_real)
#result <- sg('classify')
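# A hedged sketch, with illustrative values: a smaller learning rate and a
# capped iteration count, with training wrapped in try() in case the data
# is not linearly separable.
dump <- sg('set_perceptron_parameters', 0.05, 1000)
try(dump <- sg('train_classifier'))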
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=10, the Weighted Degree kernel of degree 20 and
# the precision parameter epsilon=1e-5.
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel see
# G. Raetsch, S. Sonnenburg, and B. Schoelkopf. RASE: recognition of alternatively
# spliced exons in C. elegans. Bioinformatics, 21:369-377, June 2005.
library("sg")
size_cache <- 10
C <- 10
epsilon <- 1e-5
use_bias <- TRUE
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
label_train_dna <- as.numeric(as.matrix(read.table('../data/label_train_dna.dat')))
degree <- 20
# SVM Light
# SVMLight is only available if shogun was compiled with SVMLight support,
# hence the call is wrapped in try().
dosvmlight <- function()
{
  print('SVMLight')
  dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
  dump <- sg('set_kernel', 'WEIGHTEDDEGREE', 'CHAR', size_cache, degree)
  dump <- sg('set_labels', 'TRAIN', label_train_dna)
  dump <- sg('new_classifier', 'SVMLIGHT')
  dump <- sg('svm_epsilon', epsilon)
  dump <- sg('c', C)
  dump <- sg('svm_use_bias', use_bias)
  dump <- sg('train_classifier')
  dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
  result <- sg('classify')
}
try(dosvmlight())
# In this example an agglomerative hierarchical single linkage clustering method
# is used to cluster a given toy data set. Starting with each object assigned to
# its own cluster, clusters are iteratively merged; in each step the two clusters
# whose closest elements have the minimum distance (here measured via the
# Euclidean distance object) are merged.
library("sg")
fm_train <- t(as.matrix(read.table('../data/fm_train_real.dat')))
# Hierarchical
print('Hierarchical')
merges <- 3
dump <- sg('set_features', 'TRAIN', fm_train)
dump <- sg('set_distance', 'EUCLIDIAN', 'REAL')
dump <- sg('new_clustering', 'HIERARCHICAL')
dump <- sg('train_clustering', merges)
result <- sg('get_clustering')
merge_distances <- result[[1]]
pairs <- result[[2]]
# In this example the k-means clustering method is used to cluster a given toy
# data set. In k-means clustering one tries to partition n observations into k
# clusters in which each observation belongs to the cluster with the nearest mean.
# The clustering takes the number of clusters and a distance as input; the
# distance used in this example is the Euclidean distance. After training one can
# fetch the result by obtaining the cluster centers and their radii.
library("sg")
fm_train <- as.matrix(read.table('../data/fm_train_real.dat'))
# KMEANS
print('KMeans')
k <- 3
iter <- 1000
dump <- sg('set_distance', 'EUCLIDIAN', 'REAL')
dump <- sg('set_features', 'TRAIN', fm_train)
dump <- sg('new_clustering', 'KMEANS')
dump <- sg('train_clustering', k, iter)
result <- sg('get_clustering')
radi <- result[[1]]
centers <- result[[2]]
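# A hedged sketch to inspect the fitted model: 'centers' is assumed to hold
# one cluster center per column and 'radi' one radius per cluster.
print(dim(centers))
print(radi)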
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'BRAYCURTIS'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two matrices is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see doc/classshogun_1_1CBrayCurtisDistance.html.
#
# Obviously, using the Bray Curtis distance is not limited to this showcase
# example.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# BrayCurtis Distance
print('BrayCurtisDistance')
dump <- sg('set_distance', 'BRAYCURTIS', 'REAL')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dm <- sg('get_distance_matrix', 'TEST')
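# A hedged sanity check, not part of the original example: the 'TRAIN'
# distance matrix is square and symmetric with a zero diagonal, up to
# numerical precision.
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm_train <- sg('get_distance_matrix', 'TRAIN')
print(max(abs(dm_train - t(dm_train))))
print(max(abs(diag(dm_train))))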
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'CANBERRA'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (dissimilarity ratio) matrix is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (dissimilarity ratio)
# matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see doc/classshogun_1_1CCanberraMetric.html.
#
# Obviously, using the Canberra distance is not limited to this showcase
# example.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Canberra Metric
print('CanberraMetric')
dump <- sg('set_distance', 'CANBERRA', 'REAL')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dm <- sg('get_distance_matrix', 'TEST')
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored data sets in 'STRING' representation
# (feature type 'CHAR' with alphabet 'DNA') from different files and
# initializes the distance to 'CANBERRA' with feature type 'WORD'.
#
# Data points in this example are defined by the transformation function
# 'convert' and the preprocessing step applied afterwards (defined by
# 'add_preproc' and preprocessor 'SORTWORDSTRING').
#
# The target 'TRAIN' for 'set_features' controls the binding of the given
# data points. In order to compute a pairwise distance matrix by
# 'get_distance_matrix', we have to perform two preprocessing steps for
# input data 'TRAIN'. The method 'convert' transforms the input data to
# a string representation suitable for the selected distance. The individual
# strings are sorted in ascending order after the execution of 'attach_preproc'.
# A pairwise distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the binding of the given
# data points 'TRAIN' and 'TEST'. In order to compute a pairwise distance
# matrix between these two data sets by 'get_distance_matrix', we have to
# perform two preprocessing steps for input data 'TEST'. The method 'convert'
# transforms the input data 'TEST' to a string representation suitable for
# the selected distance. The individual strings are sorted in ascending order
# after the execution of 'attach_preproc'. A pairwise distance matrix between
# the data sets 'TRAIN' and 'TEST' is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see
# doc/classshogun_1_1CSortWordString.html,
# doc/classshogun_1_1CPreProc.html,
# doc/classshogun_1_1CStringFeatures.html (method obtain_from_char_features) and
# doc/classshogun_1_1CCanberraWordDistance.html.
#
# Obviously, using the Canberra word distance is not limited to this showcase
# example.
library("sg")
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
order <- 3
gap <- 0
reverse <- 'n'
# Canberra Word Distance
print('CanberraWordDistance')
dump <- sg('set_distance', 'CANBERRA', 'WORD')
dump <- sg('add_preproc', 'SORTWORDSTRING')
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TRAIN')
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
dump <- sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TEST')
dm <- sg('get_distance_matrix', 'TEST')
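# An illustrative plain-R analogue of the conversion step above (an
# assumption for exposition only): 'convert' with order=3 enumerates the
# overlapping 3-mers of each string, which the SORTWORDSTRING preprocessor
# then sorts.
kmers <- function(s, k) substring(s, 1:(nchar(s) - k + 1), k:nchar(s))
print(kmers(as.character(fm_train_dna[1]), order))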
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'CHEBYSHEW'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix (maximum of absolute feature
# dimension differences) is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix (maximum
# of absolute feature dimension differences) between these two data sets is
# computed.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see doc/classshogun_1_1CChebyshewMetric.html.
#
# Obviously, using the Chebyshev distance is not limited to this showcase
# example.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Chebyshew Metric
print('ChebyshewMetric')
dump <- sg('set_distance', 'CHEBYSHEW', 'REAL')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dm <- sg('get_distance_matrix', 'TEST')
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'CHISQUARE'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two matrices is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see doc/classshogun_1_1CChiSquareDistance.html.
#
# Obviously, using the ChiSquare distance is not limited to this showcase
# example.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# ChiSquare Distance
print('ChiSquareDistance')
dump <- sg('set_distance', 'CHISQUARE', 'REAL')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dm <- sg('get_distance_matrix', 'TEST')
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'COSINE'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see doc/classshogun_1_1CCosineDistance.html.
#
# Obviously, using the Cosine distance is not limited to this showcase
# example.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Cosine Distance
print('CosineDistance')
dump <- sg('set_distance', 'COSINE', 'REAL')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dm <- sg('get_distance_matrix', 'TEST')
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'EUCLIDIAN'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see doc/classshogun_1_1CEuclidianDistance.html.
#
# Obviously, using the Euclidean distance is not limited to this showcase
# example.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Euclidian Distance
print('EuclidianDistance')
dump <- sg('set_distance', 'EUCLIDIAN', 'REAL')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dm <- sg('get_distance_matrix', 'TEST')
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'GEODESIC'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (shortest path on a sphere) matrix is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (shortest path on
# a sphere) matrix between these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see doc/classshogun_1_1CGeodesicMetric.html.
#
# Obviously, using the Geodesic distance is not limited to this showcase
# example.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Geodesic Metric
print('GeodesicMetric')
dump <- sg('set_distance', 'GEODESIC', 'REAL')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dm <- sg('get_distance_matrix', 'TEST')
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored data sets in 'STRING' representation
# (feature type 'CHAR' with alphabet 'DNA') from different files and
# initializes the distance to 'HAMMING' with feature type 'WORD'.
#
# Data points in this example are defined by the transformation function
# 'convert' and the preprocessing step applied afterwards (defined by
# 'add_preproc' and preprocessor 'SORTWORDSTRING').
#
# The target 'TRAIN' for 'set_features' controls the binding of the given
# data points. In order to compute a pairwise distance matrix by
# 'get_distance_matrix', we have to perform two preprocessing steps for
# input data 'TRAIN'. The method 'convert' transforms the input data to
# a string representation suitable for the selected distance. The individual
# strings are sorted in ascending order after the execution of 'attach_preproc'.
# A pairwise distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the binding of the given
# data points 'TRAIN' and 'TEST'. In order to compute a pairwise distance
# matrix between these two data sets by 'get_distance_matrix', we have to
# perform two preprocessing steps for input data 'TEST'. The method 'convert'
# transforms the input data 'TEST' to a string representation suitable for
# the selected distance. The individual strings are sorted in ascending order
# after the execution of 'attach_preproc'. A pairwise distance matrix between
# the data sets 'TRAIN' and 'TEST' is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see
# doc/classshogun_1_1CSortWordString.html,
# doc/classshogun_1_1CPreProc.html,
# doc/classshogun_1_1CStringFeatures.html (method obtain_from_char_features) and
# doc/classshogun_1_1CHammingWordDistance.html.
#
# Obviously, using the Hamming word distance is not limited to this showcase
# example.
library("sg")
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
order <- 3
gap <- 0
reverse <- 'n'
# Hamming Word Distance
print('HammingWordDistance')
dump <- sg('set_distance', 'HAMMING', 'WORD')
dump <- sg('add_preproc', 'SORTWORDSTRING')
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TRAIN')
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
dump <- sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TEST')
dm <- sg('get_distance_matrix', 'TEST')
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'JENSEN'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (divergence measure based on the
# Kullback-Leibler divergence) matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (divergence measure
# based on the Kullback-Leibler divergence) matrix between these two data sets
# is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see doc/classshogun_1_1CJensenMetric.html.
#
# Obviously, using the Jensen-Shannon distance/divergence is not limited to
# this showcase example.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Jensen Metric
print('JensenMetric')
dump <- sg('set_distance', 'JENSEN', 'REAL')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dm <- sg('get_distance_matrix', 'TEST')
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'MANHATTAN'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (sum of absolute feature
# dimension differences) matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (sum of absolute
# feature dimension differences) matrix between these two data sets is
# computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see doc/classshogun_1_1CManhattanMetric.html.
#
# Obviously, using the Manhattan distance is not limited to this showcase
# example.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Manhattan Metric
print('ManhattanMetric')
dump <- sg('set_distance', 'MANHATTAN', 'REAL')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dm <- sg('get_distance_matrix', 'TEST')
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored data sets in 'STRING' representation
# (feature type 'CHAR' with alphabet 'DNA') from different files and
# initializes the distance to 'MANHATTAN' with feature type 'WORD'.
#
# Data points in this example are defined by the transformation function
# 'convert' and the preprocessing step applied afterwards (defined by
# 'add_preproc' and preprocessor 'SORTWORDSTRING').
#
# The target 'TRAIN' for 'set_features' controls the binding of the given
# data points. In order to compute a pairwise distance matrix by
# 'get_distance_matrix', we have to perform two preprocessing steps for
# input data 'TRAIN'. The method 'convert' transforms the input data to
# a string representation suitable for the selected distance. The individual
# strings are sorted in ascending order after the execution of 'attach_preproc'.
# A pairwise distance matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the binding of the given
# data points 'TRAIN' and 'TEST'. In order to compute a pairwise distance
# matrix between these two data sets by 'get_distance_matrix', we have to
# perform two preprocessing steps for input data 'TEST'. The method 'convert'
# transforms the input data 'TEST' to a string representation suitable for
# the selected distance. The individual strings are sorted in ascending order
# after the execution of 'attach_preproc'. A pairwise distance matrix between
# the data sets 'TRAIN' and 'TEST' is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see
# doc/classshogun_1_1CSortWordString.html,
# doc/classshogun_1_1CPreProc.html,
# doc/classshogun_1_1CStringFeatures.html (method obtain_from_char_features) and
# doc/classshogun_1_1CManhattanWordDistance.html.
#
# Obviously, using the Manhattan word distance is not limited to this showcase
# example.
library("sg")
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
order <- 3
gap <- 0
reverse <- 'n'
# Manhattan Word Distance
print('ManhattanWordDistance')
dump <- sg('set_distance', 'MANHATTAN', 'WORD')
dump <- sg('add_preproc', 'SORTWORDSTRING')
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TRAIN')
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
dump <- sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TEST')
dm <- sg('get_distance_matrix', 'TEST')
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'MINKOWSKI' with
# norm 'k'. Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance matrix is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance matrix between
# these two data sets is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see doc/classshogun_1_1CMinkowskiMetric.html.
#
# Obviously, using the Minkowski metric is not limited to this showcase
# example.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Minkowski Metric
print('MinkowskiMetric')
k <- 3
dump <- sg('set_distance', 'MINKOWSKI', 'REAL', k)
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dm <- sg('get_distance_matrix', 'TEST')
# The approach shown below, which processes input data read from a file, is a
# crucial building block for writing your own sample applications.
# This approach is just one example of what can be done using the distance
# functions provided by shogun.
#
# First, you need to determine what type your data will be, because this
# will determine the distance function you can use.
#
# This example loads two stored matrices of real values (feature type 'REAL')
# from different files and initializes the distance to 'TANIMOTO'.
# Each column of the matrices corresponds to one data point.
#
# The target 'TRAIN' for 'set_features' controls the processing of the given
# data points, where a pairwise distance (extended Jaccard coefficient)
# matrix is computed by 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix' and
# target 'TRAIN'.
#
# The target 'TEST' for 'set_features' controls the processing of the given
# data points 'TRAIN' and 'TEST', where a pairwise distance (extended
# Jaccard coefficient) matrix between these two data sets is computed by
# 'get_distance_matrix'.
#
# The resulting distance matrix can be reaccessed by 'get_distance_matrix'
# and target 'TEST'. The 'TRAIN' distance matrix is then no longer available.
#
# For more details see doc/classshogun_1_1CTanimotoDistance.html.
#
# Obviously, using the Tanimoto distance/coefficient is not limited to
# this showcase example.
library("sg")
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Tanimoto Distance
print('TanimotoDistance')
dump <- sg('set_distance', 'TANIMOTO', 'REAL')
dump <- sg('set_features', 'TRAIN', fm_train_real)
dm <- sg('get_distance_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dm <- sg('get_distance_matrix', 'TEST')
# In this example the Histogram algorithm object computes a histogram over all
# 16-bit unsigned integers in the features.
library("sg")
order <- 3
gap <- 0
reverse <- 'n'
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_train_cube <- as.matrix(read.table('../data/fm_train_cube.dat', colClasses=c('character')))
#
# distributions
#
# Histogram
print('Histogram')
# sg('new_distribution', 'HISTOGRAM')
dump <- sg('add_preproc', 'SORTWORDSTRING')
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TRAIN')
# sg('train_distribution')
# histo <- sg('get_histogram')
# num_examples <- 11
# num_param <- sg('get_histogram_num_model_parameters')
# for (i in 0:(num_examples - 1))
#   for (j in 0:(num_param - 1))
#     sg('get_log_derivative', j, i)
# sg('get_log_likelihood')
# sg('get_log_likelihood_sample')
# In this example a hidden Markov model with 3 states and 6 observation symbols
# is trained on a string data set.
library("sg")
order <- 3
gap <- 0
reverse <- 'n'
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_train_cube <- as.matrix(read.table('../data/fm_train_cube.dat', colClasses=c('character')))
# HMM
print('HMM')
N <- 3
M <- 6
order <- 1
hmms <- c()
liks <- c()
dump <- sg('set_features', 'TRAIN', fm_train_cube, 'CUBE')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order)
dump <- sg('new_hmm', N, M)
dump <- sg('bw')
hmm <- sg('get_hmm')
dump <- sg('new_hmm', N, M)
dump <- sg('set_hmm', hmm[[1]], hmm[[2]], hmm[[3]], hmm[[4]])
likelihood <- sg('hmm_likelihood')
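# A hedged note: 'get_hmm' is assumed to return the start probabilities, end
# probabilities, transition matrix and emission matrix, in the order expected
# by 'set_hmm'; 'hmm_likelihood' reports the log-likelihood of the training
# data under the current model.
print(likelihood)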
# Trains an inhomogeneous Markov chain of order 3 on a DNA string data set. Due
# to the structure of the Markov chain it is very similar to an HMM with just one
# chain of connected hidden states, which is why we term it a linear HMM.
library("sg")
order <- 3
gap <- 0
reverse <- 'n'
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_train_cube <- as.matrix(read.table('../data/fm_train_cube.dat', colClasses=c('character')))
# Linear HMM
print('LinearHMM')
# sg('new_distribution', 'LinearHMM')
dump <- sg('add_preproc', 'SORTWORDSTRING')
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TRAIN')
# sg('train_distribution')
# histo <- sg('get_histogram')
# num_examples <- 11
# num_param <- sg('get_histogram_num_model_parameters')
# for (i in 0:(num_examples - 1))
#   for (j in 0:(num_param - 1))
#     sg('get_log_derivative', j, i)
# sg('get_log_likelihood')
# sg('get_log_likelihood_sample')
# This is an example for the initialization of the chi2-kernel on real data, where
# each column of the matrices corresponds to one training/test example.
library("sg")
size_cache <- 10
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# CHI2
print('Chi2')
width <- 1.4
dump <- sg('set_kernel', 'CHI2', 'REAL', size_cache, width)
dump <- sg('set_features', 'TRAIN', fm_train_real)
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
km <- sg('get_kernel_matrix', 'TEST')
# This is an example for the initialization of a combined kernel, which in this
# case is a weighted sum of three kernels on real-valued data. The sub-kernel
# weights are all set to 1.
#
library("sg")
size_cache <- 10
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Combined
print('Combined')
dump <- sg('clean_features', 'TRAIN')
dump <- sg('clean_features', 'TEST')
dump <- sg('set_kernel', 'COMBINED', size_cache)
dump <- sg('add_kernel', 1, 'LINEAR', 'REAL', size_cache)
dump <- sg('add_features', 'TRAIN', fm_train_real)
dump <- sg('add_features', 'TEST', fm_test_real)
dump <- sg('add_kernel', 1, 'GAUSSIAN', 'REAL', size_cache, 1)
dump <- sg('add_features', 'TRAIN', fm_train_real)
dump <- sg('add_features', 'TEST', fm_test_real)
dump <- sg('add_kernel', 1, 'POLY', 'REAL', size_cache, 3, FALSE)
dump <- sg('add_features', 'TRAIN', fm_train_real)
dump <- sg('add_features', 'TEST', fm_test_real)
km <- sg('get_kernel_matrix', 'TRAIN')
km <- sg('get_kernel_matrix', 'TEST')
# This is an example for the initialization of the CommUlongString kernel. This
# kernel sums over k-mer matches (k='order'). For efficient computation a
# preprocessor is used that extracts and sorts all k-mers. If 'use_sign' is set
# to TRUE, each k-mer is counted only once.
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
order <- 3
gap <- 0
reverse <- 'n'
use_sign <- FALSE
normalization <- 'FULL'
# Comm Ulong String
print('CommUlongString')
dump <- sg('add_preproc', 'SORTULONGSTRING')
dump <- sg('set_kernel', 'COMMSTRING', 'ULONG', size_cache, use_sign, normalization)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'ULONG', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TRAIN')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
dump <- sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'ULONG', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TEST')
km <- sg('get_kernel_matrix', 'TEST')
# This is an example for the initialization of the CommWordString kernel (aka
# the Spectrum or n-gram kernel; its name is derived from the unix command comm).
# This kernel sums over k-mer matches (k='order'). For efficient computation a
# preprocessor is used that extracts and sorts all k-mers. If 'use_sign' is set
# to TRUE, each k-mer is counted only once.
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
order <- 3
gap <- 0
reverse <- 'n'
use_sign <- FALSE
normalization <- 'FULL'
# Comm Word String
print('CommWordString')
dump <- sg('add_preproc', 'SORTWORDSTRING')
dump <- sg('set_kernel', 'COMMSTRING', 'WORD', size_cache, use_sign, normalization)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TRAIN')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
dump <- sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TEST')
km <- sg('get_kernel_matrix', 'TEST')
# The constant kernel gives a trivial kernel matrix with all entries set to the same value
# defined by the argument 'c'.
#
library("sg")
size_cache <- 10
fm_train_real <- as.matrix(read.table('../data/fm_train_real.dat'))
fm_test_real <- as.matrix(read.table('../data/fm_test_real.dat'))
# Const
print('Const')
c <- 23.
dump <- sg('set_kernel', 'CONST', 'REAL', size_cache, c)
dump <- sg('set_features', 'TRAIN', fm_train_real)
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
km <- sg('get_kernel_matrix', 'TEST')
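# A hedged sanity check: every entry of the constant kernel matrix should
# equal 'c', so the maximal deviation should be zero.
print(max(abs(km - c)))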
# This is an example for the initialization of the diag-kernel.
# The diag kernel has all kernel matrix entries but those on
# the main diagonal set to zero.
library("sg")
size_cache <- 10
fm_train_real <- as.matrix(read.table('../data/fm_train_real.dat'))
fm_test_real <- as.matrix(read.table('../data/fm_test_real.dat'))
# Diag
print('Diag')
diag <- 23.
dump <- sg('set_kernel', 'DIAG', 'REAL', size_cache, diag)
dump <- sg('set_features', 'TRAIN', fm_train_real)
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
km <- sg('get_kernel_matrix', 'TEST')
# With the distance kernel one can use any of the following distance metrics:
# MINKOWSKI MANHATTAN HAMMING CANBERRA CHEBYSHEW GEODESIC JENSEN CHISQUARE TANIMOTO COSINE BRAYCURTIS EUCLIDIAN
library("sg")
size_cache <- 10
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Distance
print('Distance')
width <- 1.7
dump <- sg('set_distance', 'EUCLIDIAN', 'REAL')
dump <- sg('set_kernel', 'DISTANCE', size_cache, width)
dump <- sg('set_features', 'TRAIN', fm_train_real)
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
km <- sg('get_kernel_matrix', 'TEST')
# The FixedDegree String kernel takes as input two strings of the same size and counts the number of matching substrings of length 'degree'.
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
# Fixed Degree String
print('FixedDegreeString')
degree <- 3
dump <- sg('set_kernel', 'FIXEDDEGREE', 'CHAR', size_cache, degree)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TEST')
# The well-known Gaussian kernel (the Swiss army knife for SVMs) on dense real-valued features.
library("sg")
size_cache <- 10
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Gaussian
print('Gaussian')
width <- 1.9
dump <- sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
dump <- sg('set_features', 'TRAIN', fm_train_real)
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
km <- sg('get_kernel_matrix', 'TEST')
# An experimental kernel inspired by the WeightedDegreePositionStringKernel and
# the Gaussian kernel. The idea is to shift the dimensions of the input vectors
# against each other. 'shift_step' is the step size of the shifts and
# 'max_shift' is the maximal shift.
library("sg")
size_cache <- 10
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# GaussianShift
print('GaussianShift')
width <- 1.8
max_shift <- 2
shift_step <- 1
dump <- sg('set_kernel', 'GAUSSIANSHIFT', 'REAL', size_cache, width, max_shift, shift_step)
dump <- sg('set_features', 'TRAIN', fm_train_real)
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
km <- sg('get_kernel_matrix', 'TEST')
# The HistogramWordString kernel computes the TOP kernel on inhomogeneous Markov chains.
library("sg")
size_cache <- 10
order <- 3
gap <- 0
reverse <- 'n'
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
label_train_dna <- as.numeric(as.matrix(read.table('../data/label_train_dna.dat')))
# PluginEstimate
print('PluginEstimate w/ HistogramWord')
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
dump <- sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
pseudo_pos <- 1e-1
pseudo_neg <- 1e-1
dump <- sg('new_plugin_estimator', pseudo_pos, pseudo_neg)
dump <- sg('set_labels', 'TRAIN', label_train_dna)
dump <- sg('train_estimator')
dump <- sg('set_kernel', 'HISTOGRAM', 'WORD', size_cache)
km <- sg('get_kernel_matrix', 'TRAIN')
# not supported yet
# lab=sg('plugin_estimate_classify')
km <- sg('get_kernel_matrix', 'TEST')
# This is an example for the initialization of a linear kernel on real-valued
# data. The kernel matrix is computed twice, once under SQRTDIAG and once under
# AVGDIAG kernel normalization.
library("sg")
size_cache <- 10
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Linear
print('Linear')
dump <- sg('set_kernel', 'LINEAR', 'REAL', size_cache)
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('set_kernel_normalization', 'SQRTDIAG')
km1 <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_kernel_normalization', 'AVGDIAG')
km2 <- sg('get_kernel_matrix', 'TRAIN')
#dump <- sg('set_features', 'TEST', fm_test_real)
#km <- sg('get_kernel_matrix', 'TEST')
# This is an example for the initialization of a linear kernel on string data. The
# strings are all of the same length and consist of the characters 'ACGT' corresponding
# to the DNA-alphabet. Each column of the matrices of type char corresponds to
# one training/test example.
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
# Linear String
print('LinearString')
dump <- sg('set_kernel', 'LINEAR', 'CHAR', size_cache)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TEST')
# This is an example for the initialization of the local alignment kernel on
# DNA sequences, where each column of the matrices of type char corresponds to
# one training/test example.
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
# Local Alignment String
print('LocalAlignmentString')
dump <- sg('set_kernel', 'LOCALALIGNMENT', 'CHAR', size_cache)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TEST')
# This example initializes the locality improved string kernel. The locality improved string
# kernel is defined on sequences of the same length and inspects letters matching at
# corresponding positions in both sequences. The kernel sums over all matches in windows of
# length l and takes this sum to the power of 'inner_degree'. The sum over all these
# terms along the sequence is taken to the power of 'outer_degree'.
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
# Locality Improved String
print('LocalityImprovedString')
length <- 5
inner_degree <- 5
outer_degree <- inner_degree+2
dump <- sg('set_kernel', 'LIK', 'CHAR', size_cache, length, inner_degree, outer_degree)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TEST')
# This is an example initializing the oligo string kernel which takes distances
# between matching oligos (k-mers) into account via a Gaussian. Variable 'k' defines the length
# of the oligo and variable 'width' the width of the Gaussian. The oligo string kernel is
# implemented for the DNA-alphabet 'ACGT'.
#
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
# Oligo String
print('OligoString')
k <- 3
width <- 1.2
dump <- sg('set_kernel', 'OLIGO', 'CHAR', size_cache, k, width)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TEST')
# This example initializes the polynomial kernel with real data.
# If the variable 'inhomogene' is 'true', +1 is added to the scalar product
# before taking it to the power of 'degree'. If 'use_normalization' is
# set to 'true', the kernel matrix will be normalized by the square roots
# of the diagonal entries.
library("sg")
size_cache <- 10
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Poly
print('Poly')
degree <- 4
inhomogene <- FALSE
use_normalization <- TRUE
dump <- sg('set_kernel', 'POLY', 'REAL', size_cache, degree, inhomogene, use_normalization)
dump <- sg('set_features', 'TRAIN', fm_train_real)
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
km <- sg('get_kernel_matrix', 'TEST')
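# For intuition, one entry of the polynomial kernel can be sketched in plain R
# (assuming k(x,y) = (x.y + c)^degree with c = 1 if inhomogene, and
# normalization k(x,y)/sqrt(k(x,x)*k(y,y)); shogun's exact conventions may
# differ slightly).
poly_k <- function(x, y, degree, inhomogene) (sum(x * y) + as.numeric(inhomogene))^degree
x1 <- fm_train_real[,1]
x2 <- fm_train_real[,2]
k12 <- poly_k(x1, x2, degree, inhomogene)
if (use_normalization) k12 <- k12 / sqrt(poly_k(x1, x1, degree, inhomogene) * poly_k(x2, x2, degree, inhomogene))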
# This is an example for the initialization of the PolyMatchString kernel on string data.
# The PolyMatchString kernel sums over the matches of two strings of the same length and
# takes the sum to the power of 'degree'. The strings consist of the characters 'ACGT' corresponding
# to the DNA-alphabet. Each column of the matrices of type char corresponds to
# one training/test example.
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
# Poly Match String
print('PolyMatchString')
degree <- 3
inhomogene <- FALSE
dump <- sg('set_kernel', 'POLYMATCH', 'CHAR', size_cache, degree, inhomogene)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TEST')
# The standard Sigmoid kernel computed on dense real valued features.
library("sg")
size_cache <- 10
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# Sigmoid
print('Sigmoid')
gamma <- 1.2
coef0 <- 1.3
dump <- sg('set_kernel', 'SIGMOID', 'REAL', size_cache, gamma, coef0)
dump <- sg('set_features', 'TRAIN', fm_train_real)
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
km <- sg('get_kernel_matrix', 'TEST')
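# A one-entry plain-R sketch, assuming the standard sigmoid form
# k(x,y) = tanh(gamma * x.y + coef0):
sigmoid_k <- function(x, y, gamma, coef0) tanh(gamma * sum(x * y) + coef0)
sigmoid_k(fm_train_real[,1], fm_train_real[,2], gamma, coef0)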
# The SimpleLocalityImprovedString kernel is a "simplified" and better-performing version of the locality improved string kernel.
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
# Simple Locality Improved String
print('SimpleLocalityImprovedString')
length <- 5
inner_degree <- 5
outer_degree <- inner_degree+2
dump <- sg('set_kernel', 'SLIK', 'CHAR', size_cache, length, inner_degree, outer_degree)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TEST')
# The CommWordString kernel may be used to compute the spectrum kernel from strings that have been mapped into unsigned 16bit integers.
# These 16bit integers correspond to k-mers. To be applicable in this kernel they need to be sorted (e.g. via the SortWordString pre-processor).
# It basically uses the algorithm in the unix "comm" command (hence the name) to compute the kernel function.
# In the underlying feature vector each entry denotes how often the k-mer appears in that sequence. Note that this representation is especially
# tuned to small alphabets (like the 2-bit alphabet DNA), for which it enables spectrum kernels of order up to 8. For this kernel the linadd
# speedups are implemented (though there is room for improvement here when a whole set of sequences is ADDed) using sorted lists.
library(sg)
traindat <- c("AGTAA", "CGCCC", "GGCGG", "TGTCT")
trainlab <- c(1,-1,-1,1)
testdat <- c("AGCAA", "CCCCC", "GGGGG", "TGCTT")
order <- 2
C <- 1.0
sg('loglevel', 'ALL')
sg('use_linadd', TRUE)
sg('mkl_parameters', 1e-5, 0)
sg('svm_epsilon', 1e-4)
sg('clean_features', 'TRAIN')
sg('clean_kernel')
sg('set_features', 'TRAIN', traindat, 'DNA')
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1)
sg('add_preproc', 'SORTWORDSTRING')
sg('attach_preproc', 'TRAIN')
sg('set_labels', 'TRAIN', trainlab)
sg('new_classifier', 'SVMLIGHT')
sg('set_kernel', 'COMMSTRING', 'WORD', 10, TRUE, 'FULL')
sg('c', C)
km <- sg('get_kernel_matrix', 'TRAIN')
sg('train_classifier')
svmAsList <- sg('get_svm')
sg('set_features', 'TEST', testdat, 'DNA')
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1)
sg('attach_preproc', 'TEST')
sg('init_kernel_optimization')
valout <- sg('classify')
# The WeightedCommWordString kernel may be used to compute the weighted
# spectrum kernel (i.e. a spectrum kernel for 1 to K-mers, where each k-mer
# length is weighted by some coefficient beta_k) from strings that have
# been mapped into unsigned 16bit integers.
#
# These 16bit integers correspond to k-mers. To be applicable in this kernel
# they need to be sorted (e.g. via the SortWordString pre-processor).
#
# It basically uses the algorithm in the unix "comm" command (hence the name)
# to compute
#
# k(x, x') = \sum_{k=1}^K \beta_k \Phi_k(x) \cdot \Phi_k(x')
#
# where \Phi_k maps a sequence x that consists of letters in \Sigma to a
# feature vector of size |\Sigma|^k. In this feature vector each entry
# denotes how often the k-mer appears in that x.
#
# Note that this representation is especially tuned to small alphabets
# (like the 2-bit alphabet DNA), for which it enables spectrum kernels
# of order 8.
#
# For this kernel the linadd speedups are quite efficiently implemented using
# direct maps.
#
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
order <- 3
gap <- 0
reverse <- 'n'
use_sign <- FALSE
normalization <- 'FULL'
# Weighted Comm Word String
print('WeightedCommWordString')
dump <- sg('add_preproc', 'SORTWORDSTRING')
dump <- sg('set_kernel', 'WEIGHTEDCOMMSTRING', 'WORD', size_cache, use_sign, normalization)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TRAIN')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
dump <- sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TEST')
km <- sg('get_kernel_matrix', 'TEST')
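# A toy plain-R sketch of the (unweighted) spectrum idea, for intuition only:
# it counts shared k-mers between two character strings. This is a naive
# simplification of what the sorted comm-style algorithm above computes
# efficiently, and it omits the per-length weighting.
kmers <- function(s, k) sapply(1:(nchar(s) - k + 1), function(i) substr(s, i, i + k - 1))
spectrum_k <- function(s1, s2, k) {
t1 <- table(kmers(s1, k))
t2 <- table(kmers(s2, k))
shared <- intersect(names(t1), names(t2))
sum(as.numeric(t1[shared]) * as.numeric(t2[shared]))
}
spectrum_k("ACGTACGT", "ACGTTTTT", 3)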
# The Weighted Degree Position String kernel (Weighted Degree kernel with shifts).
#
# The WD-shift kernel of order d compares two sequences X and
# Y of length L by summing all contributions of k-mer matches of
# lengths k in 1...d, weighted by coefficients beta_k
# allowing for a positional tolerance of up to shift s.
#
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
# Weighted Degree Position String
print('WeightedDegreePositionString')
degree <- 20
dump <- sg('set_kernel', 'WEIGHTEDDEGREEPOS', 'CHAR', size_cache, degree)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TEST')
# The Weighted Degree String kernel.
#
# The WD kernel of order d compares two sequences X and
# Y of length L by summing all contributions of k-mer matches of
# lengths k in 1...d, weighted by coefficients beta_k. It
# is defined as
#
# k(X, Y) = \sum_{k=1}^d \beta_k \sum_{l=1}^{L-k+1} I(u_{k,l}(X) = u_{k,l}(Y)).
#
# Here, u_{k,l}(X) is the string of length k starting at position
# l of the sequence X and I(.) is the indicator function
# which evaluates to 1 when its argument is true and to 0
# otherwise.
#
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
# Weighted Degree String
print('WeightedDegreeString')
degree <- 20
dump <- sg('set_kernel', 'WEIGHTEDDEGREE', 'CHAR', size_cache, degree)
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
km <- sg('get_kernel_matrix', 'TEST')
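# A naive plain-R implementation of the WD formula above, for illustration.
# The weighting beta_k = 2*(d-k+1)/(d*(d+1)) is an assumption (a commonly
# used default); shogun's actual default weights may differ.
wd_k <- function(x, y, d) {
L <- nchar(x)
total <- 0
for (k in 1:d) {
beta_k <- 2 * (d - k + 1) / (d * (d + 1))
for (l in 1:(L - k + 1)) {
if (substr(x, l, l + k - 1) == substr(y, l, l + k - 1)) total <- total + beta_k
}
}
total
}
wd_k("ACGTACGT", "ACGAACGT", 3)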
# This script should enable you to rerun the experiment in the
# paper that we labeled with "christmas star".
#
# The task is to classify two star-shaped classes that share the
# midpoint. The difficulty of the learning problem depends on the
# distance between the classes, which is varied.
#
# Our model selection leads to a choice of C = 0.5. The model
# selection is not repeated inside this script.
library(sg)
# Preliminary settings:
C <- 0.5 # SVM Parameter
cache_size <- 50 # cache per kernel in MB
svm_eps<-1e-3 # svm epsilon
mkl_eps<-1e-3 # mkl epsilon
no_obs <- 20 # number of observations / data points (sum for train and test and both classes)
k_star <- 20 # number of "leaves" of the stars
alpha <- 0.3 # noise level of the data
radius_star <- matrix(0, length(seq(4.1, 10, 0.2)), 2)
radius_star[,1] <- seq(4.1, 10, 0.2) # increasing radius of the first class
radius_star[,2] <- 4 # fixed radius of the second class
# distance between the classes: radius_star[,1] - radius_star[,2]
rbf_width <- c(0.01, 0.1, 1, 10, 1000) # different width for the five used rbf kernels
####
#### Great loop: train MKL for every data set (the different distances between the stars)
####
sg('loglevel', 'ERROR')
sg('echo', 'OFF')
w <- matrix(0, dim(radius_star)[1], length(rbf_width))
result.trainout <- matrix(0, dim(radius_star)[1], 2*no_obs)
result.testout <- matrix(0, dim(radius_star)[1], 2*no_obs)
result.trainerr <- matrix(0, dim(radius_star)[1], 1)
result.testerr <- matrix(0, dim(radius_star)[1], 1)
for (kk in 1:dim(radius_star)[1]) {
# data generation
print(sprintf('MKL for radius %+02.2f ', radius_star[kk,1]))
dummy <- matrix(0, 2, 4*no_obs)
dummy[1,] <- runif(4*no_obs)
noise <- alpha*rnorm(4*no_obs)
dummy[2,] <- sin(k_star*pi*dummy[1,]) + noise # sine
dummy[2,1:(2*no_obs)] <- dummy[2,1:(2*no_obs)] + radius_star[kk,1] # distance shift: first class
dummy[2,(2*no_obs+1):dim(dummy)[2]] <- dummy[2,(2*no_obs+1):dim(dummy)[2]] + radius_star[kk,2] # distance shift: second class
dummy[1,] <- 2*pi*dummy[1,]
x <- matrix(0, dim(dummy)[1], dim(dummy)[2])
x[1,] <- dummy[2,]*sin(dummy[1,])
x[2,] <- dummy[2,]*cos(dummy[1,])
train_y <- c(rep(-1, no_obs), rep(1, no_obs))
test_y <- c(rep(-1, no_obs), rep(1, no_obs))
train_x <- x[,seq(1,dim(x)[2],2)]
test_x <- x[,seq(2,dim(x)[2],2)]
rm('dummy', 'x')
# train MKL
sg('clean_kernel')
sg('clean_features', 'TRAIN')
sg('add_features','TRAIN', train_x) # set a trainingset for every SVM
sg('add_features','TRAIN', train_x)
sg('add_features','TRAIN', train_x)
sg('add_features','TRAIN', train_x)
sg('add_features','TRAIN', train_x)
sg('set_labels','TRAIN', train_y) # set the labels
sg('new_classifier', 'MKL_CLASSIFICATION')
sg('mkl_parameters', mkl_eps, 0)
sg('svm_epsilon', svm_eps)
sg('set_kernel', 'COMBINED', 0)
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[1])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[2])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[3])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[4])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[5])
sg('c', C)
sg('train_classifier')
alphas <- sg('get_svm')[[2]]
w[kk,] <- sg('get_subkernel_weights')
# calculate train error
sg('clean_features', 'TEST')
sg('add_features','TEST',train_x)
sg('add_features','TEST',train_x)
sg('add_features','TEST',train_x)
sg('add_features','TEST',train_x)
sg('add_features','TEST',train_x)
sg('set_labels','TEST', train_y)
sg('set_threshold', 0)
result.trainout[kk,]<-sg('classify')
result.trainerr[kk] <- mean(train_y!=sign(result.trainout[kk,]))
# calculate test error
sg('clean_features', 'TEST')
sg('add_features','TEST',test_x)
sg('add_features','TEST',test_x)
sg('add_features','TEST',test_x)
sg('add_features','TEST',test_x)
sg('add_features','TEST',test_x)
sg('set_labels','TEST',test_y)
sg('set_threshold', 0)
result.testout[kk,]<-sg('classify')
result.testerr[kk] <- mean(test_y!=sign(result.testout[kk,]))
}
cat('done. w now contains the kernel weightings; result.trainout/result.testout and result.trainerr/result.testerr contain the train/test outputs and errors.\n')
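# In this example multiple kernel learning (MKL) is used for multiclass
# classification: a combined kernel consisting of a linear, a Gaussian and a
# polynomial kernel is trained on a toy multiclass data set and the trained
# classifier is used to predict labels of test examples.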
library("sg")
size_cache <- 10
C <- 1.2
epsilon <- 1e-5
mkl_eps <- 0.01
mkl_norm <- 1.5
width <- 1.2
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
label_train_multiclass <- as.real(as.matrix(read.table('../data/label_train_multiclass.dat')))
# MKL_MULTICLASS
print('MKL_MULTICLASS')
dump <- sg('clean_features', 'TRAIN')
dump <- sg('clean_features', 'TEST')
dump <- sg('set_kernel', 'COMBINED', size_cache)
dump <- sg('add_kernel', 1, 'LINEAR', 'REAL', size_cache)
dump <- sg('add_features', 'TRAIN', fm_train_real)
dump <- sg('add_features', 'TEST', fm_test_real)
dump <- sg('add_kernel', 1, 'GAUSSIAN', 'REAL', size_cache, width)
dump <- sg('add_features', 'TRAIN', fm_train_real)
dump <- sg('add_features', 'TEST', fm_test_real)
dump <- sg('add_kernel', 1, 'POLY', 'REAL', size_cache, 2)
dump <- sg('add_features', 'TRAIN', fm_train_real)
dump <- sg('add_features', 'TEST', fm_test_real)
dump <- sg('set_labels', 'TRAIN', label_train_multiclass)
dump <- sg('new_classifier', 'MKL_MULTICLASS')
dump <- sg('svm_epsilon', epsilon)
dump <- sg('c', C)
dump <- sg('mkl_parameters', mkl_eps, 0, mkl_norm)
dump <- sg('train_classifier')
result <- sg('classify')
# This script should enable you to rerun the experiment in the
# paper that we labeled "mixture linear and sine".
#
# The task is to learn a regression function where the true function
# is given by a mixture of 2 sine waves in addition to a linear trend.
# We vary the frequency of the second higher frequency sine wave.
# Setup: MKL on 10 RBF kernels of different widths on 1000 examples
# load shogun
library(sg)
# kernel width for 10 basic SVMs
rbf_width <- array(0.0, dim=c(1,10))
rbf_width[1] <- 0.001
rbf_width[2] <- 0.005
rbf_width[3] <- 0.01
rbf_width[4] <- 0.05
rbf_width[5] <- 0.1
rbf_width[6] <- 1
rbf_width[7] <- 10
rbf_width[8] <- 50
rbf_width[9] <- 100
rbf_width[10] <- 1000
# SVM parameter
C <- 1
cache_size <- 50
mkl_eps <- 1e-4
svm_eps <- 1e-4
svm_tube <- 0.01
debug <- 0
# data
f <- c(0:20) # parameter that varies the frequency of the second sine wave
#sg('loglevel', 'ALL')
#sg('echo', 'ON')
weights <- array(0.0, dim=c(21,10))
no_obs <- 10 # number of observations
stepsize <- (4*pi)/(no_obs-1)
train_x <- (0:(no_obs-1)) * stepsize
trend <- 2 * train_x* ((pi)/(max(train_x)-min(train_x)))
wave1 <- sin(train_x)
wave2 <- sin(f[1]*train_x)
train_y <- trend + wave1 + wave2
train_x <- matrix(train_x, 1, length(train_x))
weights <- matrix(0, length(f), length(rbf_width))
for (kk in c(1:length(f))) { #Big loop
#data generation
wave1 <- sin(train_x)
wave2 <- sin(f[kk]*train_x)
train_y <- trend + wave1 + wave2
#MK Learning
sg('new_classifier', 'MKL_REGRESSION')
sg('mkl_parameters', mkl_eps, 0)
sg('c', C)
sg('svm_epsilon', svm_eps)
sg('svr_tube_epsilon', svm_tube)
sg('clean_features', 'TRAIN')
sg('clean_kernel')
sg('set_labels', 'TRAIN', train_y) #set labels
sg('add_features', 'TRAIN', train_x) #add features for every basic SVM
sg('add_features', 'TRAIN', train_x)
sg('add_features', 'TRAIN', train_x)
sg('add_features', 'TRAIN', train_x)
sg('add_features', 'TRAIN', train_x)
sg('add_features', 'TRAIN', train_x)
sg('add_features', 'TRAIN', train_x)
sg('add_features', 'TRAIN', train_x)
sg('add_features', 'TRAIN', train_x)
sg('add_features', 'TRAIN', train_x)
sg('set_kernel', 'COMBINED', 0)
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[1])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[2])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[3])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[4])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[5])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[6])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[7])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[8])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[9])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[10])
sg('train_classifier')
weights[kk,] <- sg('get_subkernel_weights')
cat("frequency:", f[kk], " rbf-kernel-weights: ", weights[kk,], "\n")
}
# This script should enable you to rerun the experiment in the
# paper that we labeled "sine".
#
# In this regression task a sine wave is to be learned.
# We vary the frequency of the wave.
# Preliminary settings:
library(sg)
# Parameter for the SVMs.
C <- 10 # obtained via model selection (not included in the script)
cache_size <- 10
mkl_eps <- 1e-3 # threshold for precision
svm_eps <- 1e-3
svr_tube_eps <- 1e-2
debug <- 0
# Kernel width for the 5 "basic" SVMs
rbf_width <- c(0.005, 0.05, 0.5, 1, 10)
# data
f <- seq(0.1, 5, 0.2) # values for the different frequencies
no_obs <- 10 # number of observations
if (debug) {
sg('loglevel', 'ALL')
sg('echo', 'ON')
} else {
sg('loglevel', 'ERROR')
sg('echo', 'OFF')
}
weights <- matrix(0, length(f), length(rbf_width))
for (kk in 1:length(f)) { # big loop for the different learning problems
# data generation
train_x <- seq(1,10*2*pi, (((10*2*pi)-1)/(no_obs-1)))
train_y <- sin(f[kk]*train_x)
train_x <- matrix(train_x, 1, length(train_x))
# initialize MKL-SVR
sg('new_classifier', 'MKL_REGRESSION')
sg('mkl_parameters', mkl_eps, 0)
sg('c', C)
sg('svm_epsilon', svm_eps)
sg('svr_tube_epsilon', svr_tube_eps)
sg('clean_features', 'TRAIN')
sg('clean_kernel')
sg('set_labels', 'TRAIN', train_y) # set labels
sg('add_features', 'TRAIN', train_x) # add features for every SVR
sg('add_features', 'TRAIN', train_x)
sg('add_features', 'TRAIN', train_x)
sg('add_features', 'TRAIN', train_x)
sg('add_features', 'TRAIN', train_x)
sg('set_kernel', 'COMBINED', 0)
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[1])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[2])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[3])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[4])
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width[5])
sg('svm_train')
weights[kk,] <- sg('get_subkernel_weights')
dummy <- print(sprintf('frequency: %02.2f rbf-kernel-weights: %02.2f %02.2f %02.2f %02.2f %02.2f',
f[kk], weights[kk,1], weights[kk,2], weights[kk,3], weights[kk,4], weights[kk,5]))
}
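# In this example multiple kernel learning (MKL) is applied to a weighted
# degree string kernel on randomly generated DNA sequences, in which a few
# fixed positions are made informative for the positive class. An SVM is
# trained with linadd optimization and the subkernel weights and test error
# are computed.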
library(sg)
acgt <- c("A","C","G","T")
# generate random labels and random DNA sequences (one sequence per column)
LT <- sign(rnorm(1000))
XT <- array("", dim=c(100,1000))
for (i in 1:length(XT)) {
XT[i] <- acgt[ceiling(4 * (rnorm(1) %% 1))]
}
# make positions 30, 60 and 61 informative for the positive class
for (k in c(30,60,61)) {
for (i in 1:length(XT[k,])) {
if (LT[i] == 1) {
XT[k,i] <- "A"
}
}
}
idx <- sample(1:1000)
XTE <- XT[,idx[1:200]]
LTE <- LT[idx[1:200]]
XT <- XT[,idx[201:1000]]
LT <- LT[idx[201:1000]]
center_idx <- 50
degree <- 3
mismatch <- 0
C <- 1
#sg('loglevel', 'ALL')
sg('use_linadd', TRUE)
sg('mkl_parameters', 1e-5, 1)
sg('svm_epsilon', 1e-6)
sg('clean_features', 'TRAIN')
sg('clean_kernel')
sg('new_classifier', 'MKL_CLASSIFICATION')
sg('set_labels', 'TRAIN', LT)
sg('set_features', 'TRAIN', XT, 'DNA')
sg('set_kernel', 'WEIGHTEDDEGREE', 'CHAR', 10, degree, mismatch, FALSE, 1)
sg('c', C)
sg('svm_train')
svmAsList <- sg('get_svm')
beta <- sg('get_subkernel_weights')
sg('init_kernel_optimization')
sg('clean_features', 'TEST')
sg('set_features', 'TEST', XTE, 'DNA')
output_xte <- sg('classify')
w <- sg('get_subkernel_weights')
err <- mean(sign(output_xte)!=LTE)
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# LogPlusOne adds one to a dense real-valued vector and takes the logarithm of
# each component of it. It is most useful in situations where the inputs are
# counts: when comparing small counts, any difference may matter a lot, while
# small differences between large counts don't. This is what the log
# transformation controls for.
library("sg")
size_cache <- 10
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
width <- 1.4
# LogPlusOne
print('LogPlusOne')
dump <- sg('add_preproc', 'LOGPLUSONE')
dump <- sg('set_kernel', 'CHI2', 'REAL', size_cache, width)
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('attach_preproc', 'TRAIN')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dump <- sg('attach_preproc', 'TEST')
km <- sg('get_kernel_matrix', 'TEST')
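# The preprocessor itself is just the transform x -> log(x + 1). A
# self-contained plain-R sketch of one Chi2 kernel entry on toy count data
# (assuming k(x,y) = exp(-sum((x-y)^2/(x+y)) / width); shogun's exact
# convention may differ, and a small constant guards against division by zero):
x <- log(c(1, 0, 3, 7) + 1)
y <- log(c(2, 1, 3, 5) + 1)
exp(-sum((x - y)^2 / (x + y + 1e-10)) / width)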
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# NormOne normalizes vectors to have norm 1.
library("sg")
size_cache <- 10
width <- 2.1
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
# NormOne
print('NormOne')
dump <- sg('add_preproc', 'NORMONE')
dump <- sg('set_kernel', 'CHI2', 'REAL', size_cache, width)
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('attach_preproc', 'TRAIN')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dump <- sg('attach_preproc', 'TEST')
km <- sg('get_kernel_matrix', 'TEST')
# In this example a kernel matrix is computed for a given real-valued data set.
# The kernel used is the Chi2 kernel which operates on real-valued vectors. It
# computes the chi-squared distance between sets of histograms. It is a very
# useful distance in image recognition (used to detect objects). The preprocessor
# PruneVarSubMean subtracts the mean from each feature and removes features
# that have zero variance.
library("sg")
size_cache <- 10
width <- 2.1
fm_train_real <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test_real <- t(as.matrix(read.table('../data/fm_test_real.dat')))
# PruneVarSubMean
print('PruneVarSubMean')
divide_by_std <- TRUE
dump <- sg('add_preproc', 'PRUNEVARSUBMEAN', divide_by_std)
dump <- sg('set_kernel', 'CHI2', 'REAL', size_cache, width)
dump <- sg('set_features', 'TRAIN', fm_train_real)
dump <- sg('attach_preproc', 'TRAIN')
km <- sg('get_kernel_matrix', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_real)
dump <- sg('attach_preproc', 'TEST')
km <- sg('get_kernel_matrix', 'TEST')
# In this example a kernel matrix is computed for a given string data set. The
# CommUlongString kernel is used to compute the spectrum kernel from strings that
# have been mapped into unsigned 64bit integers. These 64bit integers correspond
# to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted.
# This is done using the SortUlongString preprocessor, which sorts the individual
# strings in ascending order. The kernel function basically uses the algorithm in
# the unix "comm" command (hence the name). Note that this representation enables
# spectrum kernels of order 8 for 8bit alphabets (like binaries) and order 32 for
# 2-bit alphabets like DNA. For this kernel the linadd speedups are implemented
# (though there is room for improvement here when a whole set of sequences is
# ADDed) using sorted lists.
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
order <- 3
gap <- 0
reverse <- 'n'
use_sign <- FALSE
normalization <- 'FULL'
# Comm Ulong String
print('CommUlongString')
dump <- sg('add_preproc', 'SORTULONGSTRING')
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'ULONG', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
dump <- sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'ULONG', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TEST')
dump <- sg('set_kernel', 'COMMSTRING', 'ULONG', size_cache, use_sign, normalization)
km <- sg('get_kernel_matrix', 'TRAIN')
km <- sg('get_kernel_matrix', 'TEST')
# In this example a kernel matrix is computed for a given string data set. The
# CommWordString kernel is used to compute the spectrum kernel from strings that
# have been mapped into unsigned 16bit integers. These 16bit integers correspond
# to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted.
# This is done using the SortWordString preprocessor, which sorts the individual
# strings in ascending order. The kernel function basically uses the algorithm in
# the unix "comm" command (hence the name). Note that this representation is
# especially tuned to small alphabets (like the 2-bit alphabet DNA), for which it
# enables spectrum kernels of order up to 8. For this kernel the linadd speedups
# are quite efficiently implemented using direct maps.
library("sg")
size_cache <- 10
fm_train_dna <- as.matrix(read.table('../data/fm_train_dna.dat'))
fm_test_dna <- as.matrix(read.table('../data/fm_test_dna.dat'))
order <- 3
gap <- 0
reverse <- 'n'
use_sign <- FALSE
normalization <- 'FULL'
# Comm Word String
print('CommWordString')
dump <- sg('add_preproc', 'SORTWORDSTRING')
dump <- sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
dump <- sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TRAIN')
dump <- sg('set_features', 'TEST', fm_test_dna, 'DNA')
dump <- sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
dump <- sg('attach_preproc', 'TEST')
dump <- sg('set_kernel', 'COMMSTRING', 'WORD', size_cache, use_sign, normalization)
km <- sg('get_kernel_matrix', 'TRAIN')
km <- sg('get_kernel_matrix', 'TEST')
# In this example a kernelized version of ridge regression (KRR) is trained on a
# real-valued data set. The KRR is trained with regularization parameter tau=1e-6
# and a Gaussian kernel with width=2.1.
library("sg")
size_cache <- 10
C <- 10
tube_epsilon <- 1e-2
width <- 2.1
fm_train <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test <- t(as.matrix(read.table('../data/fm_test_real.dat')))
label_train <- as.real(as.matrix(read.table('../data/label_train_twoclass.dat')))
# KRR
print('KRR')
tau <- 1e-6
dump <- sg('set_features', 'TRAIN', fm_train)
dump <- sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
dump <- sg('set_labels', 'TRAIN', label_train)
dump <- sg('new_regression', 'KRR')
dump <- sg('krr_tau', tau)
dump <- sg('c', C)
dump <- sg('train_regression')
dump <- sg('set_features', 'TEST', fm_test)
result <- sg('classify')
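# For intuition, KRR has a closed-form solution that can be sketched in plain
# R (assuming the usual formulation alpha = (K + tau*I)^-1 y and prediction
# f(x) = sum_i alpha_i k(x_i, x); shogun's exact conventions, e.g. bias
# handling, may differ):
gk <- function(A, B, width) outer(1:ncol(A), 1:ncol(B), Vectorize(function(i, j)
exp(-sum((A[,i] - B[,j])^2) / width)))
K <- gk(fm_train, fm_train, width)
alpha <- solve(K + tau * diag(ncol(K)), label_train)
pred <- t(gk(fm_train, fm_test, width)) %*% alpha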
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# LIBSVM. The SVR is trained with regularization parameter C=10 and a Gaussian
# kernel with width=2.1.
#
# For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/ .
library("sg")
size_cache <- 10
C <- 10
tube_epsilon <- 1e-2
width <- 2.1
fm_train <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test <- t(as.matrix(read.table('../data/fm_test_real.dat')))
label_train <- as.real(as.matrix(read.table('../data/label_train_twoclass.dat')))
# LibSVR
print('LibSVR')
dump <- sg('set_features', 'TRAIN', fm_train)
dump <- sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
dump <- sg('set_labels', 'TRAIN', label_train)
dump <- sg('new_regression', 'LIBSVR')
dump <- sg('svr_tube_epsilon', tube_epsilon)
dump <- sg('c', C)
dump <- sg('train_regression')
dump <- sg('set_features', 'TEST', fm_test)
result <- sg('classify')
# In this example a support vector regression algorithm is trained on a
# real-valued toy data set. The underlying library used for the SVR training is
# SVM^light. The SVR is trained with regularization parameter C=10 and a Gaussian
# kernel with width=2.1.
#
# For more details on the SVM^light see
# T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
# Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
library("sg")
size_cache <- 10
C <- 10
tube_epsilon <- 1e-2
width <- 2.1
fm_train <- t(as.matrix(read.table('../data/fm_train_real.dat')))
fm_test <- t(as.matrix(read.table('../data/fm_test_real.dat')))
label_train <- as.real(as.matrix(read.table('../data/label_train_twoclass.dat')))
# SVR Light
dosvrlight <- function()
{
print('SVRLight')
dump <- sg('set_features', 'TRAIN', fm_train)
dump <- sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
dump <- sg('set_labels', 'TRAIN', label_train)
dump <- sg('new_regression', 'SVRLIGHT')
dump <- sg('svr_tube_epsilon', tube_epsilon)
dump <- sg('c', C)
dump <- sg('train_regression')
dump <- sg('set_features', 'TEST', fm_test)
result <- sg('classify')
}
try(dosvrlight())