The CommWordString kernel may be used to compute the spectrum kernel from strings that have been mapped into unsigned 16bit integers.
These 16bit integers correspond to k-mers. To be applicable in this kernel they need to be sorted (e.g. via the SortWordString pre-processor).
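It basically uses the algorithm in the unix "comm" command (hence the name) to compute:
\[ k({\bf x},{\bf x'}) = \Phi_k({\bf x}) \cdot \Phi_k({\bf x'}) \]
where \(\Phi_k\) maps a sequence \({\bf x}\) that consists of letters in \(\Sigma\) to a feature vector of size \(|\Sigma|^k\). In this feature vector each entry denotes how often the corresponding k-mer appears in \({\bf x}\).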
Note that this representation is especially tuned to small alphabets (like the 2-bit alphabet DNA), for which it enables spectrum kernels of order up to 8.
For this kernel the linadd speedups are quite efficiently implemented using direct maps.
Definition at line 46 of file CommWordStringKernel.h.
Public Member Functions | |
CCommWordStringKernel () | |
CCommWordStringKernel (int32_t size, bool use_sign) | |
CCommWordStringKernel (CStringFeatures< uint16_t > *l, CStringFeatures< uint16_t > *r, bool use_sign=false, int32_t size=10) | |
virtual | ~CCommWordStringKernel () |
virtual bool | init (CFeatures *l, CFeatures *r) |
virtual void | cleanup () |
virtual EKernelType | get_kernel_type () |
virtual const char * | get_name () const |
virtual bool | init_dictionary (int32_t size) |
virtual bool | init_optimization (int32_t count, int32_t *IDX, float64_t *weights) |
virtual bool | delete_optimization () |
virtual float64_t | compute_optimized (int32_t idx) |
virtual void | add_to_normal (int32_t idx, float64_t weight) |
virtual void | clear_normal () |
virtual EFeatureType | get_feature_type () |
void | get_dictionary (int32_t &dsize, float64_t *&dweights) |
virtual float64_t * | compute_scoring (int32_t max_degree, int32_t &num_feat, int32_t &num_sym, float64_t *target, int32_t num_suppvec, int32_t *IDX, float64_t *alphas, bool do_init=true) |
char * | compute_consensus (int32_t &num_feat, int32_t num_suppvec, int32_t *IDX, float64_t *alphas) |
void | set_use_dict_diagonal_optimization (bool flag) |
bool | get_use_dict_diagonal_optimization () |
Protected Member Functions | |
virtual float64_t | compute (int32_t idx_a, int32_t idx_b) |
virtual float64_t | compute_helper (int32_t idx_a, int32_t idx_b, bool do_sort) |
virtual float64_t | compute_diag (int32_t idx_a) |
Protected Attributes | |
int32_t | dictionary_size |
float64_t * | dictionary_weights |
bool | use_sign |
bool | use_dict_diagonal_optimization |
int32_t * | dict_diagonal_optimization |
Friends | |
class | CVarianceKernelNormalizer |
class | CSqrtDiagKernelNormalizer |
class | CAvgDiagKernelNormalizer |
class | CRidgeKernelNormalizer |
class | CFirstElementKernelNormalizer |
class | CTanimotoKernelNormalizer |
class | CDiceKernelNormalizer |
CCommWordStringKernel | ( | ) |
default constructor
Definition at line 22 of file CommWordStringKernel.cpp.
CCommWordStringKernel | ( | int32_t | size, | |
bool | use_sign | |||
) |
constructor
size | cache size | |
use_sign | if sign shall be used |
Definition at line 28 of file CommWordStringKernel.cpp.
CCommWordStringKernel | ( | CStringFeatures< uint16_t > * | l, | |
CStringFeatures< uint16_t > * | r, | |||
bool | use_sign = false , |
|||
int32_t | size = 10 | |||
) |
constructor
l | features of left-hand side | |
r | features of right-hand side | |
use_sign | if sign shall be used | |
size | cache size |
Definition at line 35 of file CommWordStringKernel.cpp.
~CCommWordStringKernel | ( | ) | [virtual] |
Definition at line 57 of file CommWordStringKernel.cpp.
void add_to_normal | ( | int32_t | idx, | |
float64_t | weight | |||
) | [virtual] |
add to normal
idx | where to add | |
weight | what to add |
Reimplemented from CKernel.
Reimplemented in CWeightedCommWordStringKernel.
Definition at line 241 of file CommWordStringKernel.cpp.
void cleanup | ( | ) | [virtual] |
clean up kernel
Reimplemented from CKernel.
Reimplemented in CWeightedCommWordStringKernel.
Definition at line 79 of file CommWordStringKernel.cpp.
void clear_normal | ( | ) | [virtual] |
virtual float64_t compute | ( | int32_t | idx_a, | |
int32_t | idx_b | |||
) | [protected, virtual] |
compute kernel function for features a and b; idx_{a,b} denote the indices of the feature vectors in the corresponding feature object
idx_a | index a | |
idx_b | index b |
Implements CKernel.
Definition at line 215 of file CommWordStringKernel.h.
char * compute_consensus | ( | int32_t & | num_feat, | |
int32_t | num_suppvec, | |||
int32_t * | IDX, | |||
float64_t * | alphas | |||
) |
compute consensus
num_feat | number of features | |
num_suppvec | number of support vectors | |
IDX | IDX | |
alphas | alphas |
Definition at line 498 of file CommWordStringKernel.cpp.
float64_t compute_diag | ( | int32_t | idx_a | ) | [protected, virtual] |
helper to compute only diagonal normalization for training
idx_a | index a |
Definition at line 85 of file CommWordStringKernel.cpp.
float64_t compute_helper | ( | int32_t | idx_a, | |
int32_t | idx_b, | |||
bool | do_sort | |||
) | [protected, virtual] |
helper for compute
idx_a | index a | |
idx_b | index b | |
do_sort | if sorting shall be performed |
Reimplemented in CWeightedCommWordStringKernel.
Definition at line 129 of file CommWordStringKernel.cpp.
float64_t compute_optimized | ( | int32_t | idx | ) | [virtual] |
compute optimized
idx | index to compute |
Reimplemented from CKernel.
Reimplemented in CWeightedCommWordStringKernel.
Definition at line 326 of file CommWordStringKernel.cpp.
float64_t * compute_scoring | ( | int32_t | max_degree, | |
int32_t & | num_feat, | |||
int32_t & | num_sym, | |||
float64_t * | target, | |||
int32_t | num_suppvec, | |||
int32_t * | IDX, | |||
float64_t * | alphas, | |||
bool | do_init = true | |||
) | [virtual] |
compute scoring
max_degree | maximum degree | |
num_feat | number of features | |
num_sym | number of symbols | |
target | target | |
num_suppvec | number of support vectors | |
IDX | IDX | |
alphas | alphas | |
do_init | if initialization shall be performed |
Reimplemented in CWeightedCommWordStringKernel.
Definition at line 375 of file CommWordStringKernel.cpp.
bool delete_optimization | ( | ) | [virtual] |
delete optimization
Reimplemented from CKernel.
Definition at line 318 of file CommWordStringKernel.cpp.
void get_dictionary | ( | int32_t & | dsize, | |
float64_t *& | dweights | |||
) |
get dictionary
dsize | dictionary size will be stored in here | |
dweights | dictionary weights will be stored in here |
Definition at line 153 of file CommWordStringKernel.h.
virtual EFeatureType get_feature_type | ( | ) | [virtual] |
return feature type the kernel can deal with
Reimplemented from CStringKernel< uint16_t >.
Reimplemented in CWeightedCommWordStringKernel.
Definition at line 146 of file CommWordStringKernel.h.
virtual EKernelType get_kernel_type | ( | ) | [virtual] |
return what type of kernel we are
Implements CStringKernel< uint16_t >.
Reimplemented in CWeightedCommWordStringKernel.
Definition at line 95 of file CommWordStringKernel.h.
virtual const char* get_name | ( | void | ) | const [virtual] |
return the kernel's name
Reimplemented from CStringKernel< uint16_t >.
Reimplemented in CWeightedCommWordStringKernel.
Definition at line 101 of file CommWordStringKernel.h.
bool get_use_dict_diagonal_optimization | ( | ) |
get_use_dict_diagonal_optimization
Definition at line 201 of file CommWordStringKernel.h.
bool init | ( | CFeatures * | l, | |
CFeatures * | r | |||
) | [virtual] |
initialize kernel
l | features of left-hand side | |
r | features of right-hand side |
Reimplemented from CStringKernel< uint16_t >.
Reimplemented in CWeightedCommWordStringKernel.
Definition at line 65 of file CommWordStringKernel.cpp.
bool init_dictionary | ( | int32_t | size | ) | [virtual] |
bool init_optimization | ( | int32_t | count, | |
int32_t * | IDX, | |||
float64_t * | weights | |||
) | [virtual] |
initialize optimization
count | count | |
IDX | index | |
weights | weights |
Reimplemented from CKernel.
Definition at line 292 of file CommWordStringKernel.cpp.
void set_use_dict_diagonal_optimization | ( | bool | flag | ) |
set_use_dict_diagonal_optimization
flag | enable diagonal optimization |
Definition at line 192 of file CommWordStringKernel.h.
friend class CAvgDiagKernelNormalizer [friend] |
Reimplemented from CKernel.
Definition at line 50 of file CommWordStringKernel.h.
friend class CDiceKernelNormalizer [friend] |
Reimplemented from CKernel.
Definition at line 54 of file CommWordStringKernel.h.
friend class CFirstElementKernelNormalizer [friend] |
Reimplemented from CKernel.
Definition at line 52 of file CommWordStringKernel.h.
friend class CRidgeKernelNormalizer [friend] |
Reimplemented from CKernel.
Definition at line 51 of file CommWordStringKernel.h.
friend class CSqrtDiagKernelNormalizer [friend] |
Reimplemented from CKernel.
Definition at line 49 of file CommWordStringKernel.h.
friend class CTanimotoKernelNormalizer [friend] |
Reimplemented from CKernel.
Definition at line 53 of file CommWordStringKernel.h.
friend class CVarianceKernelNormalizer [friend] |
Reimplemented from CKernel.
Definition at line 48 of file CommWordStringKernel.h.
int32_t* dict_diagonal_optimization [protected] |
array to hold counters for all strings
Definition at line 253 of file CommWordStringKernel.h.
int32_t dictionary_size [protected] |
size of dictionary (number of possible strings)
Definition at line 242 of file CommWordStringKernel.h.
float64_t* dictionary_weights [protected] |
dictionary weights - array to hold counters for all possible strings
Definition at line 245 of file CommWordStringKernel.h.
bool use_dict_diagonal_optimization [protected] |
whether diagonal optimization shall be used
Definition at line 251 of file CommWordStringKernel.h.
bool use_sign [protected] |
if sign shall be used
Definition at line 248 of file CommWordStringKernel.h.