Template class StringFeatures implements a list of strings.
As this class is a template, the underlying storage type is quite arbitrary and not limited to character strings; it could also be sequences of floating point numbers etc. Strings differ from matrices (cf. CSimpleFeatures) in that the dimensionality of the feature vectors (i.e. the strings) is not fixed; it may vary between strings.
Most string kernels require StringFeatures, but a number of them actually require the strings to have the same length.
When preprocessors are attached to string features they may shorten the string, but are not allowed to return strings longer than max_string_length, as some algorithms depend on this.
Also note that string features cannot currently be computed on-the-fly.
Definition at line 83 of file StringFeatures.h.
Public Member Functions | |
CStringFeatures () | |
CStringFeatures (EAlphabet alpha) | |
CStringFeatures (TString< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length, EAlphabet alpha) | |
CStringFeatures (TString< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length, CAlphabet *alpha) | |
CStringFeatures (CAlphabet *alpha) | |
CStringFeatures (const CStringFeatures &orig) | |
CStringFeatures (CFile *loader, EAlphabet alpha=DNA) | |
virtual | ~CStringFeatures () |
virtual void | cleanup () |
virtual void | cleanup_feature_vector (int32_t num) |
virtual EFeatureClass | get_feature_class () |
virtual EFeatureType | get_feature_type () |
CAlphabet * | get_alphabet () |
virtual CFeatures * | duplicate () const |
void | get_feature_vector (ST **dst, int32_t *len, int32_t num) |
void | set_feature_vector (ST *src, int32_t len, int32_t num) |
void | enable_on_the_fly_preprocessing () |
void | disable_on_the_fly_preprocessing () |
ST * | get_feature_vector (int32_t num, int32_t &len, bool &dofree) |
CStringFeatures< ST > * | get_transposed () |
TString< ST > * | get_transposed (int32_t &num_feat, int32_t &num_vec) |
void | free_feature_vector (ST *feat_vec, int32_t num, bool dofree) |
virtual ST | get_feature (int32_t vec_num, int32_t feat_num) |
virtual int32_t | get_vector_length (int32_t vec_num) |
virtual int32_t | get_max_vector_length () |
virtual int32_t | get_num_vectors () |
floatmax_t | get_num_symbols () |
floatmax_t | get_max_num_symbols () |
floatmax_t | get_original_num_symbols () |
int32_t | get_order () |
ST | get_masked_symbols (ST symbol, uint8_t mask) |
ST | shift_offset (ST offset, int32_t amount) |
ST | shift_symbol (ST symbol, int32_t amount) |
virtual void | load (CFile *loader) |
void | load_ascii_file (char *fname, bool remap_to_bin=true, EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA) |
bool | load_fasta_file (const char *fname, bool ignore_invalid=false) |
bool | load_fastq_file (const char *fname, bool ignore_invalid=false, bool bitremap_in_single_string=false) |
bool | load_from_directory (char *dirname) |
bool | set_features (TString< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length) |
bool | append_features (CStringFeatures< ST > *sf) |
bool | append_features (TString< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length) |
virtual TString< ST > * | get_features (int32_t &num_str, int32_t &max_str_len) |
virtual TString< ST > * | copy_features (int32_t &num_str, int32_t &max_str_len) |
virtual void | get_features (TString< ST > **dst, int32_t *num_str) |
virtual void | save (CFile *writer) |
virtual bool | load_compressed (char *src, bool decompress) |
virtual bool | save_compressed (char *dest, E_COMPRESSION_TYPE compression, int level) |
virtual int32_t | get_size () |
virtual bool | apply_preproc (bool force_preprocessing=false) |
int32_t | obtain_by_sliding_window (int32_t window_size, int32_t step_size, int32_t skip=0) |
int32_t | obtain_by_position_list (int32_t window_size, CDynamicArray< int32_t > *positions, int32_t skip=0) |
bool | obtain_from_char (CStringFeatures< char > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
template<class CT > | |
bool | obtain_from_char_features (CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
bool | have_same_length (int32_t len=-1) |
void | embed_features (int32_t p_order) |
void | compute_symbol_mask_table (int64_t max_val) |
void | unembed_word (ST word, uint8_t *seq, int32_t len) |
ST | embed_word (ST *seq, int32_t len) |
void | determine_maximum_string_length () |
virtual void | set_feature_vector (int32_t num, ST *string, int32_t len) |
virtual void | get_histogram (float64_t **hist, int32_t *rows, int32_t *cols, bool normalize=true) |
virtual void | create_random (float64_t *hist, int32_t rows, int32_t cols, int32_t num_vec) |
virtual const char * | get_name () const |
Static Public Member Functions | |
static ST * | get_zero_terminated_string_copy (TString< ST > str) |
Protected Member Functions | |
virtual ST * | compute_feature_vector (int32_t num, int32_t &len) |
Protected Attributes | |
CAlphabet * | alphabet |
alphabet | |
int32_t | num_vectors |
number of string vectors | |
TString< ST > * | features |
this contains the array of features. | |
ST * | single_string |
true when single string / created by sliding window | |
int32_t | length_of_single_string |
length of prior single string | |
int32_t | max_string_length |
length of longest string | |
floatmax_t | num_symbols |
number of used symbols | |
floatmax_t | original_num_symbols |
original number of used symbols (before higher order mapping) | |
int32_t | order |
order used in higher order mapping | |
ST * | symbol_mask_table |
symbol mask table (required to access bit-based symbols) | |
bool | preprocess_on_get |
preprocess on-the-fly? | |
CCache< ST > * | feature_cache |
CStringFeatures | ( | ) |
default constructor
Definition at line 89 of file StringFeatures.h.
CStringFeatures | ( | EAlphabet | alpha | ) |
constructor
alpha | alphabet (type) to use for string features |
Definition at line 102 of file StringFeatures.h.
CStringFeatures | ( | TString< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length, | |||
EAlphabet | alpha | |||
) |
constructor
p_features | new features | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length | |
alpha | alphabet (type) to use for string features |
Definition at line 123 of file StringFeatures.h.
CStringFeatures | ( | TString< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length, | |||
CAlphabet * | alpha | |||
) |
constructor
p_features | new features | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length | |
alpha | an actual alphabet |
Definition at line 146 of file StringFeatures.h.
CStringFeatures | ( | CAlphabet * | alpha | ) |
constructor
alpha | alphabet to use for string features |
Definition at line 166 of file StringFeatures.h.
CStringFeatures | ( | const CStringFeatures< ST > & | orig | ) |
copy constructor
Definition at line 182 of file StringFeatures.h.
CStringFeatures | ( | CFile * | loader, | |
EAlphabet | alpha = DNA | |||
) |
constructor
loader | File object via which to load data | |
alpha | alphabet (type) to use for string features |
Definition at line 224 of file StringFeatures.h.
virtual ~CStringFeatures | ( | ) | [virtual] |
Definition at line 238 of file StringFeatures.h.
bool append_features | ( | CStringFeatures< ST > * | sf | ) |
append features
sf | features to append |
Definition at line 1083 of file StringFeatures.h.
bool append_features | ( | TString< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length | |||
) |
append features
p_features | features to append | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length |
note that p_features will be delete[]'d on success
Definition at line 1109 of file StringFeatures.h.
virtual bool apply_preproc | ( | bool | force_preprocessing = false |
) | [virtual] |
apply preprocessor
force_preprocessing | if preprocessing shall be forced |
Definition at line 1381 of file StringFeatures.h.
virtual void cleanup | ( | ) | [virtual] |
cleanup string features
Reimplemented in CStringFileFeatures< ST >.
Definition at line 246 of file StringFeatures.h.
virtual void cleanup_feature_vector | ( | int32_t | num | ) | [virtual] |
cleanup a single feature vector
Reimplemented in CStringFileFeatures< ST >.
Definition at line 276 of file StringFeatures.h.
virtual ST* compute_feature_vector | ( | int32_t | num, | |
int32_t & | len | |||
) | [protected, virtual] |
compute feature vector for sample num; if target is set, the vector is written to target; len is returned by reference
default implementation returns
num | which vector | |
len | length of vector |
Definition at line 1968 of file StringFeatures.h.
void compute_symbol_mask_table | ( | int64_t | max_val | ) |
compute symbol mask table
required to access bit-based symbols
Definition at line 1694 of file StringFeatures.h.
virtual TString<ST>* copy_features | ( | int32_t & | num_str, | |
int32_t & | max_str_len | |||
) | [virtual] |
copy_features
num_str | number of strings (returned) | |
max_str_len | maximal string length (returned) |
Definition at line 1178 of file StringFeatures.h.
virtual void create_random | ( | float64_t * | hist, | |
int32_t | rows, | |||
int32_t | cols, | |||
int32_t | num_vec | |||
) | [virtual] |
create some random strings based on normalized histogram
Definition at line 1849 of file StringFeatures.h.
void determine_maximum_string_length | ( | ) |
determine new maximum string length
Definition at line 1760 of file StringFeatures.h.
void disable_on_the_fly_preprocessing | ( | ) |
call this to disable on the fly feature preprocessing on get_feature_vector. Useful when you manually apply preprocessors.
Definition at line 380 of file StringFeatures.h.
virtual CFeatures* duplicate | ( | ) | const [virtual] |
duplicate feature object
Implements CFeatures.
Definition at line 313 of file StringFeatures.h.
void embed_features | ( | int32_t | p_order | ) |
embed string features in bit representation in-place
Definition at line 1639 of file StringFeatures.h.
ST embed_word | ( | ST * | seq, | |
int32_t | len | |||
) |
embed a single word
seq | sequence of size len in a bitfield | |
len | length of the sequence seq |
Definition at line 1745 of file StringFeatures.h.
void enable_on_the_fly_preprocessing | ( | ) |
call this to preprocess string features upon get_feature_vector
Definition at line 372 of file StringFeatures.h.
void free_feature_vector | ( | ST * | feat_vec, | |
int32_t | num, | |||
bool | dofree | |||
) |
free feature vector
feat_vec | feature vector to free | |
num | index in feature cache | |
dofree | if vector should be really deleted |
Definition at line 489 of file StringFeatures.h.
CAlphabet* get_alphabet | ( | ) |
get alphabet used in string features
Definition at line 303 of file StringFeatures.h.
virtual ST get_feature | ( | int32_t | vec_num, | |
int32_t | feat_num | |||
) | [virtual] |
get feature
vec_num | which vector | |
feat_num | which feature |
Definition at line 504 of file StringFeatures.h.
virtual EFeatureClass get_feature_class | ( | ) | [virtual] |
get feature class
Implements CFeatures.
Definition at line 291 of file StringFeatures.h.
virtual EFeatureType get_feature_type | ( | ) | [virtual] |
get feature type
Implements CFeatures.
Definition at line 297 of file StringFeatures.h.
ST* get_feature_vector | ( | int32_t | num, | |
int32_t & | len, | |||
bool & | dofree | |||
) |
get feature vector for sample num
num | index of feature vector | |
len | length is returned by reference | |
dofree | whether returned vector must be freed by caller via free_feature_vector |
Definition at line 393 of file StringFeatures.h.
void get_feature_vector | ( | ST ** | dst, | |
int32_t * | len, | |||
int32_t | num | |||
) |
get string for selected example num
dst | destination where vector will be stored | |
len | number of features in vector | |
num | index of the string |
Definition at line 324 of file StringFeatures.h.
virtual TString<ST>* get_features | ( | int32_t & | num_str, | |
int32_t & | max_str_len | |||
) | [virtual] |
get_features
num_str | number of strings (returned) | |
max_str_len | maximal string length (returned) |
Definition at line 1165 of file StringFeatures.h.
virtual void get_features | ( | TString< ST > ** | dst, | |
int32_t * | num_str | |||
) | [virtual] |
get_features (swig compatible)
dst | string features (returned) | |
num_str | number of strings (returned) |
Definition at line 1205 of file StringFeatures.h.
virtual void get_histogram | ( | float64_t ** | hist, | |
int32_t * | rows, | |||
int32_t * | cols, | |||
bool | normalize = true | |||
) | [virtual] |
compute histogram over strings
Definition at line 1804 of file StringFeatures.h.
ST get_masked_symbols | ( | ST | symbol, | |
uint8_t | mask | |||
) |
a higher order mapped symbol will be shaped such that the symbols specified by bits in the mask will be returned.
symbol | symbol to mask | |
mask | mask to apply |
Definition at line 583 of file StringFeatures.h.
floatmax_t get_max_num_symbols | ( | ) |
get maximum number of symbols
Note: floatmax_t sounds weird, but int64_t is not long enough (and there is no int128_t type)
Definition at line 560 of file StringFeatures.h.
virtual int32_t get_max_vector_length | ( | ) | [virtual] |
get maximum vector length
Definition at line 534 of file StringFeatures.h.
virtual const char* get_name | ( | void | ) | const [virtual] |
floatmax_t get_num_symbols | ( | ) |
get number of symbols
Note: floatmax_t sounds weird, but LONG is not long enough
Definition at line 551 of file StringFeatures.h.
virtual int32_t get_num_vectors | ( | ) | [virtual] |
get number of vectors
Implements CFeatures.
Definition at line 543 of file StringFeatures.h.
int32_t get_order | ( | ) |
floatmax_t get_original_num_symbols | ( | ) |
number of symbols before higher order mapping
Definition at line 568 of file StringFeatures.h.
virtual int32_t get_size | ( | ) | [virtual] |
get memory footprint of one feature
Implements CFeatures.
Definition at line 1374 of file StringFeatures.h.
TString<ST>* get_transposed | ( | int32_t & | num_feat, | |
int32_t & | num_vec | |||
) |
compute and return the transpose of the string features matrix, which will be preprocessed. num_feat and num_vec are returned by reference; the caller has to clean up
note that strings all have to have same length
num_feat | number of features in matrix | |
num_vec | number of vectors in matrix |
Definition at line 452 of file StringFeatures.h.
CStringFeatures<ST>* get_transposed | ( | ) |
get a transposed copy of the features
Definition at line 432 of file StringFeatures.h.
virtual int32_t get_vector_length | ( | int32_t | vec_num | ) | [virtual] |
get vector length
vec_num | which vector |
Definition at line 521 of file StringFeatures.h.
static ST* get_zero_terminated_string_copy | ( | TString< ST > | str | ) | [static] |
get a zero terminated copy of the string
str | the string to copy |
note that this function is only sensible for character strings
Definition at line 1775 of file StringFeatures.h.
bool have_same_length | ( | int32_t | len = -1 |
) |
check if length of each vector in this feature object equals the given length.
len | vector length to check against |
Definition at line 1617 of file StringFeatures.h.
virtual void load | ( | CFile * | loader | ) | [virtual] |
load features from file
loader | File object via which to load data |
Reimplemented from CFeatures.
void load_ascii_file | ( | char * | fname, | |
bool | remap_to_bin = true , |
|||
EAlphabet | ascii_alphabet = DNA , |
|||
EAlphabet | binary_alphabet = RAWDNA | |||
) |
load ascii line-based string features from file
fname | filename to load from | |
remap_to_bin | if translation to other binary alphabet should be performed | |
ascii_alphabet | src alphabet | |
binary_alphabet | alphabet to translate to |
Definition at line 627 of file StringFeatures.h.
virtual bool load_compressed | ( | char * | src, | |
bool | decompress | |||
) | [virtual] |
load compressed features from file
src | filename to load from | |
decompress | whether to decompress on loading |
Definition at line 1225 of file StringFeatures.h.
bool load_fasta_file | ( | const char * | fname, | |
bool | ignore_invalid = false | |||
) |
load fasta file as string features
fname | filename to load from | |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A |
Definition at line 766 of file StringFeatures.h.
bool load_fastq_file | ( | const char * | fname, | |
bool | ignore_invalid = false , |
|||
bool | bitremap_in_single_string = false | |||
) |
load fastq file as string features
fname | filename to load from | |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A | |
bitremap_in_single_string | if set to true, do binary embedding of symbols |
Definition at line 865 of file StringFeatures.h.
bool load_from_directory | ( | char * | dirname | ) |
load features from directory
dirname | directory name to load from |
Definition at line 970 of file StringFeatures.h.
int32_t obtain_by_position_list | ( | int32_t | window_size, | |
CDynamicArray< int32_t > * | positions, | |||
int32_t | skip = 0 | |||
) |
extracts windows of size window_size from first string using the positions in list
window_size | window size | |
positions | positions | |
skip | skip |
Definition at line 1456 of file StringFeatures.h.
int32_t obtain_by_sliding_window | ( | int32_t | window_size, | |
int32_t | step_size, | |||
int32_t | skip = 0 | |||
) |
slides a window of size window_size over the current single string; step_size is the amount by which the window is shifted. creates (string_len-window_size)/step_size many feature objects. if skip is nonzero, skip the first 'skip' characters of each string
window_size | window size | |
step_size | step size | |
skip | skip |
Definition at line 1414 of file StringFeatures.h.
bool obtain_from_char | ( | CStringFeatures< char > * | sf, | |
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | gap, | |||
bool | rev | |||
) |
obtain string features from char features
wrapper for template method
sf | string features | |
start | start | |
p_order | order | |
gap | gap | |
rev | reverse |
Definition at line 1521 of file StringFeatures.h.
bool obtain_from_char_features | ( | CStringFeatures< CT > * | sf, | |
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | gap, | |||
bool | rev | |||
) |
template obtain from char features
sf | string features | |
start | start | |
p_order | order | |
gap | gap | |
rev | reverse |
Definition at line 1536 of file StringFeatures.h.
virtual void save | ( | CFile * | writer | ) | [virtual] |
save features to file
writer | File object via which to save data |
Reimplemented from CFeatures.
virtual bool save_compressed | ( | char * | dest, | |
E_COMPRESSION_TYPE | compression, | |||
int | level | |||
) | [virtual] |
save compressed features to file
dest | filename to save to | |
compression | compressor to use | |
level | compression level to use (1-9) |
Definition at line 1312 of file StringFeatures.h.
void set_feature_vector | ( | ST * | src, | |
int32_t | len, | |||
int32_t | num | |||
) |
set string for selected example num
src | source vector that the string will be set from | |
len | number of features in vector | |
num | index of the string |
Definition at line 349 of file StringFeatures.h.
virtual void set_feature_vector | ( | int32_t | num, | |
ST * | string, | |||
int32_t | len | |||
) | [virtual] |
set feature vector for sample num
num | index of feature vector | |
string | string with the feature vector's content | |
len | length of the string |
Definition at line 1790 of file StringFeatures.h.
bool set_features | ( | TString< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length | |||
) |
set features
p_features | new features | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length |
Definition at line 1044 of file StringFeatures.h.
ST shift_offset | ( | ST | offset, | |
int32_t | amount | |||
) |
shift offset to the left by amount
offset | offset to shift | |
amount | amount to shift the offset |
Definition at line 595 of file StringFeatures.h.
ST shift_symbol | ( | ST | symbol, | |
int32_t | amount | |||
) |
shift symbol to the right by amount (taking care of custom symbol sizes)
symbol | symbol to shift | |
amount | amount to shift the symbol |
Definition at line 607 of file StringFeatures.h.
void unembed_word | ( | ST | word, | |
uint8_t * | seq, | |||
int32_t | len | |||
) |
remap bit-based word to character sequence
word | word to remap | |
seq | sequence of size len that remapped characters are written to | |
len | length of sequence and word |
Definition at line 1724 of file StringFeatures.h.
alphabet
Definition at line 2014 of file StringFeatures.h.
CCache<ST>* feature_cache [protected] |
feature cache
Definition at line 2047 of file StringFeatures.h.
this contains the array of features.
Definition at line 2020 of file StringFeatures.h.
int32_t length_of_single_string [protected] |
length of prior single string
Definition at line 2026 of file StringFeatures.h.
int32_t max_string_length [protected] |
length of longest string
Definition at line 2029 of file StringFeatures.h.
floatmax_t num_symbols [protected] |
number of used symbols
Definition at line 2032 of file StringFeatures.h.
int32_t num_vectors [protected] |
number of string vectors
Definition at line 2017 of file StringFeatures.h.
int32_t order [protected] |
order used in higher order mapping
Definition at line 2038 of file StringFeatures.h.
floatmax_t original_num_symbols [protected] |
original number of used symbols (before higher order mapping)
Definition at line 2035 of file StringFeatures.h.
bool preprocess_on_get [protected] |
preprocess on-the-fly?
Definition at line 2044 of file StringFeatures.h.
ST* single_string [protected] |
true when single string / created by sliding window
Definition at line 2023 of file StringFeatures.h.
ST* symbol_mask_table [protected] |
symbol mask table (required to access bit-based symbols)
Definition at line 2041 of file StringFeatures.h.