Template class StringFeatures implements a list of strings.
As this class is a template, the underlying storage type is quite arbitrary and not limited to character strings; it could also be sequences of floating point numbers, etc. Strings differ from matrices (cf. CSimpleFeatures) in that the dimensionality of the feature vectors (i.e. the strings) is not fixed; it may vary between strings.
Most string kernels require StringFeatures, but a number of them actually require the strings to have the same length.
When preprocessors are attached to string features they may shorten the string, but are not allowed to return strings longer than max_string_length, as some algorithms depend on this.
Also note that string features cannot currently be computed on-the-fly.
(Partial) subset access is supported for this feature type. Simply use the (inherited) set_subset() and remove_subset() functions. Once a subset is set, all calls that work with features are translated to the subset. See the comments on each method to find out whether subsets are supported for that method.
Definition at line 90 of file StringFeatures.h.
Public Member Functions | |
CStringFeatures () | |
CStringFeatures (EAlphabet alpha) | |
CStringFeatures (SGStringList< ST > string_list, EAlphabet alpha) | |
CStringFeatures (SGStringList< ST > string_list, CAlphabet *alpha) | |
CStringFeatures (CAlphabet *alpha) | |
CStringFeatures (const CStringFeatures &orig) | |
CStringFeatures (CFile *loader, EAlphabet alpha=DNA) | |
virtual | ~CStringFeatures () |
virtual void | cleanup () |
virtual void | cleanup_feature_vector (int32_t num) |
virtual EFeatureClass | get_feature_class () |
virtual EFeatureType | get_feature_type () |
CAlphabet * | get_alphabet () |
virtual CFeatures * | duplicate () const |
SGVector< ST > | get_feature_vector (int32_t num) |
void | set_feature_vector (SGVector< ST > vector, int32_t num) |
void | enable_on_the_fly_preprocessing () |
void | disable_on_the_fly_preprocessing () |
ST * | get_feature_vector (int32_t num, int32_t &len, bool &dofree) |
CStringFeatures< ST > * | get_transposed () |
SGString< ST > * | get_transposed (int32_t &num_feat, int32_t &num_vec) |
void | free_feature_vector (ST *feat_vec, int32_t num, bool dofree) |
void | free_feature_vector (SGVector< ST > feat_vec, int32_t num) |
virtual ST | get_feature (int32_t vec_num, int32_t feat_num) |
virtual int32_t | get_vector_length (int32_t vec_num) |
virtual int32_t | get_max_vector_length () |
virtual int32_t | get_num_vectors () const |
floatmax_t | get_num_symbols () |
floatmax_t | get_max_num_symbols () |
floatmax_t | get_original_num_symbols () |
int32_t | get_order () |
ST | get_masked_symbols (ST symbol, uint8_t mask) |
ST | shift_offset (ST offset, int32_t amount) |
ST | shift_symbol (ST symbol, int32_t amount) |
virtual void | load (CFile *loader) |
void | load_ascii_file (char *fname, bool remap_to_bin=true, EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA) |
bool | load_fasta_file (const char *fname, bool ignore_invalid=false) |
bool | load_fastq_file (const char *fname, bool ignore_invalid=false, bool bitremap_in_single_string=false) |
bool | load_from_directory (char *dirname) |
void | set_features (SGStringList< ST > feats) |
bool | set_features (SGString< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length) |
bool | append_features (CStringFeatures< ST > *sf) |
bool | append_features (SGString< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length) |
SGStringList< ST > | get_features () |
virtual SGString< ST > * | get_features (int32_t &num_str, int32_t &max_str_len) |
virtual SGString< ST > * | copy_features (int32_t &num_str, int32_t &max_str_len) |
virtual void | get_features (SGString< ST > **dst, int32_t *num_str) |
virtual void | save (CFile *writer) |
virtual bool | load_compressed (char *src, bool decompress) |
virtual bool | save_compressed (char *dest, E_COMPRESSION_TYPE compression, int level) |
virtual int32_t | get_size () |
virtual bool | apply_preprocessor (bool force_preprocessing=false) |
int32_t | obtain_by_sliding_window (int32_t window_size, int32_t step_size, int32_t skip=0) |
int32_t | obtain_by_position_list (int32_t window_size, CDynamicArray< int32_t > *positions, int32_t skip=0) |
bool | obtain_from_char (CStringFeatures< char > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
template<class CT > | |
bool | obtain_from_char_features (CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
bool | have_same_length (int32_t len=-1) |
void | embed_features (int32_t p_order) |
void | compute_symbol_mask_table (int64_t max_val) |
void | unembed_word (ST word, uint8_t *seq, int32_t len) |
ST | embed_word (ST *seq, int32_t len) |
void | determine_maximum_string_length () |
virtual void | set_feature_vector (int32_t num, ST *string, int32_t len) |
virtual void | get_histogram (float64_t **hist, int32_t *rows, int32_t *cols, bool normalize=true) |
virtual void | create_random (float64_t *hist, int32_t rows, int32_t cols, int32_t num_vec) |
virtual CFeatures * | copy_subset (SGVector< index_t > indices) |
virtual const char * | get_name () const |
virtual void | subset_changed_post () |
Static Public Member Functions | |
static ST * | get_zero_terminated_string_copy (SGString< ST > str) |
Protected Member Functions | |
virtual ST * | compute_feature_vector (int32_t num, int32_t &len) |
Protected Attributes | |
CAlphabet * | alphabet |
int32_t | num_vectors |
SGString< ST > * | features |
ST * | single_string |
int32_t | length_of_single_string |
length of prior single string | |
int32_t | max_string_length |
floatmax_t | num_symbols |
number of used symbols | |
floatmax_t | original_num_symbols |
original number of used symbols (before higher order mapping) | |
int32_t | order |
order used in higher order mapping | |
ST * | symbol_mask_table |
symbol mask table (required to access bit-based symbols) | |
bool | preprocess_on_get |
preprocess on-the-fly? | |
CCache< ST > * | feature_cache |
CStringFeatures | ( | ) |
default constructor
Definition at line 96 of file StringFeatures.h.
CStringFeatures | ( | EAlphabet | alpha | ) |
constructor
alpha | alphabet (type) to use for string features |
Definition at line 106 of file StringFeatures.h.
CStringFeatures | ( | SGStringList< ST > | string_list, | |
EAlphabet | alpha | |||
) |
constructor
string_list | ||
alpha | alphabet (type) to use for string features |
Definition at line 120 of file StringFeatures.h.
CStringFeatures | ( | SGStringList< ST > | string_list, | |
CAlphabet * | alpha | |||
) |
constructor
string_list | ||
alpha | an actual alphabet |
Definition at line 136 of file StringFeatures.h.
CStringFeatures | ( | CAlphabet * | alpha | ) |
constructor
alpha | alphabet to use for string features |
Definition at line 152 of file StringFeatures.h.
CStringFeatures | ( | const CStringFeatures< ST > & | orig | ) |
copy constructor
Definition at line 165 of file StringFeatures.h.
CStringFeatures | ( | CFile * | loader, | |
EAlphabet | alpha = DNA | |||
) |
constructor
loader | File object via which to load data | |
alpha | alphabet (type) to use for string features |
Definition at line 209 of file StringFeatures.h.
virtual ~CStringFeatures | ( | ) | [virtual] |
Definition at line 224 of file StringFeatures.h.
bool append_features | ( | CStringFeatures< ST > * | sf | ) |
append features. If the given string features have a subset, only that subset will be copied.
not possible with subset
sf | features to append |
Definition at line 1178 of file StringFeatures.h.
bool append_features | ( | SGString< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length | |||
) |
append features
not possible with subset
p_features | features to append | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length |
note that p_features will be SG_FREE()'d on success
Definition at line 1212 of file StringFeatures.h.
virtual bool apply_preprocessor | ( | bool | force_preprocessing = false |
) | [virtual] |
apply preprocessor
force_preprocessing | if preprocessing shall be forced |
Definition at line 1531 of file StringFeatures.h.
virtual void cleanup | ( | ) | [virtual] |
cleanup string features.
removes any subset before
Reimplemented in CStringFileFeatures< ST >.
Definition at line 236 of file StringFeatures.h.
virtual void cleanup_feature_vector | ( | int32_t | num | ) | [virtual] |
cleanup a single feature vector
possible with subset
num | number of the vector |
Reimplemented in CStringFileFeatures< ST >.
Definition at line 273 of file StringFeatures.h.
virtual ST* compute_feature_vector | ( | int32_t | num, | |
int32_t & | len | |||
) | [protected, virtual] |
compute feature vector for sample num. If target is set, the vector is written to target. len is returned by reference.
possible with subset
num | which vector | |
len | length of vector |
Definition at line 2199 of file StringFeatures.h.
void compute_symbol_mask_table | ( | int64_t | max_val | ) |
compute symbol mask table
required to access bit-based symbols
not implemented for subset
Definition at line 1869 of file StringFeatures.h.
virtual SGString<ST>* copy_features | ( | int32_t & | num_str, | |
int32_t & | max_str_len | |||
) | [virtual] |
copy_features
possible with subset
num_str | number of strings (returned) | |
max_str_len | maximal string length (returned) |
Definition at line 1302 of file StringFeatures.h.
Creates a new CFeatures instance containing copies of the elements which are specified by the provided indices.
possible with subset
indices | indices of feature elements to copy |
Reimplemented from CFeatures.
Definition at line 2150 of file StringFeatures.h.
virtual void create_random | ( | float64_t * | hist, | |
int32_t | rows, | |||
int32_t | cols, | |||
int32_t | num_vec | |||
) | [virtual] |
create some random strings based on normalized histogram
not possible with subset
Definition at line 2040 of file StringFeatures.h.
void determine_maximum_string_length | ( | ) |
determine new maximum string length
possible with subset
Definition at line 1940 of file StringFeatures.h.
void disable_on_the_fly_preprocessing | ( | ) |
call this to disable on the fly feature preprocessing on get_feature_vector. Useful when you manually apply preprocessors.
Definition at line 384 of file StringFeatures.h.
virtual CFeatures* duplicate | ( | ) | const [virtual] |
duplicate feature object
Implements CFeatures.
Definition at line 314 of file StringFeatures.h.
void embed_features | ( | int32_t | p_order | ) |
embed string features in bit representation in-place
not implemented for subset
Definition at line 1809 of file StringFeatures.h.
ST embed_word | ( | ST * | seq, | |
int32_t | len | |||
) |
embed a single word
seq | sequence of size len in a bitfield | |
len |
Definition at line 1923 of file StringFeatures.h.
void enable_on_the_fly_preprocessing | ( | ) |
call this to preprocess string features upon get_feature_vector
Definition at line 376 of file StringFeatures.h.
void free_feature_vector | ( | ST * | feat_vec, | |
int32_t | num, | |||
bool | dofree | |||
) |
free feature vector
possible with subset
feat_vec | feature vector to free | |
num | index in feature cache, possibly from subset | |
dofree | if vector should be really deleted |
Definition at line 508 of file StringFeatures.h.
void free_feature_vector | ( | SGVector< ST > | feat_vec, | |
int32_t | num | |||
) |
free feature vector
possible with subset
feat_vec | feature vector to free | |
num | index in feature cache, possibly from subset |
Definition at line 533 of file StringFeatures.h.
CAlphabet* get_alphabet | ( | ) |
get alphabet used in string features
Definition at line 304 of file StringFeatures.h.
virtual ST get_feature | ( | int32_t | vec_num, | |
int32_t | feat_num | |||
) | [virtual] |
get feature
possible with subset
vec_num | which vector | |
feat_num | which feature, possibly from subset |
Definition at line 559 of file StringFeatures.h.
virtual EFeatureClass get_feature_class | ( | ) | [virtual] |
get feature class
Implements CFeatures.
Definition at line 292 of file StringFeatures.h.
virtual EFeatureType get_feature_type | ( | ) | [virtual] |
get feature type
Implements CFeatures.
Definition at line 298 of file StringFeatures.h.
SGVector<ST> get_feature_vector | ( | int32_t | num | ) |
get string for selected example num
possible with subset
num | index of the string |
Definition at line 325 of file StringFeatures.h.
ST* get_feature_vector | ( | int32_t | num, | |
int32_t & | len, | |||
bool & | dofree | |||
) |
get feature vector for sample num
possible with subset
num | index of feature vector | |
len | length is returned by reference | |
dofree | whether returned vector must be freed by caller via free_feature_vector |
Definition at line 399 of file StringFeatures.h.
SGStringList<ST> get_features | ( | ) |
virtual SGString<ST>* get_features | ( | int32_t & | num_str, | |
int32_t & | max_str_len | |||
) | [virtual] |
get_features
not possible with subset
num_str | number of strings (returned) | |
max_str_len | maximal string length (returned) |
Definition at line 1284 of file StringFeatures.h.
virtual void get_features | ( | SGString< ST > ** | dst, | |
int32_t * | num_str | |||
) | [virtual] |
get_features (swig compatible)
possible with subset
dst | string features (returned) | |
num_str | number of strings (returned) |
Definition at line 1331 of file StringFeatures.h.
virtual void get_histogram | ( | float64_t ** | hist, | |
int32_t * | rows, | |||
int32_t * | cols, | |||
bool | normalize = true | |||
) | [virtual] |
compute histogram over strings
possible with subset
Definition at line 1994 of file StringFeatures.h.
ST get_masked_symbols | ( | ST | symbol, | |
uint8_t | mask | |||
) |
a higher order mapped symbol will be shaped such that the symbols specified by bits in the mask will be returned.
symbol | symbol to mask | |
mask | mask to apply |
Definition at line 646 of file StringFeatures.h.
floatmax_t get_max_num_symbols | ( | ) |
get maximum number of symbols
Note: floatmax_t sounds weird, but int64_t is not long enough (and there is no int128_t type)
Definition at line 623 of file StringFeatures.h.
virtual int32_t get_max_vector_length | ( | ) | [virtual] |
get maximum vector length
this one is updated when a subset is set
Definition at line 597 of file StringFeatures.h.
virtual const char* get_name | ( | void | ) | const [virtual] |
floatmax_t get_num_symbols | ( | ) |
get number of symbols
Note: floatmax_t sounds weird, but LONG is not long enough
Definition at line 614 of file StringFeatures.h.
virtual int32_t get_num_vectors | ( | ) | const [virtual] |
Implements CFeatures.
Definition at line 603 of file StringFeatures.h.
int32_t get_order | ( | ) |
floatmax_t get_original_num_symbols | ( | ) |
number of symbols before higher order mapping
Definition at line 631 of file StringFeatures.h.
virtual int32_t get_size | ( | ) | [virtual] |
get memory footprint of one feature
Implements CFeatures.
Definition at line 1524 of file StringFeatures.h.
CStringFeatures<ST>* get_transposed | ( | ) |
get a transposed copy of the features
possible with subset
Definition at line 443 of file StringFeatures.h.
SGString<ST>* get_transposed | ( | int32_t & | num_feat, | |
int32_t & | num_vec | |||
) |
compute and return the transpose of the string features matrix, which will be preprocessed. num_feat and num_vec are returned by reference; the caller has to clean up.
note that strings all have to have same length
possible with subset
num_feat | number of features in matrix | |
num_vec | number of vectors in matrix |
Definition at line 469 of file StringFeatures.h.
virtual int32_t get_vector_length | ( | int32_t | vec_num | ) | [virtual] |
get vector length
possible with subset
vec_num | which vector, possibly from subset |
Definition at line 580 of file StringFeatures.h.
static ST* get_zero_terminated_string_copy | ( | SGString< ST > | str | ) | [static] |
get a zero terminated copy of the string
str | the string to copy |
note that this function is only sensible for character strings
Definition at line 1958 of file StringFeatures.h.
bool have_same_length | ( | int32_t | len = -1 |
) |
check if the length of each vector in this feature object equals the given length. If a subset exists, only the subset is checked.
possible for subset
len | vector length to check against |
Definition at line 1785 of file StringFeatures.h.
virtual void load | ( | CFile * | loader | ) | [virtual] |
load features from file
loader | File object via which to load data |
Reimplemented from CFeatures.
void load_ascii_file | ( | char * | fname, | |
bool | remap_to_bin = true , |
|||
EAlphabet | ascii_alphabet = DNA , |
|||
EAlphabet | binary_alphabet = RAWDNA | |||
) |
load ascii line-based string features from file.
any subset is removed before
fname | filename to load from | |
remap_to_bin | if translation to other binary alphabet should be performed | |
ascii_alphabet | src alphabet | |
binary_alphabet | alphabet to translate to |
Definition at line 692 of file StringFeatures.h.
virtual bool load_compressed | ( | char * | src, | |
bool | decompress | |||
) | [virtual] |
load compressed features from file
any subset is removed before
src | filename to load from | |
decompress | whether to decompress on loading |
Definition at line 1355 of file StringFeatures.h.
bool load_fasta_file | ( | const char * | fname, | |
bool | ignore_invalid = false | |||
) |
load fasta file as string features
any subset is removed before
fname | filename to load from | |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A |
Definition at line 833 of file StringFeatures.h.
bool load_fastq_file | ( | const char * | fname, | |
bool | ignore_invalid = false , |
|||
bool | bitremap_in_single_string = false | |||
) |
load fastq file as string features
removes subset beforehand
fname | filename to load from | |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A | |
bitremap_in_single_string | if set to true, do binary embedding of symbols |
Definition at line 935 of file StringFeatures.h.
bool load_from_directory | ( | char * | dirname | ) |
load features from directory
removes subset before
dirname | directory name to load from |
Definition at line 1044 of file StringFeatures.h.
int32_t obtain_by_position_list | ( | int32_t | window_size, | |
CDynamicArray< int32_t > * | positions, | |||
int32_t | skip = 0 | |||
) |
extracts windows of size window_size from first string using the positions in list
not implemented for subset
window_size | window size | |
positions | positions | |
skip | skip |
Definition at line 1614 of file StringFeatures.h.
int32_t obtain_by_sliding_window | ( | int32_t | window_size, | |
int32_t | step_size, | |||
int32_t | skip = 0 | |||
) |
slides a window of size window_size over the current single string. step_size is the amount by which the window is shifted. Creates (string_len-window_size)/step_size many feature objects. If skip is nonzero, the first 'skip' characters of each string are skipped.
not implemented for subset
window_size | window size | |
step_size | step size | |
skip | skip |
Definition at line 1567 of file StringFeatures.h.
bool obtain_from_char | ( | CStringFeatures< char > * | sf, | |
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | gap, | |||
bool | rev | |||
) |
obtain string features from char features
wrapper for template method
any subset is removed before, subset of parameter sf is possible
sf | string features | |
start | start | |
p_order | order | |
gap | gap | |
rev | reverse |
Definition at line 1684 of file StringFeatures.h.
bool obtain_from_char_features | ( | CStringFeatures< CT > * | sf, | |
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | gap, | |||
bool | rev | |||
) |
template obtain from char features
any subset is removed before, subset of parameter sf is possible
sf | string features | |
start | start | |
p_order | order | |
gap | gap | |
rev | reverse |
Definition at line 1701 of file StringFeatures.h.
virtual void save | ( | CFile * | writer | ) | [virtual] |
save features to file
not possible with subset
writer | File object via which to save data |
Reimplemented from CFeatures.
virtual bool save_compressed | ( | char * | dest, | |
E_COMPRESSION_TYPE | compression, | |||
int | level | |||
) | [virtual] |
save compressed features to file
not possible with subset
dest | filename to save to | |
compression | compressor to use | |
level | compression level to use (1-9) |
Definition at line 1459 of file StringFeatures.h.
void set_feature_vector | ( | SGVector< ST > | vector, | |
int32_t | num | |||
) |
set string for selected example num
not possible with subset
vector | ||
num | index of the string |
Definition at line 350 of file StringFeatures.h.
virtual void set_feature_vector | ( | int32_t | num, | |
ST * | string, | |||
int32_t | len | |||
) | [virtual] |
set feature vector for sample num
possible with subset
num | index of feature vector | |
string | string with the feature vector's content | |
len | length of the string |
Definition at line 1975 of file StringFeatures.h.
void set_features | ( | SGStringList< ST > | feats | ) |
bool set_features | ( | SGString< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length | |||
) |
set features
not possible with subset
p_features | new features | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length |
Definition at line 1133 of file StringFeatures.h.
ST shift_offset | ( | ST | offset, | |
int32_t | amount | |||
) |
shift offset to the left by amount
offset | offset to shift | |
amount | amount to shift the offset |
Definition at line 658 of file StringFeatures.h.
ST shift_symbol | ( | ST | symbol, | |
int32_t | amount | |||
) |
shift symbol to the right by amount (taking care of custom symbol sizes)
symbol | symbol to shift | |
amount | amount to shift the symbol |
Definition at line 670 of file StringFeatures.h.
virtual void subset_changed_post | ( | ) | [virtual] |
post method when subset is changed
Reimplemented from CFeatures.
Definition at line 2182 of file StringFeatures.h.
void unembed_word | ( | ST | word, | |
uint8_t * | seq, | |||
int32_t | len | |||
) |
remap bit-based word to character sequence
word | word to remap | |
seq | sequence of size len that remapped characters are written to | |
len | length of sequence and word |
Definition at line 1902 of file StringFeatures.h.
alphabet
Definition at line 2258 of file StringFeatures.h.
CCache<ST>* feature_cache [protected] |
feature cache
Definition at line 2291 of file StringFeatures.h.
this contains the array of features
Definition at line 2264 of file StringFeatures.h.
int32_t length_of_single_string [protected] |
length of prior single string
Definition at line 2270 of file StringFeatures.h.
int32_t max_string_length [protected] |
length of longest string (for subset, is updated)
Definition at line 2273 of file StringFeatures.h.
floatmax_t num_symbols [protected] |
number of used symbols
Definition at line 2276 of file StringFeatures.h.
int32_t num_vectors [protected] |
number of string vectors (for subset, is updated)
Definition at line 2261 of file StringFeatures.h.
int32_t order [protected] |
order used in higher order mapping
Definition at line 2282 of file StringFeatures.h.
floatmax_t original_num_symbols [protected] |
original number of used symbols (before higher order mapping)
Definition at line 2279 of file StringFeatures.h.
bool preprocess_on_get [protected] |
preprocess on-the-fly?
Definition at line 2288 of file StringFeatures.h.
ST* single_string [protected] |
the single string; set when the features consist of a single string / were created by sliding window
Definition at line 2267 of file StringFeatures.h.
ST* symbol_mask_table [protected] |
symbol mask table (required to access bit-based symbols)
Definition at line 2285 of file StringFeatures.h.