The class Alphabet implements an alphabet and alphabet utility functions.
These utility functions can be used to remap characters to more (bit-)efficient representations, check if a string is valid, compute histograms, etc.
Currently supported alphabets are DNA, RAWDNA, RNA, PROTEIN, BINARY, ALPHANUM, CUBE, RAW, IUPAC_NUCLEIC_ACID and IUPAC_AMINO_ACID.
Definition at line 89 of file Alphabet.h.
Public Member Functions | |
CAlphabet () | |
CAlphabet (char *alpha, int32_t len) | |
CAlphabet (EAlphabet alpha) | |
CAlphabet (CAlphabet *alpha) | |
virtual | ~CAlphabet () |
bool | set_alphabet (EAlphabet alpha) |
EAlphabet | get_alphabet () const |
int32_t | get_num_symbols () const |
int32_t | get_num_bits () const |
uint8_t | remap_to_bin (uint8_t c) |
uint8_t | remap_to_char (uint8_t c) |
void | clear_histogram () |
clear histogram | |
template<class T > | |
void | add_string_to_histogram (T *p, int64_t len) |
void | add_byte_to_histogram (uint8_t p) |
void | print_histogram () |
print histogram | |
void | get_hist (int64_t **h, int32_t *len) |
const int64_t * | get_histogram () |
get pointer to histogram | |
bool | check_alphabet (bool print_error=true) |
bool | is_valid (uint8_t c) |
bool | check_alphabet_size (bool print_error=true) |
int32_t | get_num_symbols_in_histogram () |
int32_t | get_max_value_in_histogram () |
int32_t | get_num_bits_in_histogram () |
virtual const char * | get_name () const |
Static Public Member Functions | |
static const char * | get_alphabet_name (EAlphabet alphabet) |
template<class ST > | |
static void | translate_from_single_order (ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val) |
template<class ST > | |
static void | translate_from_single_order_reversed (ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val) |
template<class ST > | |
static void | translate_from_single_order (ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) |
template<class ST > | |
static void | translate_from_single_order_reversed (ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) |
Static Public Attributes | |
static const uint8_t | B_A = 0 |
static const uint8_t | B_C = 1 |
static const uint8_t | B_G = 2 |
static const uint8_t | B_T = 3 |
static const uint8_t | B_0 = 4 |
static const uint8_t | MAPTABLE_UNDEF = 0xff |
static const char * | alphabet_names [18] |
Protected Member Functions | |
void | init_map_table () |
void | copy_histogram (CAlphabet *src) |
virtual void | load_serializable_post (void) throw (ShogunException) |
Protected Attributes | |
EAlphabet | alphabet |
int32_t | num_symbols |
int32_t | num_bits |
bool | valid_chars [1<< (sizeof(uint8_t)*8)] |
uint8_t | maptable_to_bin [1<< (sizeof(uint8_t)*8)] |
uint8_t | maptable_to_char [1<< (sizeof(uint8_t)*8)] |
int64_t | histogram [1<< (sizeof(uint8_t)*8)] |
CAlphabet | ( | ) |
default constructor
Definition at line 34 of file Alphabet.cpp.
CAlphabet | ( | char * | alpha, | |
int32_t | len | |||
) |
~CAlphabet | ( | ) | [virtual] |
Definition at line 103 of file Alphabet.cpp.
void add_byte_to_histogram | ( | uint8_t | p | ) |
void add_string_to_histogram | ( | T * | p, | |
int64_t | len | |||
) |
make histogram for whole string
p | string | |
len | length of string |
Definition at line 181 of file Alphabet.h.
bool check_alphabet | ( | bool | print_error = true |
) |
check whether the symbols in the histogram are valid in the alphabet, e.g. for DNA, whether only the letters A, C, G and T appear
print_error | if errors shall be printed |
Definition at line 594 of file Alphabet.cpp.
bool check_alphabet_size | ( | bool | print_error = true |
) |
check whether all symbols in the histogram fit in the alphabet
print_error | if errors shall be printed |
Definition at line 616 of file Alphabet.cpp.
void clear_histogram | ( | ) |
clear histogram
Definition at line 543 of file Alphabet.cpp.
void copy_histogram | ( | CAlphabet * | src | ) | [protected] |
copy histogram
src | alphabet to copy histogram from |
Definition at line 633 of file Alphabet.cpp.
EAlphabet get_alphabet | ( | ) | const |
const char * get_alphabet_name | ( | EAlphabet | alphabet | ) | [static] |
return alphabet name
alphabet | alphabet type to get name from |
Definition at line 638 of file Alphabet.cpp.
void get_hist | ( | int64_t ** | h, | |
int32_t * | len | |||
) |
get histogram
h | where the histogram will be stored | |
len | length of histogram |
Definition at line 204 of file Alphabet.h.
const int64_t* get_histogram | ( | ) |
get pointer to histogram
Definition at line 215 of file Alphabet.h.
int32_t get_max_value_in_histogram | ( | ) |
return maximum value in histogram
Definition at line 549 of file Alphabet.cpp.
virtual const char* get_name | ( | void | ) | const [virtual] |
int32_t get_num_bits | ( | ) | const |
get number of bits necessary to store all symbols in alphabet
Definition at line 147 of file Alphabet.h.
int32_t get_num_bits_in_histogram | ( | ) |
return number of bits required to store all symbols in histogram
Definition at line 576 of file Alphabet.cpp.
int32_t get_num_symbols | ( | ) | const |
get number of symbols in alphabet
Definition at line 137 of file Alphabet.h.
int32_t get_num_symbols_in_histogram | ( | ) |
return number of symbols in histogram
Definition at line 564 of file Alphabet.cpp.
void init_map_table | ( | ) | [protected] |
init map table
Definition at line 178 of file Alphabet.cpp.
bool is_valid | ( | uint8_t | c | ) |
check whether a symbol is valid in the alphabet, e.g. for DNA, whether the symbol is one of A, C, G or T
c | symbol |
Definition at line 234 of file Alphabet.h.
void load_serializable_post | ( | void | ) | throw (ShogunException) [protected, virtual] |
Can (optionally) be overridden to post-initialize some member variables which are not PARAMETER::ADD'ed. Make sure that the overridden method BASE_CLASS::LOAD_SERIALIZABLE_POST is called first.
ShogunException | Will be thrown if an error occurs. |
Reimplemented from CSGObject.
Definition at line 718 of file Alphabet.cpp.
void print_histogram | ( | ) |
print histogram
Definition at line 585 of file Alphabet.cpp.
uint8_t remap_to_bin | ( | uint8_t | c | ) |
remap element, e.g. translate ACGT to 0123
c | element to remap |
Definition at line 157 of file Alphabet.h.
uint8_t remap_to_char | ( | uint8_t | c | ) |
remap element, e.g. translate 0123 to ACGT
c | element to remap |
Definition at line 167 of file Alphabet.h.
bool set_alphabet | ( | EAlphabet | alpha | ) |
set alphabet and initialize mapping table (for remap)
alpha | new alphabet |
Definition at line 107 of file Alphabet.cpp.
static void translate_from_single_order | ( | ST * | obs, | |
int32_t | sequence_length, | |||
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | max_val | |||
) | [static] |
translate from single order
obs | observation | |
sequence_length | length of sequence | |
start | start | |
p_order | order | |
max_val | maximum value |
Definition at line 285 of file Alphabet.h.
static void translate_from_single_order | ( | ST * | obs, | |
int32_t | sequence_length, | |||
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | max_val, | |||
int32_t | gap | |||
) | [static] |
translate from single order
obs | observation | |
sequence_length | length of sequence | |
start | start | |
p_order | order | |
max_val | maximum value | |
gap | gap |
Definition at line 378 of file Alphabet.h.
static void translate_from_single_order_reversed | ( | ST * | obs, | |
int32_t | sequence_length, | |||
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | max_val | |||
) | [static] |
translate from single order reversed
obs | observation | |
sequence_length | length of sequence | |
start | start | |
p_order | order | |
max_val | maximum value |
Definition at line 331 of file Alphabet.h.
static void translate_from_single_order_reversed | ( | ST * | obs, | |
int32_t | sequence_length, | |||
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | max_val, | |||
int32_t | gap | |||
) | [static] |
translate from single order reversed
obs | observation | |
sequence_length | length of sequence | |
start | start | |
p_order | order | |
max_val | maximum value | |
gap | gap |
Definition at line 449 of file Alphabet.h.
alphabet
Definition at line 550 of file Alphabet.h.
const char * alphabet_names [static] |
{ "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM", "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID", "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN", "SNP", "RAWSNP"}
alphabet names
Definition at line 535 of file Alphabet.h.
const uint8_t B_0 = 4 [static] |
B_0
Definition at line 531 of file Alphabet.h.
const uint8_t B_A = 0 [static] |
B_A
Definition at line 523 of file Alphabet.h.
const uint8_t B_C = 1 [static] |
B_C
Definition at line 525 of file Alphabet.h.
const uint8_t B_G = 2 [static] |
B_G
Definition at line 527 of file Alphabet.h.
const uint8_t B_T = 3 [static] |
B_T
Definition at line 529 of file Alphabet.h.
int64_t histogram[1<< (sizeof(uint8_t)*8)] [protected] |
histogram
Definition at line 562 of file Alphabet.h.
uint8_t maptable_to_bin[1<< (sizeof(uint8_t)*8)] [protected] |
maptable to bin
Definition at line 558 of file Alphabet.h.
uint8_t maptable_to_char[1<< (sizeof(uint8_t)*8)] [protected] |
maptable to char
Definition at line 560 of file Alphabet.h.
const uint8_t MAPTABLE_UNDEF = 0xff [static] |
MAPTABLE UNDEF
Definition at line 533 of file Alphabet.h.
int32_t num_bits [protected] |
number of bits
Definition at line 554 of file Alphabet.h.
int32_t num_symbols [protected] |
number of symbols
Definition at line 552 of file Alphabet.h.
bool valid_chars[1<< (sizeof(uint8_t)*8)] [protected] |
valid chars
Definition at line 556 of file Alphabet.h.