Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013
00014 #include <shogun/base/SGObject.h>
00015 #include <shogun/lib/common.h>
00016
00017 namespace shogun
00018 {
/** Identifiers of the alphabets a CAlphabet can be switched to.
 *
 * The values are assigned explicitly and run contiguously from 0 (DNA) to
 * 17 (RAWSNP), i.e. 18 enumerators in total. That is also the declared
 * length of CAlphabet::alphabet_names (presumably one name per enumerator,
 * indexed by these values - confirm against the implementation).
 *
 * NOTE(review): the symbol sets mentioned per enumerator below are NOT
 * visible in this header; they are recalled from the upstream Shogun
 * documentation and must be confirmed against CAlphabet::init_map_table().
 */
enum EAlphabet
{
	/// DNA - presumably letters A,C,G,T
	DNA=0,

	/// raw DNA - presumably the values 0,1,2,3
	RAWDNA=1,

	/// RNA - presumably letters A,C,G,U
	RNA=2,

	/// protein - presumably letters A-Y
	PROTEIN=3,

	/// binary - presumably 0 and 1
	BINARY=4,

	/// alphanumeric - presumably [0-9A-Z]
	ALPHANUM=5,

	/// cube - presumably [1-6]
	CUBE=6,

	/// raw byte - presumably every value [0-255]
	RAWBYTE=7,

	/// IUPAC nucleic acid codes
	IUPAC_NUCLEIC_ACID=8,

	/// IUPAC amino acid codes
	IUPAC_AMINO_ACID=9,

	/// no alphabet
	NONE=10,

	/// digits - presumably 0-9
	DIGIT=11,

	/// digits - presumably 0-2
	DIGIT2=12,

	/// raw digits - presumably the values 0-9
	RAWDIGIT=13,

	/// raw digits - presumably the values 0-2
	RAWDIGIT2=14,

	/// unknown alphabet
	UNKNOWN=15,

	/// SNP - presumably letters A,C,G,T,0
	SNP=16,

	/// raw SNP - presumably the values 0,1,2,3,4
	RAWSNP=17
};
00076
00077
00088 class CAlphabet : public CSGObject
00089 {
00090 public:
00091
00095 CAlphabet();
00096
00102 CAlphabet(char* alpha, int32_t len);
00103
00108 CAlphabet(EAlphabet alpha);
00109
00114 CAlphabet(CAlphabet* alpha);
00115 virtual ~CAlphabet();
00116
00121 bool set_alphabet(EAlphabet alpha);
00122
00127 inline EAlphabet get_alphabet() const
00128 {
00129 return alphabet;
00130 }
00131
00136 inline int32_t get_num_symbols() const
00137 {
00138 return num_symbols;
00139 }
00140
00146 inline int32_t get_num_bits() const
00147 {
00148 return num_bits;
00149 }
00150
00156 inline uint8_t remap_to_bin(uint8_t c)
00157 {
00158 return maptable_to_bin[c];
00159 }
00160
00166 inline uint8_t remap_to_char(uint8_t c)
00167 {
00168 return maptable_to_char[c];
00169 }
00170
00172 void clear_histogram();
00173
00179 template <class T>
00180 void add_string_to_histogram(T* p, int64_t len)
00181 {
00182 for (int64_t i=0; i<len; i++)
00183 add_byte_to_histogram((uint8_t) (p[i]));
00184 }
00185
00190 inline void add_byte_to_histogram(uint8_t p)
00191 {
00192 histogram[p]++;
00193 }
00194
00196 void print_histogram();
00197
00202 SGVector<int64_t> get_histogram();
00203
00210 bool check_alphabet(bool print_error=true);
00211
00218 inline bool is_valid(uint8_t c)
00219 {
00220 return valid_chars[c];
00221 }
00222
00228 bool check_alphabet_size(bool print_error=true);
00229
00234 int32_t get_num_symbols_in_histogram();
00235
00240 int32_t get_max_value_in_histogram();
00241
00248 int32_t get_num_bits_in_histogram();
00249
00254 static const char* get_alphabet_name(EAlphabet alphabet);
00255
00256
00258 inline virtual const char* get_name() const { return "Alphabet"; }
00259
00268 template <class ST>
00269 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00270
00279 template <class ST>
00280 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00281
00291 template <class ST>
00292 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00293
00303 template <class ST>
00304 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00305
00306 private:
00309 void init();
00310
00311 protected:
00313 void init_map_table();
00314
00319 void copy_histogram(CAlphabet* src);
00320
00321 public:
00323 static const uint8_t B_A;
00325 static const uint8_t B_C;
00327 static const uint8_t B_G;
00329 static const uint8_t B_T;
00331 static const uint8_t B_0;
00333 static const uint8_t MAPTABLE_UNDEF;
00335 static const char* alphabet_names[18];
00336
00337 protected:
00346 virtual void load_serializable_post() throw (ShogunException);
00347
00348 protected:
00350 EAlphabet alphabet;
00352 int32_t num_symbols;
00354 int32_t num_bits;
00356 bool valid_chars[1 << (sizeof(uint8_t)*8)];
00358 uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00360 uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00362 int64_t histogram[1 << (sizeof(uint8_t)*8)];
00363 };
00364 }
00365 #endif