00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013
00014 #include "base/SGObject.h"
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017
00018 namespace shogun
00019 {
00021 enum EAlphabet
00022 {
00024 DNA=0,
00025
00027 RAWDNA=1,
00028
00030 RNA=2,
00031
00033 PROTEIN=3,
00034
00035
00036 BINARY=4,
00037
00039 ALPHANUM=5,
00040
00042 CUBE=6,
00043
00045 RAWBYTE=7,
00046
00048 IUPAC_NUCLEIC_ACID=8,
00049
00051 IUPAC_AMINO_ACID=9,
00052
00054 NONE=10,
00055
00057 DIGIT=11,
00058
00060 DIGIT2=12,
00061
00063 RAWDIGIT=13,
00064
00066 RAWDIGIT2=14,
00067
00069 UNKNOWN=15,
00070
00072 SNP=16,
00073
00075 RAWSNP=17
00076 };
00077
00078
00089 class CAlphabet : public CSGObject
00090 {
00091 public:
00092
00096 CAlphabet();
00097
00103 CAlphabet(char* alpha, int32_t len);
00104
00109 CAlphabet(EAlphabet alpha);
00110
00115 CAlphabet(CAlphabet* alpha);
00116 virtual ~CAlphabet();
00117
00122 bool set_alphabet(EAlphabet alpha);
00123
00128 inline EAlphabet get_alphabet() const
00129 {
00130 return alphabet;
00131 }
00132
00137 inline int32_t get_num_symbols() const
00138 {
00139 return num_symbols;
00140 }
00141
00147 inline int32_t get_num_bits() const
00148 {
00149 return num_bits;
00150 }
00151
00157 inline uint8_t remap_to_bin(uint8_t c)
00158 {
00159 return maptable_to_bin[c];
00160 }
00161
00167 inline uint8_t remap_to_char(uint8_t c)
00168 {
00169 return maptable_to_char[c];
00170 }
00171
00173 void clear_histogram();
00174
00180 template <class T>
00181 void add_string_to_histogram(T* p, int64_t len)
00182 {
00183 for (int64_t i=0; i<len; i++)
00184 add_byte_to_histogram((uint8_t) (p[i]));
00185 }
00186
00191 inline void add_byte_to_histogram(uint8_t p)
00192 {
00193 histogram[p]++;
00194 }
00195
00197 void print_histogram();
00198
00204 inline void get_hist(int64_t** h, int32_t* len)
00205 {
00206 int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00207 ASSERT(h && len);
00208 *h=(int64_t*) malloc(sizeof(int64_t)*hist_size);
00209 ASSERT(*h);
00210 *len=hist_size;
00211 ASSERT(*len);
00212 memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00213 }
00214
00216 inline const int64_t* get_histogram()
00217 {
00218 return &histogram[0];
00219 }
00220
00227 bool check_alphabet(bool print_error=true);
00228
00235 inline bool is_valid(uint8_t c)
00236 {
00237 return valid_chars[c];
00238 }
00239
00245 bool check_alphabet_size(bool print_error=true);
00246
00251 int32_t get_num_symbols_in_histogram();
00252
00257 int32_t get_max_value_in_histogram();
00258
00265 int32_t get_num_bits_in_histogram();
00266
00271 static const char* get_alphabet_name(EAlphabet alphabet);
00272
00273
00275 inline virtual const char* get_name() const { return "Alphabet"; }
00276
00285 template <class ST>
00286 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00287 {
00288 int32_t i,j;
00289 ST value=0;
00290
00291 for (i=sequence_length-1; i>= p_order-1; i--)
00292 {
00293 value=0;
00294 for (j=i; j>=i-p_order+1; j--)
00295 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00296
00297 obs[i]= (ST) value;
00298 }
00299
00300 for (i=p_order-2;i>=0;i--)
00301 {
00302 if (i>=sequence_length)
00303 continue;
00304
00305 value=0;
00306 for (j=i; j>=i-p_order+1; j--)
00307 {
00308 value= (value >> max_val);
00309 if (j>=0 && j<sequence_length)
00310 value|=obs[j] << (max_val * (p_order-1));
00311 }
00312 obs[i]=value;
00313 }
00314
00315
00316 if (start>0)
00317 {
00318 for (i=start; i<sequence_length; i++)
00319 obs[i-start]=obs[i];
00320 }
00321 }
00322
00331 template <class ST>
00332 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00333 {
00334 int32_t i,j;
00335 ST value=0;
00336
00337 for (i=sequence_length-1; i>= p_order-1; i--)
00338 {
00339 value=0;
00340 for (j=i; j>=i-p_order+1; j--)
00341 value= (value << max_val) | obs[j];
00342
00343 obs[i]= (ST) value;
00344 }
00345
00346 for (i=p_order-2;i>=0;i--)
00347 {
00348 if (i>=sequence_length)
00349 continue;
00350
00351 value=0;
00352 for (j=i; j>=i-p_order+1; j--)
00353 {
00354 value= (value << max_val);
00355 if (j>=0 && j<sequence_length)
00356 value|=obs[j];
00357 }
00358 obs[i]=value;
00359 }
00360
00361
00362 if (start>0)
00363 {
00364 for (i=start; i<sequence_length; i++)
00365 obs[i-start]=obs[i];
00366 }
00367 }
00368
00378 template <class ST>
00379 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00380 {
00381 ASSERT(gap>=0);
00382
00383 const int32_t start_gap=(p_order-gap)/2;
00384 const int32_t end_gap=start_gap+gap;
00385
00386 int32_t i,j;
00387 ST value=0;
00388
00389
00390 for (i=sequence_length-1; i>=p_order-1; i--)
00391 {
00392 value=0;
00393 for (j=i; j>=i-p_order+1; j--)
00394 {
00395 if (i-j<start_gap)
00396 {
00397 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00398 }
00399 else if (i-j>=end_gap)
00400 {
00401 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00402 }
00403 }
00404 obs[i]= (ST) value;
00405 }
00406
00407
00408 for (i=p_order-2;i>=0;i--)
00409 {
00410 if (i>=sequence_length)
00411 continue;
00412
00413 value=0;
00414 for (j=i; j>=i-p_order+1; j--)
00415 {
00416 if (i-j<start_gap)
00417 {
00418 value= (value >> max_val);
00419 if (j>=0 && j<sequence_length)
00420 value|=obs[j] << (max_val * (p_order-1-gap));
00421 }
00422 else if (i-j>=end_gap)
00423 {
00424 value= (value >> max_val);
00425 if (j>=0 && j<sequence_length)
00426 value|=obs[j] << (max_val * (p_order-1-gap));
00427 }
00428 }
00429 obs[i]=value;
00430 }
00431
00432
00433 if (start>0)
00434 {
00435 for (i=start; i<sequence_length; i++)
00436 obs[i-start]=obs[i];
00437 }
00438 }
00439
00449 template <class ST>
00450 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00451 {
00452 ASSERT(gap>=0);
00453
00454 const int32_t start_gap=(p_order-gap)/2;
00455 const int32_t end_gap=start_gap+gap;
00456
00457 int32_t i,j;
00458 ST value=0;
00459
00460
00461 for (i=sequence_length-1; i>=p_order-1; i--)
00462 {
00463 value=0;
00464 for (j=i; j>=i-p_order+1; j--)
00465 {
00466 if (i-j<start_gap)
00467 value= (value << max_val) | obs[j];
00468 else if (i-j>=end_gap)
00469 value= (value << max_val) | obs[j];
00470 }
00471 obs[i]= (ST) value;
00472 }
00473
00474
00475 for (i=p_order-2;i>=0;i--)
00476 {
00477 if (i>=sequence_length)
00478 continue;
00479
00480 value=0;
00481 for (j=i; j>=i-p_order+1; j--)
00482 {
00483 if (i-j<start_gap)
00484 {
00485 value= value << max_val;
00486 if (j>=0 && j<sequence_length)
00487 value|=obs[j];
00488 }
00489 else if (i-j>=end_gap)
00490 {
00491 value= value << max_val;
00492 if (j>=0 && j<sequence_length)
00493 value|=obs[j];
00494 }
00495 }
00496 obs[i]=value;
00497 }
00498
00499
00500 if (start>0)
00501 {
00502 for (i=start; i<sequence_length; i++)
00503 obs[i-start]=obs[i];
00504 }
00505 }
00506
00507 private:
00510 void init();
00511
00512 protected:
00514 void init_map_table();
00515
00520 void copy_histogram(CAlphabet* src);
00521
00522 public:
00524 static const uint8_t B_A;
00526 static const uint8_t B_C;
00528 static const uint8_t B_G;
00530 static const uint8_t B_T;
00532 static const uint8_t B_0;
00534 static const uint8_t MAPTABLE_UNDEF;
00536 static const char* alphabet_names[18];
00537
00538 protected:
00547 virtual void load_serializable_post(void) throw (ShogunException);
00548
00549 protected:
00551 EAlphabet alphabet;
00553 int32_t num_symbols;
00555 int32_t num_bits;
00557 bool valid_chars[1 << (sizeof(uint8_t)*8)];
00559 uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00561 uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00563 int64_t histogram[1 << (sizeof(uint8_t)*8)];
00564 };
00565
00566
00567 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00568 template<> inline void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00569 {
00570 }
00571
00572 template<> inline void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00573 {
00574 }
00575
00576 template<> inline void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00577 {
00578 }
00579
00580 template<> inline void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00581 {
00582 }
00583
00584 template<> inline void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00585 {
00586 }
00587
00588 template<> inline void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00589 {
00590 }
00591 #endif
00592
00593 }
00594 #endif