00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013
00014 #include <shogun/base/SGObject.h>
00015 #include <shogun/mathematics/Math.h>
00016 #include <shogun/lib/common.h>
00017
00018 namespace shogun
00019 {
00021 enum EAlphabet
00022 {
00024 DNA=0,
00025
00027 RAWDNA=1,
00028
00030 RNA=2,
00031
00033 PROTEIN=3,
00034
00035
00036 BINARY=4,
00037
00039 ALPHANUM=5,
00040
00042 CUBE=6,
00043
00045 RAWBYTE=7,
00046
00048 IUPAC_NUCLEIC_ACID=8,
00049
00051 IUPAC_AMINO_ACID=9,
00052
00054 NONE=10,
00055
00057 DIGIT=11,
00058
00060 DIGIT2=12,
00061
00063 RAWDIGIT=13,
00064
00066 RAWDIGIT2=14,
00067
00069 UNKNOWN=15,
00070
00072 SNP=16,
00073
00075 RAWSNP=17
00076 };
00077
00078
00089 class CAlphabet : public CSGObject
00090 {
00091 public:
00092
00096 CAlphabet();
00097
00103 CAlphabet(char* alpha, int32_t len);
00104
00109 CAlphabet(EAlphabet alpha);
00110
00115 CAlphabet(CAlphabet* alpha);
00116 virtual ~CAlphabet();
00117
00122 bool set_alphabet(EAlphabet alpha);
00123
00128 inline EAlphabet get_alphabet() const
00129 {
00130 return alphabet;
00131 }
00132
00137 inline int32_t get_num_symbols() const
00138 {
00139 return num_symbols;
00140 }
00141
00147 inline int32_t get_num_bits() const
00148 {
00149 return num_bits;
00150 }
00151
00157 inline uint8_t remap_to_bin(uint8_t c)
00158 {
00159 return maptable_to_bin[c];
00160 }
00161
00167 inline uint8_t remap_to_char(uint8_t c)
00168 {
00169 return maptable_to_char[c];
00170 }
00171
00173 void clear_histogram();
00174
00180 template <class T>
00181 void add_string_to_histogram(T* p, int64_t len)
00182 {
00183 for (int64_t i=0; i<len; i++)
00184 add_byte_to_histogram((uint8_t) (p[i]));
00185 }
00186
00191 inline void add_byte_to_histogram(uint8_t p)
00192 {
00193 histogram[p]++;
00194 }
00195
00197 void print_histogram();
00198
00204 inline void get_hist(int64_t** h, int32_t* len)
00205 {
00206 int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00207 ASSERT(h && len);
00208 *h= SG_MALLOC(int64_t, hist_size);
00209 *len=hist_size;
00210 ASSERT(*len);
00211 memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00212 }
00213
00215 inline const int64_t* get_histogram()
00216 {
00217 return &histogram[0];
00218 }
00219
00226 bool check_alphabet(bool print_error=true);
00227
00234 inline bool is_valid(uint8_t c)
00235 {
00236 return valid_chars[c];
00237 }
00238
00244 bool check_alphabet_size(bool print_error=true);
00245
00250 int32_t get_num_symbols_in_histogram();
00251
00256 int32_t get_max_value_in_histogram();
00257
00264 int32_t get_num_bits_in_histogram();
00265
00270 static const char* get_alphabet_name(EAlphabet alphabet);
00271
00272
00274 inline virtual const char* get_name() const { return "Alphabet"; }
00275
00284 template <class ST>
00285 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00286 {
00287 int32_t i,j;
00288 ST value=0;
00289
00290 for (i=sequence_length-1; i>= p_order-1; i--)
00291 {
00292 value=0;
00293 for (j=i; j>=i-p_order+1; j--)
00294 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00295
00296 obs[i]= (ST) value;
00297 }
00298
00299 for (i=p_order-2;i>=0;i--)
00300 {
00301 if (i>=sequence_length)
00302 continue;
00303
00304 value=0;
00305 for (j=i; j>=i-p_order+1; j--)
00306 {
00307 value= (value >> max_val);
00308 if (j>=0 && j<sequence_length)
00309 value|=obs[j] << (max_val * (p_order-1));
00310 }
00311 obs[i]=value;
00312 }
00313
00314
00315 if (start>0)
00316 {
00317 for (i=start; i<sequence_length; i++)
00318 obs[i-start]=obs[i];
00319 }
00320 }
00321
00330 template <class ST>
00331 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00332 {
00333 int32_t i,j;
00334 ST value=0;
00335
00336 for (i=sequence_length-1; i>= p_order-1; i--)
00337 {
00338 value=0;
00339 for (j=i; j>=i-p_order+1; j--)
00340 value= (value << max_val) | obs[j];
00341
00342 obs[i]= (ST) value;
00343 }
00344
00345 for (i=p_order-2;i>=0;i--)
00346 {
00347 if (i>=sequence_length)
00348 continue;
00349
00350 value=0;
00351 for (j=i; j>=i-p_order+1; j--)
00352 {
00353 value= (value << max_val);
00354 if (j>=0 && j<sequence_length)
00355 value|=obs[j];
00356 }
00357 obs[i]=value;
00358 }
00359
00360
00361 if (start>0)
00362 {
00363 for (i=start; i<sequence_length; i++)
00364 obs[i-start]=obs[i];
00365 }
00366 }
00367
00377 template <class ST>
00378 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00379 {
00380 ASSERT(gap>=0);
00381
00382 const int32_t start_gap=(p_order-gap)/2;
00383 const int32_t end_gap=start_gap+gap;
00384
00385 int32_t i,j;
00386 ST value=0;
00387
00388
00389 for (i=sequence_length-1; i>=p_order-1; i--)
00390 {
00391 value=0;
00392 for (j=i; j>=i-p_order+1; j--)
00393 {
00394 if (i-j<start_gap)
00395 {
00396 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00397 }
00398 else if (i-j>=end_gap)
00399 {
00400 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00401 }
00402 }
00403 obs[i]= (ST) value;
00404 }
00405
00406
00407 for (i=p_order-2;i>=0;i--)
00408 {
00409 if (i>=sequence_length)
00410 continue;
00411
00412 value=0;
00413 for (j=i; j>=i-p_order+1; j--)
00414 {
00415 if (i-j<start_gap)
00416 {
00417 value= (value >> max_val);
00418 if (j>=0 && j<sequence_length)
00419 value|=obs[j] << (max_val * (p_order-1-gap));
00420 }
00421 else if (i-j>=end_gap)
00422 {
00423 value= (value >> max_val);
00424 if (j>=0 && j<sequence_length)
00425 value|=obs[j] << (max_val * (p_order-1-gap));
00426 }
00427 }
00428 obs[i]=value;
00429 }
00430
00431
00432 if (start>0)
00433 {
00434 for (i=start; i<sequence_length; i++)
00435 obs[i-start]=obs[i];
00436 }
00437 }
00438
00448 template <class ST>
00449 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00450 {
00451 ASSERT(gap>=0);
00452
00453 const int32_t start_gap=(p_order-gap)/2;
00454 const int32_t end_gap=start_gap+gap;
00455
00456 int32_t i,j;
00457 ST value=0;
00458
00459
00460 for (i=sequence_length-1; i>=p_order-1; i--)
00461 {
00462 value=0;
00463 for (j=i; j>=i-p_order+1; j--)
00464 {
00465 if (i-j<start_gap)
00466 value= (value << max_val) | obs[j];
00467 else if (i-j>=end_gap)
00468 value= (value << max_val) | obs[j];
00469 }
00470 obs[i]= (ST) value;
00471 }
00472
00473
00474 for (i=p_order-2;i>=0;i--)
00475 {
00476 if (i>=sequence_length)
00477 continue;
00478
00479 value=0;
00480 for (j=i; j>=i-p_order+1; j--)
00481 {
00482 if (i-j<start_gap)
00483 {
00484 value= value << max_val;
00485 if (j>=0 && j<sequence_length)
00486 value|=obs[j];
00487 }
00488 else if (i-j>=end_gap)
00489 {
00490 value= value << max_val;
00491 if (j>=0 && j<sequence_length)
00492 value|=obs[j];
00493 }
00494 }
00495 obs[i]=value;
00496 }
00497
00498
00499 if (start>0)
00500 {
00501 for (i=start; i<sequence_length; i++)
00502 obs[i-start]=obs[i];
00503 }
00504 }
00505
00506 private:
00509 void init();
00510
00511 protected:
00513 void init_map_table();
00514
00519 void copy_histogram(CAlphabet* src);
00520
00521 public:
00523 static const uint8_t B_A;
00525 static const uint8_t B_C;
00527 static const uint8_t B_G;
00529 static const uint8_t B_T;
00531 static const uint8_t B_0;
00533 static const uint8_t MAPTABLE_UNDEF;
00535 static const char* alphabet_names[18];
00536
00537 protected:
00546 virtual void load_serializable_post(void) throw (ShogunException);
00547
00548 protected:
00550 EAlphabet alphabet;
00552 int32_t num_symbols;
00554 int32_t num_bits;
00556 bool valid_chars[1 << (sizeof(uint8_t)*8)];
00558 uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00560 uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00562 int64_t histogram[1 << (sizeof(uint8_t)*8)];
00563 };
00564
00565
00566 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00567 template<> inline void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00568 {
00569 }
00570
00571 template<> inline void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00572 {
00573 }
00574
00575 template<> inline void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00576 {
00577 }
00578
00579 template<> inline void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00580 {
00581 }
00582
00583 template<> inline void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00584 {
00585 }
00586
00587 template<> inline void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00588 {
00589 }
00590 #endif
00591
00592 }
00593 #endif