Alphabet.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Soeren Sonnenburg
00008  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013 
00014 #include <shogun/base/SGObject.h>
00015 #include <shogun/mathematics/Math.h>
00016 #include <shogun/lib/common.h>
00017 
00018 namespace shogun
00019 {
00021 enum EAlphabet
00022 {
00024     DNA=0,
00025 
00027     RAWDNA=1,
00028 
00030     RNA=2,
00031 
00033     PROTEIN=3,
00034 
00035     // BINARY just 0 and 1
00036     BINARY=4,
00037 
00039     ALPHANUM=5,
00040 
00042     CUBE=6,
00043 
00045     RAWBYTE=7,
00046 
00048     IUPAC_NUCLEIC_ACID=8,
00049 
00051     IUPAC_AMINO_ACID=9,
00052 
00054     NONE=10,
00055 
00057     DIGIT=11,
00058 
00060     DIGIT2=12,
00061 
00063     RAWDIGIT=13,
00064 
00066     RAWDIGIT2=14,
00067 
00069     UNKNOWN=15,
00070 
00072     SNP=16,
00073 
00075     RAWSNP=17
00076 };
00077 
00078 
00089 class CAlphabet : public CSGObject
00090 {
00091     public:
00092 
00096         CAlphabet();
00097 
00103         CAlphabet(char* alpha, int32_t len);
00104 
00109         CAlphabet(EAlphabet alpha);
00110 
00115         CAlphabet(CAlphabet* alpha);
00116         virtual ~CAlphabet();
00117 
00122         bool set_alphabet(EAlphabet alpha);
00123 
00128         inline EAlphabet get_alphabet() const
00129         {
00130             return alphabet;
00131         }
00132 
00137         inline int32_t get_num_symbols() const
00138         {
00139             return num_symbols;
00140         }
00141 
00147         inline int32_t get_num_bits() const
00148         {
00149             return num_bits;
00150         }
00151 
00157         inline uint8_t remap_to_bin(uint8_t c)
00158         {
00159             return maptable_to_bin[c];
00160         }
00161 
00167         inline uint8_t remap_to_char(uint8_t c)
00168         {
00169             return maptable_to_char[c];
00170         }
00171 
00173         void clear_histogram();
00174 
00180         template <class T>
00181         void add_string_to_histogram(T* p, int64_t len)
00182         {
00183             for (int64_t i=0; i<len; i++)
00184                 add_byte_to_histogram((uint8_t) (p[i]));
00185         }
00186 
00191         inline void add_byte_to_histogram(uint8_t p)
00192         {
00193             histogram[p]++;
00194         }
00195 
00197         void print_histogram();
00198 
00204         inline void get_hist(int64_t** h, int32_t* len)
00205         {
00206             int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00207             ASSERT(h && len);
00208             *h= SG_MALLOC(int64_t, hist_size);
00209             *len=hist_size;
00210             ASSERT(*len);
00211             memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00212         }
00213 
00215         inline const int64_t* get_histogram()
00216         {
00217             return &histogram[0];
00218         }
00219 
00226         bool check_alphabet(bool print_error=true);
00227 
00234         inline bool is_valid(uint8_t c)
00235         {
00236             return valid_chars[c];
00237         }
00238 
00244         bool check_alphabet_size(bool print_error=true);
00245 
00250         int32_t get_num_symbols_in_histogram();
00251 
00256         int32_t get_max_value_in_histogram();
00257 
00264         int32_t get_num_bits_in_histogram();
00265 
00270         static const char* get_alphabet_name(EAlphabet alphabet);
00271 
00272 
00274         inline virtual const char* get_name() const { return "Alphabet"; }
00275 
00284         template <class ST>
00285         static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00286         {
00287             int32_t i,j;
00288             ST value=0;
00289 
00290             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
00291             {
00292                 value=0;
00293                 for (j=i; j>=i-p_order+1; j--)
00294                     value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00295 
00296                 obs[i]= (ST) value;
00297             }
00298 
00299             for (i=p_order-2;i>=0;i--)
00300             {
00301                 if (i>=sequence_length)
00302                     continue;
00303 
00304                 value=0;
00305                 for (j=i; j>=i-p_order+1; j--)
00306                 {
00307                     value= (value >> max_val);
00308                     if (j>=0 && j<sequence_length)
00309                         value|=obs[j] << (max_val * (p_order-1));
00310                 }
00311                 obs[i]=value;
00312             }
00313 
00314             // TODO we should get rid of this loop!
00315             if (start>0)
00316             {
00317                 for (i=start; i<sequence_length; i++)
00318                     obs[i-start]=obs[i];
00319             }
00320         }
00321 
00330         template <class ST>
00331         static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00332         {
00333             int32_t i,j;
00334             ST value=0;
00335 
00336             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
00337             {
00338                 value=0;
00339                 for (j=i; j>=i-p_order+1; j--)
00340                     value= (value << max_val) | obs[j];
00341 
00342                 obs[i]= (ST) value;
00343             }
00344 
00345             for (i=p_order-2;i>=0;i--)
00346             {
00347                 if (i>=sequence_length)
00348                     continue;
00349 
00350                 value=0;
00351                 for (j=i; j>=i-p_order+1; j--)
00352                 {
00353                     value= (value << max_val);
00354                     if (j>=0 && j<sequence_length)
00355                         value|=obs[j];
00356                 }
00357                 obs[i]=value;
00358             }
00359 
00360             // TODO we should get rid of this loop!
00361             if (start>0)
00362             {
00363                 for (i=start; i<sequence_length; i++)
00364                     obs[i-start]=obs[i];
00365             }
00366         }
00367 
00377         template <class ST>
00378         static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00379         {
00380             ASSERT(gap>=0);
00381 
00382             const int32_t start_gap=(p_order-gap)/2;
00383             const int32_t end_gap=start_gap+gap;
00384 
00385             int32_t i,j;
00386             ST value=0;
00387 
00388             // almost all positions
00389             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
00390             {
00391                 value=0;
00392                 for (j=i; j>=i-p_order+1; j--)
00393                 {
00394                     if (i-j<start_gap)
00395                     {
00396                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00397                     }
00398                     else if (i-j>=end_gap)
00399                     {
00400                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00401                     }
00402                 }
00403                 obs[i]= (ST) value;
00404             }
00405 
00406             // the remaining `order` positions
00407             for (i=p_order-2;i>=0;i--)
00408             {
00409                 if (i>=sequence_length)
00410                     continue;
00411 
00412                 value=0;
00413                 for (j=i; j>=i-p_order+1; j--)
00414                 {
00415                     if (i-j<start_gap)
00416                     {
00417                         value= (value >> max_val);
00418                         if (j>=0 && j<sequence_length)
00419                             value|=obs[j] << (max_val * (p_order-1-gap));
00420                     }
00421                     else if (i-j>=end_gap)
00422                     {
00423                         value= (value >> max_val);
00424                         if (j>=0 && j<sequence_length)
00425                             value|=obs[j] << (max_val * (p_order-1-gap));
00426                     }
00427                 }
00428                 obs[i]=value;
00429             }
00430 
00431             // TODO we should get rid of this loop!
00432             if (start>0)
00433             {
00434                 for (i=start; i<sequence_length; i++)
00435                     obs[i-start]=obs[i];
00436             }
00437         }
00438 
00448         template <class ST>
00449         static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00450         {
00451             ASSERT(gap>=0);
00452 
00453             const int32_t start_gap=(p_order-gap)/2;
00454             const int32_t end_gap=start_gap+gap;
00455 
00456             int32_t i,j;
00457             ST value=0;
00458 
00459             // almost all positions
00460             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
00461             {
00462                 value=0;
00463                 for (j=i; j>=i-p_order+1; j--)
00464                 {
00465                     if (i-j<start_gap)
00466                         value= (value << max_val) | obs[j];
00467                     else if (i-j>=end_gap)
00468                         value= (value << max_val) | obs[j];
00469                 }
00470                 obs[i]= (ST) value;
00471             }
00472 
00473             // the remaining `order` positions
00474             for (i=p_order-2;i>=0;i--)
00475             {
00476                 if (i>=sequence_length)
00477                     continue;
00478 
00479                 value=0;
00480                 for (j=i; j>=i-p_order+1; j--)
00481                 {
00482                     if (i-j<start_gap)
00483                     {
00484                         value= value << max_val;
00485                         if (j>=0 && j<sequence_length)
00486                             value|=obs[j];
00487                     }
00488                     else if (i-j>=end_gap)
00489                     {
00490                         value= value << max_val;
00491                         if (j>=0 && j<sequence_length)
00492                             value|=obs[j];
00493                     }
00494                 }
00495                 obs[i]=value;
00496             }
00497 
00498             // TODO we should get rid of this loop!
00499             if (start>0)
00500             {
00501                 for (i=start; i<sequence_length; i++)
00502                     obs[i-start]=obs[i];
00503             }
00504         }
00505 
00506     private:
00509         void init();
00510 
00511     protected:
00513         void init_map_table();
00514 
00519         void copy_histogram(CAlphabet* src);
00520 
00521     public:
00523         static const uint8_t B_A;
00525         static const uint8_t B_C;
00527         static const uint8_t B_G;
00529         static const uint8_t B_T;
00531         static const uint8_t B_0;
00533         static const uint8_t MAPTABLE_UNDEF;
00535         static const char* alphabet_names[18];
00536 
00537     protected:
00546         virtual void load_serializable_post(void) throw (ShogunException);
00547 
00548     protected:
00550         EAlphabet alphabet;
00552         int32_t num_symbols;
00554         int32_t num_bits;
00556         bool valid_chars[1 << (sizeof(uint8_t)*8)];
00558         uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00560         uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00562         int64_t histogram[1 << (sizeof(uint8_t)*8)];
00563 };
00564 
00565 
00566 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00567 template<> inline void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00568 {
00569 }
00570 
00571 template<> inline void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00572 {
00573 }
00574 
00575 template<> inline void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00576 {
00577 }
00578 
00579 template<> inline void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00580 {
00581 }
00582 
00583 template<> inline void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00584 {
00585 }
00586 
00587 template<> inline void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00588 {
00589 }
00590 #endif
00591 
00592 }
00593 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation