Alphabet.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Soeren Sonnenburg
00008  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013 
00014 #include "base/SGObject.h"
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017 
00018 namespace shogun
00019 {
00021 enum EAlphabet
00022 {
00024     DNA=0,
00025 
00027     RAWDNA=1,
00028 
00030     RNA=2,
00031 
00033     PROTEIN=3,
00034 
00035     // BINARY just 0 and 1
00036     BINARY=4,
00037 
00039     ALPHANUM=5,
00040 
00042     CUBE=6,
00043 
00045     RAWBYTE=7,
00046 
00048     IUPAC_NUCLEIC_ACID=8,
00049 
00051     IUPAC_AMINO_ACID=9,
00052 
00054     NONE=10,
00055 
00057     DIGIT=11,
00058 
00060     DIGIT2=12,
00061 
00063     RAWDIGIT=13,
00064 
00066     RAWDIGIT2=14,
00067 
00069     UNKNOWN=15,
00070 
00072     SNP=16,
00073 
00075     RAWSNP=17
00076 };
00077 
00078 
00089 class CAlphabet : public CSGObject
00090 {
00091     public:
00092 
00096         CAlphabet();
00097 
00103         CAlphabet(char* alpha, int32_t len);
00104 
00109         CAlphabet(EAlphabet alpha);
00110 
00115         CAlphabet(CAlphabet* alpha);
00116         virtual ~CAlphabet();
00117 
00122         bool set_alphabet(EAlphabet alpha);
00123 
00128         inline EAlphabet get_alphabet() const
00129         {
00130             return alphabet;
00131         }
00132 
00137         inline int32_t get_num_symbols() const
00138         {
00139             return num_symbols;
00140         }
00141 
00147         inline int32_t get_num_bits() const
00148         {
00149             return num_bits;
00150         }
00151 
00157         inline uint8_t remap_to_bin(uint8_t c)
00158         {
00159             return maptable_to_bin[c];
00160         }
00161 
00167         inline uint8_t remap_to_char(uint8_t c)
00168         {
00169             return maptable_to_char[c];
00170         }
00171 
00173         void clear_histogram();
00174 
00180         template <class T>
00181         void add_string_to_histogram(T* p, int64_t len)
00182         {
00183             for (int64_t i=0; i<len; i++)
00184                 add_byte_to_histogram((uint8_t) (p[i]));
00185         }
00186 
00191         inline void add_byte_to_histogram(uint8_t p)
00192         {
00193             histogram[p]++;
00194         }
00195 
00197         void print_histogram();
00198 
00204         inline void get_hist(int64_t** h, int32_t* len)
00205         {
00206             int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00207             ASSERT(h && len);
00208             *h=(int64_t*) malloc(sizeof(int64_t)*hist_size);
00209             ASSERT(*h);
00210             *len=hist_size;
00211             ASSERT(*len);
00212             memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00213         }
00214 
00216         inline const int64_t* get_histogram()
00217         {
00218             return &histogram[0];
00219         }
00220 
00227         bool check_alphabet(bool print_error=true);
00228 
00235         inline bool is_valid(uint8_t c)
00236         {
00237             return valid_chars[c];
00238         }
00239 
00245         bool check_alphabet_size(bool print_error=true);
00246 
00251         int32_t get_num_symbols_in_histogram();
00252 
00257         int32_t get_max_value_in_histogram();
00258 
00265         int32_t get_num_bits_in_histogram();
00266 
00271         static const char* get_alphabet_name(EAlphabet alphabet);
00272 
00273 
00275         inline virtual const char* get_name() const { return "Alphabet"; }
00276 
00285         template <class ST>
00286         static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00287         {
00288             int32_t i,j;
00289             ST value=0;
00290 
00291             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
00292             {
00293                 value=0;
00294                 for (j=i; j>=i-p_order+1; j--)
00295                     value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00296 
00297                 obs[i]= (ST) value;
00298             }
00299 
00300             for (i=p_order-2;i>=0;i--)
00301             {
00302                 if (i>=sequence_length)
00303                     continue;
00304 
00305                 value=0;
00306                 for (j=i; j>=i-p_order+1; j--)
00307                 {
00308                     value= (value >> max_val);
00309                     if (j>=0 && j<sequence_length)
00310                         value|=obs[j] << (max_val * (p_order-1));
00311                 }
00312                 obs[i]=value;
00313             }
00314 
00315             // TODO we should get rid of this loop!
00316             if (start>0)
00317             {
00318                 for (i=start; i<sequence_length; i++)
00319                     obs[i-start]=obs[i];
00320             }
00321         }
00322 
00331         template <class ST>
00332         static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00333         {
00334             int32_t i,j;
00335             ST value=0;
00336 
00337             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
00338             {
00339                 value=0;
00340                 for (j=i; j>=i-p_order+1; j--)
00341                     value= (value << max_val) | obs[j];
00342 
00343                 obs[i]= (ST) value;
00344             }
00345 
00346             for (i=p_order-2;i>=0;i--)
00347             {
00348                 if (i>=sequence_length)
00349                     continue;
00350 
00351                 value=0;
00352                 for (j=i; j>=i-p_order+1; j--)
00353                 {
00354                     value= (value << max_val);
00355                     if (j>=0 && j<sequence_length)
00356                         value|=obs[j];
00357                 }
00358                 obs[i]=value;
00359             }
00360 
00361             // TODO we should get rid of this loop!
00362             if (start>0)
00363             {
00364                 for (i=start; i<sequence_length; i++)
00365                     obs[i-start]=obs[i];
00366             }
00367         }
00368 
00378         template <class ST>
00379         static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00380         {
00381             ASSERT(gap>=0);
00382 
00383             const int32_t start_gap=(p_order-gap)/2;
00384             const int32_t end_gap=start_gap+gap;
00385 
00386             int32_t i,j;
00387             ST value=0;
00388 
00389             // almost all positions
00390             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
00391             {
00392                 value=0;
00393                 for (j=i; j>=i-p_order+1; j--)
00394                 {
00395                     if (i-j<start_gap)
00396                     {
00397                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00398                     }
00399                     else if (i-j>=end_gap)
00400                     {
00401                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00402                     }
00403                 }
00404                 obs[i]= (ST) value;
00405             }
00406 
00407             // the remaining `order` positions
00408             for (i=p_order-2;i>=0;i--)
00409             {
00410                 if (i>=sequence_length)
00411                     continue;
00412 
00413                 value=0;
00414                 for (j=i; j>=i-p_order+1; j--)
00415                 {
00416                     if (i-j<start_gap)
00417                     {
00418                         value= (value >> max_val);
00419                         if (j>=0 && j<sequence_length)
00420                             value|=obs[j] << (max_val * (p_order-1-gap));
00421                     }
00422                     else if (i-j>=end_gap)
00423                     {
00424                         value= (value >> max_val);
00425                         if (j>=0 && j<sequence_length)
00426                             value|=obs[j] << (max_val * (p_order-1-gap));
00427                     }
00428                 }
00429                 obs[i]=value;
00430             }
00431 
00432             // TODO we should get rid of this loop!
00433             if (start>0)
00434             {
00435                 for (i=start; i<sequence_length; i++)
00436                     obs[i-start]=obs[i];
00437             }
00438         }
00439 
00449         template <class ST>
00450         static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00451         {
00452             ASSERT(gap>=0);
00453 
00454             const int32_t start_gap=(p_order-gap)/2;
00455             const int32_t end_gap=start_gap+gap;
00456 
00457             int32_t i,j;
00458             ST value=0;
00459 
00460             // almost all positions
00461             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
00462             {
00463                 value=0;
00464                 for (j=i; j>=i-p_order+1; j--)
00465                 {
00466                     if (i-j<start_gap)
00467                         value= (value << max_val) | obs[j];
00468                     else if (i-j>=end_gap)
00469                         value= (value << max_val) | obs[j];
00470                 }
00471                 obs[i]= (ST) value;
00472             }
00473 
00474             // the remaining `order` positions
00475             for (i=p_order-2;i>=0;i--)
00476             {
00477                 if (i>=sequence_length)
00478                     continue;
00479 
00480                 value=0;
00481                 for (j=i; j>=i-p_order+1; j--)
00482                 {
00483                     if (i-j<start_gap)
00484                     {
00485                         value= value << max_val;
00486                         if (j>=0 && j<sequence_length)
00487                             value|=obs[j];
00488                     }
00489                     else if (i-j>=end_gap)
00490                     {
00491                         value= value << max_val;
00492                         if (j>=0 && j<sequence_length)
00493                             value|=obs[j];
00494                     }
00495                 }
00496                 obs[i]=value;
00497             }
00498 
00499             // TODO we should get rid of this loop!
00500             if (start>0)
00501             {
00502                 for (i=start; i<sequence_length; i++)
00503                     obs[i-start]=obs[i];
00504             }
00505         }
00506 
00507     private:
00510         void init();
00511 
00512     protected:
00514         void init_map_table();
00515 
00520         void copy_histogram(CAlphabet* src);
00521 
00522     public:
00524         static const uint8_t B_A;
00526         static const uint8_t B_C;
00528         static const uint8_t B_G;
00530         static const uint8_t B_T;
00532         static const uint8_t B_0;
00534         static const uint8_t MAPTABLE_UNDEF;
00536         static const char* alphabet_names[18];
00537 
00538     protected:
00547         virtual void load_serializable_post(void) throw (ShogunException);
00548 
00549     protected:
00551         EAlphabet alphabet;
00553         int32_t num_symbols;
00555         int32_t num_bits;
00557         bool valid_chars[1 << (sizeof(uint8_t)*8)];
00559         uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00561         uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00563         int64_t histogram[1 << (sizeof(uint8_t)*8)];
00564 };
00565 
00566 
00567 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00568 template<> inline void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00569 {
00570 }
00571 
00572 template<> inline void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00573 {
00574 }
00575 
00576 template<> inline void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00577 {
00578 }
00579 
00580 template<> inline void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00581 {
00582 }
00583 
00584 template<> inline void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00585 {
00586 }
00587 
00588 template<> inline void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00589 {
00590 }
00591 #endif
00592 
00593 }
00594 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation