Alphabet.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Soeren Sonnenburg
00008  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <string.h>
00012 #include <math.h>
00013 
00014 #include <shogun/features/Alphabet.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/base/Parameter.h>
00017 
00018 using namespace shogun;
00019 
00020 //define numbers for the bases
00021 const uint8_t CAlphabet::B_A=0;
00022 const uint8_t CAlphabet::B_C=1;
00023 const uint8_t CAlphabet::B_G=2;
00024 const uint8_t CAlphabet::B_T=3;
00025 const uint8_t CAlphabet::B_0=4;
00026 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff;
00027 const char* CAlphabet::alphabet_names[18]={
00028     "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM",
00029     "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID",
00030     "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN",
00031     "SNP", "RAWSNP"};
00032 
00033 
00034 CAlphabet::CAlphabet()
00035   : CSGObject()
00036 {
00037     init();
00038 }
00039 
00040 CAlphabet::CAlphabet(char* al, int32_t len)
00041 : CSGObject()
00042 {
00043     init();
00044 
00045     EAlphabet alpha=NONE;
00046 
00047     if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA")))
00048         alpha = DNA;
00049     else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA")))
00050         alpha = RAWDNA;
00051     else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA")))
00052         alpha = RNA;
00053     else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN")))
00054         alpha = PROTEIN;
00055     else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY")))
00056         alpha = BINARY;
00057     else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM")))
00058         alpha = ALPHANUM;
00059     else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE")))
00060         alpha = CUBE;
00061     else if (len>=(int32_t) strlen("DIGIT2") && !strncmp(al, "DIGIT2", strlen("DIGIT2")))
00062         alpha = DIGIT2;
00063     else if (len>=(int32_t) strlen("DIGIT") && !strncmp(al, "DIGIT", strlen("DIGIT")))
00064         alpha = DIGIT;
00065     else if (len>=(int32_t) strlen("RAWDIGIT2") && !strncmp(al, "RAWDIGIT2", strlen("RAWDIGIT2")))
00066         alpha = RAWDIGIT2;
00067     else if (len>=(int32_t) strlen("RAWDIGIT") && !strncmp(al, "RAWDIGIT", strlen("RAWDIGIT")))
00068         alpha = RAWDIGIT;
00069     else if (len>=(int32_t) strlen("SNP") && !strncmp(al, "SNP", strlen("SNP")))
00070         alpha = SNP;
00071     else if (len>=(int32_t) strlen("RAWSNP") && !strncmp(al, "RAWSNP", strlen("RAWSNP")))
00072         alpha = RAWSNP;
00073     else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) ||
00074             (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW"))))
00075         alpha = RAWBYTE;
00076     else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID")))
00077         alpha = IUPAC_NUCLEIC_ACID;
00078     else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID")))
00079         alpha = IUPAC_AMINO_ACID;
00080     else {
00081       SG_ERROR( "unknown alphabet %s\n", al);
00082    }
00083 
00084     set_alphabet(alpha);
00085 }
00086 
00087 CAlphabet::CAlphabet(EAlphabet alpha)
00088 : CSGObject()
00089 {
00090     init();
00091     set_alphabet(alpha);
00092 }
00093 
00094 CAlphabet::CAlphabet(CAlphabet* a)
00095 : CSGObject()
00096 {
00097     init();
00098     ASSERT(a);
00099     set_alphabet(a->get_alphabet());
00100     copy_histogram(a);
00101 }
00102 
00103 CAlphabet::~CAlphabet()
00104 {
00105 }
00106 
00107 bool CAlphabet::set_alphabet(EAlphabet alpha)
00108 {
00109     bool result=true;
00110     alphabet=alpha;
00111 
00112     switch (alphabet)
00113     {
00114         case DNA:
00115         case RAWDNA:
00116             num_symbols = 4;
00117             break;
00118         case RNA:
00119             num_symbols = 4;
00120             break;
00121         case PROTEIN:
00122             num_symbols = 26;
00123             break;
00124         case BINARY:
00125             num_symbols = 2;
00126             break;
00127         case ALPHANUM:
00128             num_symbols = 36;
00129             break;
00130         case CUBE:
00131             num_symbols = 6;
00132             break;
00133         case RAWBYTE:
00134             num_symbols = 256;
00135             break;
00136         case IUPAC_NUCLEIC_ACID:
00137             num_symbols = 16;
00138             break;
00139         case IUPAC_AMINO_ACID:
00140             num_symbols = 23;
00141             break;
00142         case NONE:
00143             num_symbols = 0;
00144             break;
00145         case DIGIT2:
00146             num_symbols = 3;
00147             break;
00148         case DIGIT:
00149             num_symbols = 10;
00150             break;
00151         case RAWDIGIT2:
00152             num_symbols = 3;
00153             break;
00154         case RAWDIGIT:
00155             num_symbols = 10;
00156             break;
00157         case SNP:
00158             num_symbols = 5;
00159             break;
00160         case RAWSNP:
00161             num_symbols = 5;
00162             break;
00163         default:
00164             num_symbols = 0;
00165             result=false;
00166             break;
00167     }
00168 
00169     num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2));
00170     init_map_table();
00171     clear_histogram();
00172 
00173     SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet));
00174 
00175     return result;
00176 }
00177 
00178 void CAlphabet::init_map_table()
00179 {
00180     for (int32_t i=0; i<(1<<(8*sizeof(uint8_t))); i++)
00181     {
00182         maptable_to_bin[i] = MAPTABLE_UNDEF;
00183         maptable_to_char[i] = MAPTABLE_UNDEF;
00184         valid_chars[i] = false;
00185     }
00186 
00187     switch (alphabet)
00188     {
00189         case RAWDIGIT:
00190             for (uint8_t i=0; i<=9; i++)
00191             {
00192                 valid_chars[i]=true;
00193                 maptable_to_bin[i]=i;
00194                 maptable_to_char[i]=i;
00195             }
00196             break;
00197 
00198         case RAWDIGIT2:
00199             for (uint8_t i=0; i<=2; i++)
00200             {
00201                 valid_chars[i]=true;
00202                 maptable_to_bin[i]=i;
00203                 maptable_to_char[i]=i;
00204             }
00205             break;
00206 
00207         case DIGIT:
00208             valid_chars[(uint8_t) '0']=true;
00209             valid_chars[(uint8_t) '1']=true;
00210             valid_chars[(uint8_t) '2']=true;
00211             valid_chars[(uint8_t) '3']=true;
00212             valid_chars[(uint8_t) '4']=true;
00213             valid_chars[(uint8_t) '5']=true;
00214             valid_chars[(uint8_t) '6']=true;
00215             valid_chars[(uint8_t) '7']=true;
00216             valid_chars[(uint8_t) '8']=true;
00217             valid_chars[(uint8_t) '9']=true;    //Translation '0-9' -> 0-9
00218 
00219             maptable_to_bin[(uint8_t) '0']=0;
00220             maptable_to_bin[(uint8_t) '1']=1;
00221             maptable_to_bin[(uint8_t) '2']=2;
00222             maptable_to_bin[(uint8_t) '3']=3;
00223             maptable_to_bin[(uint8_t) '4']=4;
00224             maptable_to_bin[(uint8_t) '5']=5;
00225             maptable_to_bin[(uint8_t) '6']=6;
00226             maptable_to_bin[(uint8_t) '7']=7;
00227             maptable_to_bin[(uint8_t) '8']=8;
00228             maptable_to_bin[(uint8_t) '9']=9;   //Translation '0-9' -> 0-9
00229 
00230             maptable_to_char[(uint8_t) 0]='0';
00231             maptable_to_char[(uint8_t) 1]='1';
00232             maptable_to_char[(uint8_t) 2]='2';
00233             maptable_to_char[(uint8_t) 3]='3';
00234             maptable_to_char[(uint8_t) 4]='4';
00235             maptable_to_char[(uint8_t) 5]='5';
00236             maptable_to_char[(uint8_t) 6]='6';
00237             maptable_to_char[(uint8_t) 7]='7';
00238             maptable_to_char[(uint8_t) 8]='8';
00239             maptable_to_char[(uint8_t) 9]='9';  //Translation 0-9 -> '0-9'
00240             break;
00241 
00242         case DIGIT2:
00243             valid_chars[(uint8_t) '0']=true;
00244             valid_chars[(uint8_t) '1']=true;
00245             valid_chars[(uint8_t) '2']=true; //Translation '0-2' -> 0-2
00246 
00247             maptable_to_bin[(uint8_t) '0']=0;
00248             maptable_to_bin[(uint8_t) '1']=1;
00249             maptable_to_bin[(uint8_t) '2']=2;   //Translation '0-2' -> 0-2
00250 
00251             maptable_to_char[(uint8_t) 0]='0';
00252             maptable_to_char[(uint8_t) 1]='1';
00253             maptable_to_char[(uint8_t) 2]='2'; //Translation 0-2 -> '0-2'
00254             break;
00255 
00256         case CUBE:
00257             valid_chars[(uint8_t) '1']=true;
00258             valid_chars[(uint8_t) '2']=true;
00259             valid_chars[(uint8_t) '3']=true;
00260             valid_chars[(uint8_t) '4']=true;
00261             valid_chars[(uint8_t) '5']=true;
00262             valid_chars[(uint8_t) '6']=true;    //Translation '123456' -> 012345
00263 
00264             maptable_to_bin[(uint8_t) '1']=0;
00265             maptable_to_bin[(uint8_t) '2']=1;
00266             maptable_to_bin[(uint8_t) '3']=2;
00267             maptable_to_bin[(uint8_t) '4']=3;
00268             maptable_to_bin[(uint8_t) '5']=4;
00269             maptable_to_bin[(uint8_t) '6']=5;   //Translation '123456' -> 012345
00270 
00271             maptable_to_char[(uint8_t) 0]='1';
00272             maptable_to_char[(uint8_t) 1]='2';
00273             maptable_to_char[(uint8_t) 2]='3';
00274             maptable_to_char[(uint8_t) 3]='4';
00275             maptable_to_char[(uint8_t) 4]='5';
00276             maptable_to_char[(uint8_t) 5]='6';  //Translation 012345->'123456'
00277             break;
00278 
00279         case PROTEIN:
00280             {
00281                 int32_t skip=0 ;
00282                 for (int32_t i=0; i<21; i++)
00283                 {
00284                     if (i==1) skip++ ;
00285                     if (i==8) skip++ ;
00286                     if (i==12) skip++ ;
00287                     if (i==17) skip++ ;
00288                     valid_chars['A'+i+skip]=true;
00289                     maptable_to_bin['A'+i+skip]=i ;
00290                     maptable_to_char[i]='A'+i+skip ;
00291                 } ;                   //Translation 012345->acde...xy -- the protein code
00292             } ;
00293             break;
00294 
00295         case BINARY:
00296             valid_chars[(uint8_t) '0']=true;
00297             valid_chars[(uint8_t) '1']=true;
00298 
00299             maptable_to_bin[(uint8_t) '0']=0;
00300             maptable_to_bin[(uint8_t) '1']=1;
00301 
00302             maptable_to_char[0]=(uint8_t) '0';
00303             maptable_to_char[1]=(uint8_t) '1';
00304             break;
00305 
00306         case ALPHANUM:
00307             {
00308                 for (int32_t i=0; i<26; i++)
00309                 {
00310                     valid_chars['A'+i]=true;
00311                     maptable_to_bin['A'+i]=i ;
00312                     maptable_to_char[i]='A'+i ;
00313                 } ;
00314                 for (int32_t i=0; i<10; i++)
00315                 {
00316                     valid_chars['0'+i]=true;
00317                     maptable_to_bin['0'+i]=26+i ;
00318                     maptable_to_char[26+i]='0'+i ;
00319                 } ;        //Translation 012345->acde...xy0123456789
00320             } ;
00321             break;
00322 
00323         case RAWBYTE:
00324             {
00325                 //identity
00326                 for (int32_t i=0; i<256; i++)
00327                 {
00328                     valid_chars[i]=true;
00329                     maptable_to_bin[i]=i;
00330                     maptable_to_char[i]=i;
00331                 }
00332             }
00333             break;
00334 
00335         case DNA:
00336             valid_chars[(uint8_t) 'A']=true;
00337             valid_chars[(uint8_t) 'C']=true;
00338             valid_chars[(uint8_t) 'G']=true;
00339             valid_chars[(uint8_t) 'T']=true;
00340 
00341             maptable_to_bin[(uint8_t) 'A']=B_A;
00342             maptable_to_bin[(uint8_t) 'C']=B_C;
00343             maptable_to_bin[(uint8_t) 'G']=B_G;
00344             maptable_to_bin[(uint8_t) 'T']=B_T;
00345 
00346             maptable_to_char[B_A]='A';
00347             maptable_to_char[B_C]='C';
00348             maptable_to_char[B_G]='G';
00349             maptable_to_char[B_T]='T';
00350             break;
00351         case RAWDNA:
00352             {
00353                 //identity
00354                 for (int32_t i=0; i<4; i++)
00355                 {
00356                     valid_chars[i]=true;
00357                     maptable_to_bin[i]=i;
00358                     maptable_to_char[i]=i;
00359                 }
00360             }
00361             break;
00362 
00363         case SNP:
00364             valid_chars[(uint8_t) 'A']=true;
00365             valid_chars[(uint8_t) 'C']=true;
00366             valid_chars[(uint8_t) 'G']=true;
00367             valid_chars[(uint8_t) 'T']=true;
00368             valid_chars[(uint8_t) '0']=true;
00369 
00370             maptable_to_bin[(uint8_t) 'A']=B_A;
00371             maptable_to_bin[(uint8_t) 'C']=B_C;
00372             maptable_to_bin[(uint8_t) 'G']=B_G;
00373             maptable_to_bin[(uint8_t) 'T']=B_T;
00374             maptable_to_bin[(uint8_t) '0']=B_0;
00375 
00376             maptable_to_char[B_A]='A';
00377             maptable_to_char[B_C]='C';
00378             maptable_to_char[B_G]='G';
00379             maptable_to_char[B_T]='T';
00380             maptable_to_char[B_0]='0';
00381             break;
00382         case RAWSNP:
00383             {
00384                 //identity
00385                 for (int32_t i=0; i<5; i++)
00386                 {
00387                     valid_chars[i]=true;
00388                     maptable_to_bin[i]=i;
00389                     maptable_to_char[i]=i;
00390                 }
00391             }
00392             break;
00393 
00394         case RNA:
00395             valid_chars[(uint8_t) 'A']=true;
00396             valid_chars[(uint8_t) 'C']=true;
00397             valid_chars[(uint8_t) 'G']=true;
00398             valid_chars[(uint8_t) 'U']=true;
00399 
00400             maptable_to_bin[(uint8_t) 'A']=B_A;
00401             maptable_to_bin[(uint8_t) 'C']=B_C;
00402             maptable_to_bin[(uint8_t) 'G']=B_G;
00403             maptable_to_bin[(uint8_t) 'U']=B_T;
00404 
00405             maptable_to_char[B_A]='A';
00406             maptable_to_char[B_C]='C';
00407             maptable_to_char[B_G]='G';
00408             maptable_to_char[B_T]='U';
00409             break;
00410 
00411         case IUPAC_NUCLEIC_ACID:
00412             valid_chars[(uint8_t) 'A']=true; // A   Adenine
00413             valid_chars[(uint8_t) 'C']=true; // C   Cytosine
00414             valid_chars[(uint8_t) 'G']=true; // G   Guanine
00415             valid_chars[(uint8_t) 'T']=true; // T   Thymine
00416             valid_chars[(uint8_t) 'U']=true; // U   Uracil
00417             valid_chars[(uint8_t) 'R']=true; // R   Purine (A or G)
00418             valid_chars[(uint8_t) 'Y']=true; // Y   Pyrimidine (C, T, or U)
00419             valid_chars[(uint8_t) 'M']=true; // M   C or A
00420             valid_chars[(uint8_t) 'K']=true; // K   T, U, or G
00421             valid_chars[(uint8_t) 'W']=true; // W   T, U, or A
00422             valid_chars[(uint8_t) 'S']=true; // S   C or G
00423             valid_chars[(uint8_t) 'B']=true; // B   C, T, U, or G (not A)
00424             valid_chars[(uint8_t) 'D']=true; // D   A, T, U, or G (not C)
00425             valid_chars[(uint8_t) 'H']=true; // H   A, T, U, or C (not G)
00426             valid_chars[(uint8_t) 'V']=true; // V   A, C, or G (not T, not U)
00427             valid_chars[(uint8_t) 'N']=true; // N   Any base (A, C, G, T, or U)
00428 
00429             maptable_to_bin[(uint8_t) 'A']=0; // A  Adenine
00430             maptable_to_bin[(uint8_t) 'C']=1; // C  Cytosine
00431             maptable_to_bin[(uint8_t) 'G']=2; // G  Guanine
00432             maptable_to_bin[(uint8_t) 'T']=3; // T  Thymine
00433             maptable_to_bin[(uint8_t) 'U']=4; // U  Uracil
00434             maptable_to_bin[(uint8_t) 'R']=5; // R  Purine (A or G)
00435             maptable_to_bin[(uint8_t) 'Y']=6; // Y  Pyrimidine (C, T, or U)
00436             maptable_to_bin[(uint8_t) 'M']=7; // M  C or A
00437             maptable_to_bin[(uint8_t) 'K']=8; // K  T, U, or G
00438             maptable_to_bin[(uint8_t) 'W']=9; // W  T, U, or A
00439             maptable_to_bin[(uint8_t) 'S']=10; // S C or G
00440             maptable_to_bin[(uint8_t) 'B']=11; // B C, T, U, or G (not A)
00441             maptable_to_bin[(uint8_t) 'D']=12; // D A, T, U, or G (not C)
00442             maptable_to_bin[(uint8_t) 'H']=13; // H A, T, U, or C (not G)
00443             maptable_to_bin[(uint8_t) 'V']=14; // V A, C, or G (not T, not U)
00444             maptable_to_bin[(uint8_t) 'N']=15; // N Any base (A, C, G, T, or U)
00445 
00446             maptable_to_char[0]=(uint8_t) 'A'; // A Adenine
00447             maptable_to_char[1]=(uint8_t) 'C'; // C Cytosine
00448             maptable_to_char[2]=(uint8_t) 'G'; // G Guanine
00449             maptable_to_char[3]=(uint8_t) 'T'; // T Thymine
00450             maptable_to_char[4]=(uint8_t) 'U'; // U Uracil
00451             maptable_to_char[5]=(uint8_t) 'R'; // R Purine (A or G)
00452             maptable_to_char[6]=(uint8_t) 'Y'; // Y Pyrimidine (C, T, or U)
00453             maptable_to_char[7]=(uint8_t) 'M'; // M C or A
00454             maptable_to_char[8]=(uint8_t) 'K'; // K T, U, or G
00455             maptable_to_char[9]=(uint8_t) 'W'; // W T, U, or A
00456             maptable_to_char[10]=(uint8_t) 'S'; // S    C or G
00457             maptable_to_char[11]=(uint8_t) 'B'; // B    C, T, U, or G (not A)
00458             maptable_to_char[12]=(uint8_t) 'D'; // D    A, T, U, or G (not C)
00459             maptable_to_char[13]=(uint8_t) 'H'; // H    A, T, U, or C (not G)
00460             maptable_to_char[14]=(uint8_t) 'V'; // V    A, C, or G (not T, not U)
00461             maptable_to_char[15]=(uint8_t) 'N'; // N    Any base (A, C, G, T, or U)
00462             break;
00463 
00464         case IUPAC_AMINO_ACID:
00465             valid_chars[(uint8_t) 'A']=true; //A    Ala Alanine
00466             valid_chars[(uint8_t) 'R']=true; //R    Arg Arginine
00467             valid_chars[(uint8_t) 'N']=true; //N    Asn Asparagine
00468             valid_chars[(uint8_t) 'D']=true; //D    Asp Aspartic acid
00469             valid_chars[(uint8_t) 'C']=true; //C    Cys Cysteine
00470             valid_chars[(uint8_t) 'Q']=true; //Q    Gln Glutamine
00471             valid_chars[(uint8_t) 'E']=true; //E    Glu Glutamic acid
00472             valid_chars[(uint8_t) 'G']=true; //G    Gly Glycine
00473             valid_chars[(uint8_t) 'H']=true; //H    His Histidine
00474             valid_chars[(uint8_t) 'I']=true; //I    Ile Isoleucine
00475             valid_chars[(uint8_t) 'L']=true; //L    Leu Leucine
00476             valid_chars[(uint8_t) 'K']=true; //K    Lys Lysine
00477             valid_chars[(uint8_t) 'M']=true; //M    Met Methionine
00478             valid_chars[(uint8_t) 'F']=true; //F    Phe Phenylalanine
00479             valid_chars[(uint8_t) 'P']=true; //P    Pro Proline
00480             valid_chars[(uint8_t) 'S']=true; //S    Ser Serine
00481             valid_chars[(uint8_t) 'T']=true; //T    Thr Threonine
00482             valid_chars[(uint8_t) 'W']=true; //W    Trp Tryptophan
00483             valid_chars[(uint8_t) 'Y']=true; //Y    Tyr Tyrosine
00484             valid_chars[(uint8_t) 'V']=true; //V    Val Valine
00485             valid_chars[(uint8_t) 'B']=true; //B    Asx Aspartic acid or Asparagine
00486             valid_chars[(uint8_t) 'Z']=true; //Z    Glx Glutamine or Glutamic acid
00487             valid_chars[(uint8_t) 'X']=true; //X    Xaa Any amino acid
00488 
00489             maptable_to_bin[(uint8_t) 'A']=0;  //A  Ala Alanine
00490             maptable_to_bin[(uint8_t) 'R']=1;  //R  Arg Arginine
00491             maptable_to_bin[(uint8_t) 'N']=2;  //N  Asn Asparagine
00492             maptable_to_bin[(uint8_t) 'D']=3;  //D  Asp Aspartic acid
00493             maptable_to_bin[(uint8_t) 'C']=4;  //C  Cys Cysteine
00494             maptable_to_bin[(uint8_t) 'Q']=5;  //Q  Gln Glutamine
00495             maptable_to_bin[(uint8_t) 'E']=6;  //E  Glu Glutamic acid
00496             maptable_to_bin[(uint8_t) 'G']=7;  //G  Gly Glycine
00497             maptable_to_bin[(uint8_t) 'H']=8;  //H  His Histidine
00498             maptable_to_bin[(uint8_t) 'I']=9;  //I  Ile Isoleucine
00499             maptable_to_bin[(uint8_t) 'L']=10; //L  Leu Leucine
00500             maptable_to_bin[(uint8_t) 'K']=11; //K  Lys Lysine
00501             maptable_to_bin[(uint8_t) 'M']=12; //M  Met Methionine
00502             maptable_to_bin[(uint8_t) 'F']=13; //F  Phe Phenylalanine
00503             maptable_to_bin[(uint8_t) 'P']=14; //P  Pro Proline
00504             maptable_to_bin[(uint8_t) 'S']=15; //S  Ser Serine
00505             maptable_to_bin[(uint8_t) 'T']=16; //T  Thr Threonine
00506             maptable_to_bin[(uint8_t) 'W']=17; //W  Trp Tryptophan
00507             maptable_to_bin[(uint8_t) 'Y']=18; //Y  Tyr Tyrosine
00508             maptable_to_bin[(uint8_t) 'V']=19; //V  Val Valine
00509             maptable_to_bin[(uint8_t) 'B']=20; //B  Asx Aspartic acid or Asparagine
00510             maptable_to_bin[(uint8_t) 'Z']=21; //Z  Glx Glutamine or Glutamic acid
00511             maptable_to_bin[(uint8_t) 'X']=22; //X  Xaa Any amino acid
00512 
00513             maptable_to_char[0]=(uint8_t) 'A';  //A Ala Alanine
00514             maptable_to_char[1]=(uint8_t) 'R';  //R Arg Arginine
00515             maptable_to_char[2]=(uint8_t) 'N';  //N Asn Asparagine
00516             maptable_to_char[3]=(uint8_t) 'D';  //D Asp Aspartic acid
00517             maptable_to_char[4]=(uint8_t) 'C';  //C Cys Cysteine
00518             maptable_to_char[5]=(uint8_t) 'Q';  //Q Gln Glutamine
00519             maptable_to_char[6]=(uint8_t) 'E';  //E Glu Glutamic acid
00520             maptable_to_char[7]=(uint8_t) 'G';  //G Gly Glycine
00521             maptable_to_char[8]=(uint8_t) 'H';  //H His Histidine
00522             maptable_to_char[9]=(uint8_t) 'I';  //I Ile Isoleucine
00523             maptable_to_char[10]=(uint8_t) 'L'; //L Leu Leucine
00524             maptable_to_char[11]=(uint8_t) 'K'; //K Lys Lysine
00525             maptable_to_char[12]=(uint8_t) 'M'; //M Met Methionine
00526             maptable_to_char[13]=(uint8_t) 'F'; //F Phe Phenylalanine
00527             maptable_to_char[14]=(uint8_t) 'P'; //P Pro Proline
00528             maptable_to_char[15]=(uint8_t) 'S'; //S Ser Serine
00529             maptable_to_char[16]=(uint8_t) 'T'; //T Thr Threonine
00530             maptable_to_char[17]=(uint8_t) 'W'; //W Trp Tryptophan
00531             maptable_to_char[18]=(uint8_t) 'Y'; //Y Tyr Tyrosine
00532             maptable_to_char[19]=(uint8_t) 'V'; //V Val Valine
00533             maptable_to_char[20]=(uint8_t) 'B'; //B Asx Aspartic acid or Asparagine
00534             maptable_to_char[21]=(uint8_t) 'Z'; //Z Glx Glutamine or Glutamic acid
00535             maptable_to_char[22]=(uint8_t) 'X'; //X Xaa Any amino acid
00536             break;
00537 
00538         default:
00539             break; //leave uninitialised
00540     };
00541 }
00542 
00543 void CAlphabet::clear_histogram()
00544 {
00545     memset(histogram, 0, sizeof(histogram));
00546     print_histogram();
00547 }
00548 
00549 int32_t CAlphabet::get_max_value_in_histogram()
00550 {
00551     int32_t max_sym=-1;
00552     for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--)
00553     {
00554         if (histogram[i])
00555         {
00556             max_sym=i;
00557             break;
00558         }
00559     }
00560 
00561     return max_sym;
00562 }
00563 
00564 int32_t CAlphabet::get_num_symbols_in_histogram()
00565 {
00566     int32_t num_sym=0;
00567     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00568     {
00569         if (histogram[i])
00570             num_sym++;
00571     }
00572 
00573     return num_sym;
00574 }
00575 
00576 int32_t CAlphabet::get_num_bits_in_histogram()
00577 {
00578     int32_t num_sym=get_num_symbols_in_histogram();
00579     if (num_sym>0)
00580         return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2));
00581     else
00582         return 0;
00583 }
00584 
00585 void CAlphabet::print_histogram()
00586 {
00587     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00588     {
00589         if (histogram[i])
00590             SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]);
00591     }
00592 }
00593 
00594 bool CAlphabet::check_alphabet(bool print_error)
00595 {
00596     bool result = true;
00597 
00598     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00599     {
00600         if (histogram[i]>0 && valid_chars[i]==0)
00601         {
00602             result=false;
00603             break;
00604         }
00605     }
00606 
00607     if (!result && print_error)
00608     {
00609         print_histogram();
00610         SG_ERROR( "ALPHABET does not contain all symbols in histogram\n");
00611     }
00612 
00613     return result;
00614 }
00615 
00616 bool CAlphabet::check_alphabet_size(bool print_error)
00617 {
00618     if (get_num_bits_in_histogram() > get_num_bits())
00619     {
00620         if (print_error)
00621         {
00622             print_histogram();
00623             fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ;
00624          SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n");
00625         }
00626         return false;
00627     }
00628     else
00629         return true;
00630 
00631 }
00632 
00633 void CAlphabet::copy_histogram(CAlphabet* a)
00634 {
00635     memcpy(histogram, a->get_histogram(), sizeof(histogram));
00636 }
00637 
00638 const char* CAlphabet::get_alphabet_name(EAlphabet alphabet)
00639 {
00640 
00641     int32_t idx;
00642     switch (alphabet)
00643     {
00644         case DNA:
00645             idx=0;
00646             break;
00647         case RAWDNA:
00648             idx=1;
00649             break;
00650         case RNA:
00651             idx=2;
00652             break;
00653         case PROTEIN:
00654             idx=3;
00655             break;
00656         case BINARY:
00657             idx=4;
00658             break;
00659         case ALPHANUM:
00660             idx=5;
00661             break;
00662         case CUBE:
00663             idx=6;
00664             break;
00665         case RAWBYTE:
00666             idx=7;
00667             break;
00668         case IUPAC_NUCLEIC_ACID:
00669             idx=8;
00670             break;
00671         case IUPAC_AMINO_ACID:
00672             idx=9;
00673             break;
00674         case NONE:
00675             idx=10;
00676             break;
00677         case DIGIT:
00678             idx=11;
00679             break;
00680         case DIGIT2:
00681             idx=12;
00682             break;
00683         default:
00684             idx=13;
00685             break;
00686     }
00687     return alphabet_names[idx];
00688 }
00689 
00690 void CAlphabet::init()
00691 {
00692     alphabet = NONE;
00693     num_symbols = 0;
00694     num_bits = 0;
00695 
00696     memset(valid_chars, 0, sizeof (valid_chars));
00697     memset(maptable_to_bin, 0, sizeof (maptable_to_bin));
00698     memset(maptable_to_char, 0, sizeof (maptable_to_char));
00699     memset(histogram, 0, sizeof (histogram));
00700 
00701 
00702     m_parameters->add((machine_int_t*) &alphabet, "alphabet",
00703                       "Alphabet enum.");
00704     m_parameters->add(&num_symbols, "num_symbols",
00705                       "Number of symbols.");
00706     m_parameters->add(&num_bits, "num_bits",
00707                       "Number of bits.");
00708 
00709     /* We don't need to serialize the mapping tables / they can be computed
00710      * after de-serializing. Lets not serialize the histogram for now. Doesn't
00711      * really make sense.  */
00712 
00713     /* m_parameters->add_histogram(&histogram, sizeof(histogram),
00714             "histogram",
00715             "Histogram."); */
00716 }
00717 
00718 void CAlphabet::load_serializable_post(void) throw (ShogunException)
00719 {
00720     CSGObject::load_serializable_post();
00721 
00722     init_map_table();
00723 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation