Alphabet.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Soeren Sonnenburg
00008  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <string.h>
00012 #include <math.h>
00013 
00014 #include <shogun/features/Alphabet.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/base/Parameter.h>
00017 
00018 using namespace shogun;
00019 
00020 //define numbers for the bases
00021 const uint8_t CAlphabet::B_A=0;
00022 const uint8_t CAlphabet::B_C=1;
00023 const uint8_t CAlphabet::B_G=2;
00024 const uint8_t CAlphabet::B_T=3;
00025 const uint8_t CAlphabet::B_0=4;
00026 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff;
00027 const char* CAlphabet::alphabet_names[18]={
00028     "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM",
00029     "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID",
00030     "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN",
00031     "SNP", "RAWSNP"};
00032 
00033 
00034 CAlphabet::CAlphabet()
00035   : CSGObject()
00036 {
00037     init();
00038 }
00039 
00040 CAlphabet::CAlphabet(char* al, int32_t len)
00041 : CSGObject()
00042 {
00043     init();
00044 
00045     EAlphabet alpha=NONE;
00046 
00047     if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA")))
00048         alpha = DNA;
00049     else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA")))
00050         alpha = RAWDNA;
00051     else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA")))
00052         alpha = RNA;
00053     else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN")))
00054         alpha = PROTEIN;
00055     else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY")))
00056         alpha = BINARY;
00057     else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM")))
00058         alpha = ALPHANUM;
00059     else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE")))
00060         alpha = CUBE;
00061     else if (len>=(int32_t) strlen("DIGIT2") && !strncmp(al, "DIGIT2", strlen("DIGIT2")))
00062         alpha = DIGIT2;
00063     else if (len>=(int32_t) strlen("DIGIT") && !strncmp(al, "DIGIT", strlen("DIGIT")))
00064         alpha = DIGIT;
00065     else if (len>=(int32_t) strlen("RAWDIGIT2") && !strncmp(al, "RAWDIGIT2", strlen("RAWDIGIT2")))
00066         alpha = RAWDIGIT2;
00067     else if (len>=(int32_t) strlen("RAWDIGIT") && !strncmp(al, "RAWDIGIT", strlen("RAWDIGIT")))
00068         alpha = RAWDIGIT;
00069     else if (len>=(int32_t) strlen("SNP") && !strncmp(al, "SNP", strlen("SNP")))
00070         alpha = SNP;
00071     else if (len>=(int32_t) strlen("RAWSNP") && !strncmp(al, "RAWSNP", strlen("RAWSNP")))
00072         alpha = RAWSNP;
00073     else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) ||
00074             (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW"))))
00075         alpha = RAWBYTE;
00076     else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID")))
00077         alpha = IUPAC_NUCLEIC_ACID;
00078     else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID")))
00079         alpha = IUPAC_AMINO_ACID;
00080     else {
00081       SG_ERROR( "unknown alphabet %s\n", al);
00082    }
00083 
00084     set_alphabet(alpha);
00085 }
00086 
00087 CAlphabet::CAlphabet(EAlphabet alpha)
00088 : CSGObject()
00089 {
00090     init();
00091     set_alphabet(alpha);
00092 }
00093 
00094 CAlphabet::CAlphabet(CAlphabet* a)
00095 : CSGObject()
00096 {
00097     init();
00098     ASSERT(a);
00099     set_alphabet(a->get_alphabet());
00100     copy_histogram(a);
00101 }
00102 
00103 CAlphabet::~CAlphabet()
00104 {
00105 }
00106 
00107 bool CAlphabet::set_alphabet(EAlphabet alpha)
00108 {
00109     bool result=true;
00110     alphabet=alpha;
00111 
00112     switch (alphabet)
00113     {
00114         case DNA:
00115         case RAWDNA:
00116             num_symbols = 4;
00117             break;
00118         case RNA:
00119             num_symbols = 4;
00120             break;
00121         case PROTEIN:
00122             num_symbols = 26;
00123             break;
00124         case BINARY:
00125             num_symbols = 2;
00126             break;
00127         case ALPHANUM:
00128             num_symbols = 36;
00129             break;
00130         case CUBE:
00131             num_symbols = 6;
00132             break;
00133         case RAWBYTE:
00134             num_symbols = 256;
00135             break;
00136         case IUPAC_NUCLEIC_ACID:
00137             num_symbols = 16;
00138             break;
00139         case IUPAC_AMINO_ACID:
00140             num_symbols = 23;
00141             break;
00142         case NONE:
00143             num_symbols = 0;
00144             break;
00145         case DIGIT2:
00146             num_symbols = 3;
00147             break;
00148         case DIGIT:
00149             num_symbols = 10;
00150             break;
00151         case RAWDIGIT2:
00152             num_symbols = 3;
00153             break;
00154         case RAWDIGIT:
00155             num_symbols = 10;
00156             break;
00157         case SNP:
00158             num_symbols = 5;
00159             break;
00160         case RAWSNP:
00161             num_symbols = 5;
00162             break;
00163         default:
00164             num_symbols = 0;
00165             result=false;
00166             break;
00167     }
00168 
00169     num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2));
00170     init_map_table();
00171     clear_histogram();
00172 
00173     SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet));
00174 
00175     return result;
00176 }
00177 
00178 void CAlphabet::init_map_table()
00179 {
00180     for (int32_t i=0; i<(1<<(8*sizeof(uint8_t))); i++)
00181     {
00182         maptable_to_bin[i] = MAPTABLE_UNDEF;
00183         maptable_to_char[i] = MAPTABLE_UNDEF;
00184         valid_chars[i] = false;
00185     }
00186 
00187     switch (alphabet)
00188     {
00189         case RAWDIGIT:
00190             for (uint8_t i=0; i<=9; i++)
00191             {
00192                 valid_chars[i]=true;
00193                 maptable_to_bin[i]=i;
00194                 maptable_to_char[i]=i;
00195             }
00196             break;
00197 
00198         case RAWDIGIT2:
00199             for (uint8_t i=0; i<=2; i++)
00200             {
00201                 valid_chars[i]=true;
00202                 maptable_to_bin[i]=i;
00203                 maptable_to_char[i]=i;
00204             }
00205             break;
00206 
00207         case DIGIT:
00208             valid_chars[(uint8_t) '0']=true;
00209             valid_chars[(uint8_t) '1']=true;
00210             valid_chars[(uint8_t) '2']=true;
00211             valid_chars[(uint8_t) '3']=true;
00212             valid_chars[(uint8_t) '4']=true;
00213             valid_chars[(uint8_t) '5']=true;
00214             valid_chars[(uint8_t) '6']=true;
00215             valid_chars[(uint8_t) '7']=true;
00216             valid_chars[(uint8_t) '8']=true;
00217             valid_chars[(uint8_t) '9']=true;    //Translation '0-9' -> 0-9
00218 
00219             maptable_to_bin[(uint8_t) '0']=0;
00220             maptable_to_bin[(uint8_t) '1']=1;
00221             maptable_to_bin[(uint8_t) '2']=2;
00222             maptable_to_bin[(uint8_t) '3']=3;
00223             maptable_to_bin[(uint8_t) '4']=4;
00224             maptable_to_bin[(uint8_t) '5']=5;
00225             maptable_to_bin[(uint8_t) '6']=6;
00226             maptable_to_bin[(uint8_t) '7']=7;
00227             maptable_to_bin[(uint8_t) '8']=8;
00228             maptable_to_bin[(uint8_t) '9']=9;   //Translation '0-9' -> 0-9
00229 
00230             maptable_to_char[(uint8_t) 0]='0';
00231             maptable_to_char[(uint8_t) 1]='1';
00232             maptable_to_char[(uint8_t) 2]='2';
00233             maptable_to_char[(uint8_t) 3]='3';
00234             maptable_to_char[(uint8_t) 4]='4';
00235             maptable_to_char[(uint8_t) 5]='5';
00236             maptable_to_char[(uint8_t) 6]='6';
00237             maptable_to_char[(uint8_t) 7]='7';
00238             maptable_to_char[(uint8_t) 8]='8';
00239             maptable_to_char[(uint8_t) 9]='9';  //Translation 0-9 -> '0-9'
00240             break;
00241 
00242         case DIGIT2:
00243             valid_chars[(uint8_t) '0']=true;
00244             valid_chars[(uint8_t) '1']=true;
00245             valid_chars[(uint8_t) '2']=true; //Translation '0-2' -> 0-2
00246 
00247             maptable_to_bin[(uint8_t) '0']=0;
00248             maptable_to_bin[(uint8_t) '1']=1;
00249             maptable_to_bin[(uint8_t) '2']=2;   //Translation '0-2' -> 0-2
00250 
00251             maptable_to_char[(uint8_t) 0]='0';
00252             maptable_to_char[(uint8_t) 1]='1';
00253             maptable_to_char[(uint8_t) 2]='2'; //Translation 0-2 -> '0-2'
00254             break;
00255 
00256         case CUBE:
00257             valid_chars[(uint8_t) '1']=true;
00258             valid_chars[(uint8_t) '2']=true;
00259             valid_chars[(uint8_t) '3']=true;
00260             valid_chars[(uint8_t) '4']=true;
00261             valid_chars[(uint8_t) '5']=true;
00262             valid_chars[(uint8_t) '6']=true;    //Translation '123456' -> 012345
00263 
00264             maptable_to_bin[(uint8_t) '1']=0;
00265             maptable_to_bin[(uint8_t) '2']=1;
00266             maptable_to_bin[(uint8_t) '3']=2;
00267             maptable_to_bin[(uint8_t) '4']=3;
00268             maptable_to_bin[(uint8_t) '5']=4;
00269             maptable_to_bin[(uint8_t) '6']=5;   //Translation '123456' -> 012345
00270 
00271             maptable_to_char[(uint8_t) 0]='1';
00272             maptable_to_char[(uint8_t) 1]='2';
00273             maptable_to_char[(uint8_t) 2]='3';
00274             maptable_to_char[(uint8_t) 3]='4';
00275             maptable_to_char[(uint8_t) 4]='5';
00276             maptable_to_char[(uint8_t) 5]='6';  //Translation 012345->'123456'
00277             break;
00278 
00279         case PROTEIN:
00280             {
00281                 int32_t skip=0 ;
00282                 for (int32_t i=0; i<21; i++)
00283                 {
00284                     if (i==1) skip++ ;
00285                     if (i==8) skip++ ;
00286                     if (i==12) skip++ ;
00287                     if (i==17) skip++ ;
00288                     valid_chars['A'+i+skip]=true;
00289                     maptable_to_bin['A'+i+skip]=i ;
00290                     maptable_to_char[i]='A'+i+skip ;
00291                 } ;                   //Translation 012345->acde...xy -- the protein code
00292             } ;
00293             break;
00294 
00295         case BINARY:
00296             valid_chars[(uint8_t) '0']=true;
00297             valid_chars[(uint8_t) '1']=true;
00298 
00299             maptable_to_bin[(uint8_t) '0']=0;
00300             maptable_to_bin[(uint8_t) '1']=1;
00301 
00302             maptable_to_char[0]=(uint8_t) '0';
00303             maptable_to_char[1]=(uint8_t) '1';
00304             break;
00305 
00306         case ALPHANUM:
00307             {
00308                 for (int32_t i=0; i<26; i++)
00309                 {
00310                     valid_chars[(uint8_t) 'A'+i]=true;
00311                     maptable_to_bin[(uint8_t) 'A'+i]=i ;
00312                     maptable_to_char[i]='A'+i ;
00313                 } ;
00314                 for (int32_t i=0; i<10; i++)
00315                 {
00316                     valid_chars[(uint8_t) '0'+i]=true;
00317                     maptable_to_bin[(uint8_t) '0'+i]=26+i ;
00318                     maptable_to_char[26+i]='0'+i ;
00319                 } ;        //Translation 012345->acde...xy0123456789
00320             } ;
00321             break;
00322 
00323         case RAWBYTE:
00324             {
00325                 //identity
00326                 for (int32_t i=0; i<256; i++)
00327                 {
00328                     valid_chars[i]=true;
00329                     maptable_to_bin[i]=i;
00330                     maptable_to_char[i]=i;
00331                 }
00332             }
00333             break;
00334 
00335         case DNA:
00336             valid_chars[(uint8_t) 'A']=true;
00337             valid_chars[(uint8_t) 'C']=true;
00338             valid_chars[(uint8_t) 'G']=true;
00339             valid_chars[(uint8_t) 'T']=true;
00340 
00341             maptable_to_bin[(uint8_t) 'A']=B_A;
00342             maptable_to_bin[(uint8_t) 'C']=B_C;
00343             maptable_to_bin[(uint8_t) 'G']=B_G;
00344             maptable_to_bin[(uint8_t) 'T']=B_T;
00345 
00346             maptable_to_char[B_A]='A';
00347             maptable_to_char[B_C]='C';
00348             maptable_to_char[B_G]='G';
00349             maptable_to_char[B_T]='T';
00350             break;
00351         case RAWDNA:
00352             {
00353                 //identity
00354                 for (int32_t i=0; i<4; i++)
00355                 {
00356                     valid_chars[i]=true;
00357                     maptable_to_bin[i]=i;
00358                     maptable_to_char[i]=i;
00359                 }
00360             }
00361             break;
00362 
00363         case SNP:
00364             valid_chars[(uint8_t) 'A']=true;
00365             valid_chars[(uint8_t) 'C']=true;
00366             valid_chars[(uint8_t) 'G']=true;
00367             valid_chars[(uint8_t) 'T']=true;
00368             valid_chars[(uint8_t) '0']=true;
00369 
00370             maptable_to_bin[(uint8_t) 'A']=B_A;
00371             maptable_to_bin[(uint8_t) 'C']=B_C;
00372             maptable_to_bin[(uint8_t) 'G']=B_G;
00373             maptable_to_bin[(uint8_t) 'T']=B_T;
00374             maptable_to_bin[(uint8_t) '0']=B_0;
00375 
00376             maptable_to_char[B_A]='A';
00377             maptable_to_char[B_C]='C';
00378             maptable_to_char[B_G]='G';
00379             maptable_to_char[B_T]='T';
00380             maptable_to_char[B_0]='0';
00381             break;
00382         case RAWSNP:
00383             {
00384                 //identity
00385                 for (int32_t i=0; i<5; i++)
00386                 {
00387                     valid_chars[i]=true;
00388                     maptable_to_bin[i]=i;
00389                     maptable_to_char[i]=i;
00390                 }
00391             }
00392             break;
00393 
00394         case RNA:
00395             valid_chars[(uint8_t) 'A']=true;
00396             valid_chars[(uint8_t) 'C']=true;
00397             valid_chars[(uint8_t) 'G']=true;
00398             valid_chars[(uint8_t) 'U']=true;
00399 
00400             maptable_to_bin[(uint8_t) 'A']=B_A;
00401             maptable_to_bin[(uint8_t) 'C']=B_C;
00402             maptable_to_bin[(uint8_t) 'G']=B_G;
00403             maptable_to_bin[(uint8_t) 'U']=B_T;
00404 
00405             maptable_to_char[B_A]='A';
00406             maptable_to_char[B_C]='C';
00407             maptable_to_char[B_G]='G';
00408             maptable_to_char[B_T]='U';
00409             break;
00410 
00411         case IUPAC_NUCLEIC_ACID:
00412             valid_chars[(uint8_t) 'A']=true; // A   Adenine
00413             valid_chars[(uint8_t) 'C']=true; // C   Cytosine
00414             valid_chars[(uint8_t) 'G']=true; // G   Guanine
00415             valid_chars[(uint8_t) 'T']=true; // T   Thymine
00416             valid_chars[(uint8_t) 'U']=true; // U   Uracil
00417             valid_chars[(uint8_t) 'R']=true; // R   Purine (A or G)
00418             valid_chars[(uint8_t) 'Y']=true; // Y   Pyrimidine (C, T, or U)
00419             valid_chars[(uint8_t) 'M']=true; // M   C or A
00420             valid_chars[(uint8_t) 'K']=true; // K   T, U, or G
00421             valid_chars[(uint8_t) 'W']=true; // W   T, U, or A
00422             valid_chars[(uint8_t) 'S']=true; // S   C or G
00423             valid_chars[(uint8_t) 'B']=true; // B   C, T, U, or G (not A)
00424             valid_chars[(uint8_t) 'D']=true; // D   A, T, U, or G (not C)
00425             valid_chars[(uint8_t) 'H']=true; // H   A, T, U, or C (not G)
00426             valid_chars[(uint8_t) 'V']=true; // V   A, C, or G (not T, not U)
00427             valid_chars[(uint8_t) 'N']=true; // N   Any base (A, C, G, T, or U)
00428 
00429             maptable_to_bin[(uint8_t) 'A']=0; // A  Adenine
00430             maptable_to_bin[(uint8_t) 'C']=1; // C  Cytosine
00431             maptable_to_bin[(uint8_t) 'G']=2; // G  Guanine
00432             maptable_to_bin[(uint8_t) 'T']=3; // T  Thymine
00433             maptable_to_bin[(uint8_t) 'U']=4; // U  Uracil
00434             maptable_to_bin[(uint8_t) 'R']=5; // R  Purine (A or G)
00435             maptable_to_bin[(uint8_t) 'Y']=6; // Y  Pyrimidine (C, T, or U)
00436             maptable_to_bin[(uint8_t) 'M']=7; // M  C or A
00437             maptable_to_bin[(uint8_t) 'K']=8; // K  T, U, or G
00438             maptable_to_bin[(uint8_t) 'W']=9; // W  T, U, or A
00439             maptable_to_bin[(uint8_t) 'S']=10; // S C or G
00440             maptable_to_bin[(uint8_t) 'B']=11; // B C, T, U, or G (not A)
00441             maptable_to_bin[(uint8_t) 'D']=12; // D A, T, U, or G (not C)
00442             maptable_to_bin[(uint8_t) 'H']=13; // H A, T, U, or C (not G)
00443             maptable_to_bin[(uint8_t) 'V']=14; // V A, C, or G (not T, not U)
00444             maptable_to_bin[(uint8_t) 'N']=15; // N Any base (A, C, G, T, or U)
00445 
00446             maptable_to_char[0]=(uint8_t) 'A'; // A Adenine
00447             maptable_to_char[1]=(uint8_t) 'C'; // C Cytosine
00448             maptable_to_char[2]=(uint8_t) 'G'; // G Guanine
00449             maptable_to_char[3]=(uint8_t) 'T'; // T Thymine
00450             maptable_to_char[4]=(uint8_t) 'U'; // U Uracil
00451             maptable_to_char[5]=(uint8_t) 'R'; // R Purine (A or G)
00452             maptable_to_char[6]=(uint8_t) 'Y'; // Y Pyrimidine (C, T, or U)
00453             maptable_to_char[7]=(uint8_t) 'M'; // M C or A
00454             maptable_to_char[8]=(uint8_t) 'K'; // K T, U, or G
00455             maptable_to_char[9]=(uint8_t) 'W'; // W T, U, or A
00456             maptable_to_char[10]=(uint8_t) 'S'; // S    C or G
00457             maptable_to_char[11]=(uint8_t) 'B'; // B    C, T, U, or G (not A)
00458             maptable_to_char[12]=(uint8_t) 'D'; // D    A, T, U, or G (not C)
00459             maptable_to_char[13]=(uint8_t) 'H'; // H    A, T, U, or C (not G)
00460             maptable_to_char[14]=(uint8_t) 'V'; // V    A, C, or G (not T, not U)
00461             maptable_to_char[15]=(uint8_t) 'N'; // N    Any base (A, C, G, T, or U)
00462             break;
00463 
00464         case IUPAC_AMINO_ACID:
00465             valid_chars[(uint8_t) 'A']=true; //A    Ala Alanine
00466             valid_chars[(uint8_t) 'R']=true; //R    Arg Arginine
00467             valid_chars[(uint8_t) 'N']=true; //N    Asn Asparagine
00468             valid_chars[(uint8_t) 'D']=true; //D    Asp Aspartic acid
00469             valid_chars[(uint8_t) 'C']=true; //C    Cys Cysteine
00470             valid_chars[(uint8_t) 'Q']=true; //Q    Gln Glutamine
00471             valid_chars[(uint8_t) 'E']=true; //E    Glu Glutamic acid
00472             valid_chars[(uint8_t) 'G']=true; //G    Gly Glycine
00473             valid_chars[(uint8_t) 'H']=true; //H    His Histidine
00474             valid_chars[(uint8_t) 'I']=true; //I    Ile Isoleucine
00475             valid_chars[(uint8_t) 'L']=true; //L    Leu Leucine
00476             valid_chars[(uint8_t) 'K']=true; //K    Lys Lysine
00477             valid_chars[(uint8_t) 'M']=true; //M    Met Methionine
00478             valid_chars[(uint8_t) 'F']=true; //F    Phe Phenylalanine
00479             valid_chars[(uint8_t) 'P']=true; //P    Pro Proline
00480             valid_chars[(uint8_t) 'S']=true; //S    Ser Serine
00481             valid_chars[(uint8_t) 'T']=true; //T    Thr Threonine
00482             valid_chars[(uint8_t) 'W']=true; //W    Trp Tryptophan
00483             valid_chars[(uint8_t) 'Y']=true; //Y    Tyr Tyrosine
00484             valid_chars[(uint8_t) 'V']=true; //V    Val Valine
00485             valid_chars[(uint8_t) 'B']=true; //B    Asx Aspartic acid or Asparagine
00486             valid_chars[(uint8_t) 'Z']=true; //Z    Glx Glutamine or Glutamic acid
00487             valid_chars[(uint8_t) 'X']=true; //X    Xaa Any amino acid
00488 
00489             maptable_to_bin[(uint8_t) 'A']=0;  //A  Ala Alanine
00490             maptable_to_bin[(uint8_t) 'R']=1;  //R  Arg Arginine
00491             maptable_to_bin[(uint8_t) 'N']=2;  //N  Asn Asparagine
00492             maptable_to_bin[(uint8_t) 'D']=3;  //D  Asp Aspartic acid
00493             maptable_to_bin[(uint8_t) 'C']=4;  //C  Cys Cysteine
00494             maptable_to_bin[(uint8_t) 'Q']=5;  //Q  Gln Glutamine
00495             maptable_to_bin[(uint8_t) 'E']=6;  //E  Glu Glutamic acid
00496             maptable_to_bin[(uint8_t) 'G']=7;  //G  Gly Glycine
00497             maptable_to_bin[(uint8_t) 'H']=8;  //H  His Histidine
00498             maptable_to_bin[(uint8_t) 'I']=9;  //I  Ile Isoleucine
00499             maptable_to_bin[(uint8_t) 'L']=10; //L  Leu Leucine
00500             maptable_to_bin[(uint8_t) 'K']=11; //K  Lys Lysine
00501             maptable_to_bin[(uint8_t) 'M']=12; //M  Met Methionine
00502             maptable_to_bin[(uint8_t) 'F']=13; //F  Phe Phenylalanine
00503             maptable_to_bin[(uint8_t) 'P']=14; //P  Pro Proline
00504             maptable_to_bin[(uint8_t) 'S']=15; //S  Ser Serine
00505             maptable_to_bin[(uint8_t) 'T']=16; //T  Thr Threonine
00506             maptable_to_bin[(uint8_t) 'W']=17; //W  Trp Tryptophan
00507             maptable_to_bin[(uint8_t) 'Y']=18; //Y  Tyr Tyrosine
00508             maptable_to_bin[(uint8_t) 'V']=19; //V  Val Valine
00509             maptable_to_bin[(uint8_t) 'B']=20; //B  Asx Aspartic acid or Asparagine
00510             maptable_to_bin[(uint8_t) 'Z']=21; //Z  Glx Glutamine or Glutamic acid
00511             maptable_to_bin[(uint8_t) 'X']=22; //X  Xaa Any amino acid
00512 
00513             maptable_to_char[0]=(uint8_t) 'A';  //A Ala Alanine
00514             maptable_to_char[1]=(uint8_t) 'R';  //R Arg Arginine
00515             maptable_to_char[2]=(uint8_t) 'N';  //N Asn Asparagine
00516             maptable_to_char[3]=(uint8_t) 'D';  //D Asp Aspartic acid
00517             maptable_to_char[4]=(uint8_t) 'C';  //C Cys Cysteine
00518             maptable_to_char[5]=(uint8_t) 'Q';  //Q Gln Glutamine
00519             maptable_to_char[6]=(uint8_t) 'E';  //E Glu Glutamic acid
00520             maptable_to_char[7]=(uint8_t) 'G';  //G Gly Glycine
00521             maptable_to_char[8]=(uint8_t) 'H';  //H His Histidine
00522             maptable_to_char[9]=(uint8_t) 'I';  //I Ile Isoleucine
00523             maptable_to_char[10]=(uint8_t) 'L'; //L Leu Leucine
00524             maptable_to_char[11]=(uint8_t) 'K'; //K Lys Lysine
00525             maptable_to_char[12]=(uint8_t) 'M'; //M Met Methionine
00526             maptable_to_char[13]=(uint8_t) 'F'; //F Phe Phenylalanine
00527             maptable_to_char[14]=(uint8_t) 'P'; //P Pro Proline
00528             maptable_to_char[15]=(uint8_t) 'S'; //S Ser Serine
00529             maptable_to_char[16]=(uint8_t) 'T'; //T Thr Threonine
00530             maptable_to_char[17]=(uint8_t) 'W'; //W Trp Tryptophan
00531             maptable_to_char[18]=(uint8_t) 'Y'; //Y Tyr Tyrosine
00532             maptable_to_char[19]=(uint8_t) 'V'; //V Val Valine
00533             maptable_to_char[20]=(uint8_t) 'B'; //B Asx Aspartic acid or Asparagine
00534             maptable_to_char[21]=(uint8_t) 'Z'; //Z Glx Glutamine or Glutamic acid
00535             maptable_to_char[22]=(uint8_t) 'X'; //X Xaa Any amino acid
00536             break;
00537 
00538         default:
00539             break; //leave uninitialised
00540     };
00541 }
00542 
00543 void CAlphabet::clear_histogram()
00544 {
00545     memset(histogram, 0, sizeof(histogram));
00546     print_histogram();
00547 }
00548 
00549 int32_t CAlphabet::get_max_value_in_histogram()
00550 {
00551     int32_t max_sym=-1;
00552     for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--)
00553     {
00554         if (histogram[i])
00555         {
00556             max_sym=i;
00557             break;
00558         }
00559     }
00560 
00561     return max_sym;
00562 }
00563 
00564 int32_t CAlphabet::get_num_symbols_in_histogram()
00565 {
00566     int32_t num_sym=0;
00567     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00568     {
00569         if (histogram[i])
00570             num_sym++;
00571     }
00572 
00573     return num_sym;
00574 }
00575 
00576 int32_t CAlphabet::get_num_bits_in_histogram()
00577 {
00578     int32_t num_sym=get_num_symbols_in_histogram();
00579     if (num_sym>0)
00580         return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2));
00581     else
00582         return 0;
00583 }
00584 
00585 
00586 void CAlphabet::print_histogram()
00587 {
00588     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00589     {
00590         if (histogram[i])
00591             SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]);
00592     }
00593 }
00594 
00595 SGVector<int64_t> CAlphabet::get_histogram()
00596 {
00597     return SGVector<int64_t>(&histogram[0], 1 << (sizeof(uint8_t)*8), false);
00598 }
00599 
00600 bool CAlphabet::check_alphabet(bool print_error)
00601 {
00602     bool result = true;
00603 
00604     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00605     {
00606         if (histogram[i]>0 && valid_chars[i]==0)
00607         {
00608             result=false;
00609             break;
00610         }
00611     }
00612 
00613     if (!result && print_error)
00614     {
00615         print_histogram();
00616         SG_ERROR( "ALPHABET does not contain all symbols in histogram\n");
00617     }
00618 
00619     return result;
00620 }
00621 
00622 bool CAlphabet::check_alphabet_size(bool print_error)
00623 {
00624     if (get_num_bits_in_histogram() > get_num_bits())
00625     {
00626         if (print_error)
00627         {
00628             print_histogram();
00629             fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ;
00630          SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n");
00631         }
00632         return false;
00633     }
00634     else
00635         return true;
00636 
00637 }
00638 
00639 void CAlphabet::copy_histogram(CAlphabet* a)
00640 {
00641     SGVector<int64_t> h=a->get_histogram();
00642 
00643     if (h.vlen != sizeof(histogram)/sizeof(histogram[0]))
00644     {
00645         SG_ERROR("Histogram has %d elements, but %d elements where expected\n",
00646                 h.vlen, sizeof(histogram)/sizeof(histogram[0]));
00647     }
00648 
00649     memcpy(histogram, h.vector, sizeof(histogram));
00650 }
00651 
00652 const char* CAlphabet::get_alphabet_name(EAlphabet alphabet)
00653 {
00654 
00655     int32_t idx;
00656     switch (alphabet)
00657     {
00658         case DNA:
00659             idx=0;
00660             break;
00661         case RAWDNA:
00662             idx=1;
00663             break;
00664         case RNA:
00665             idx=2;
00666             break;
00667         case PROTEIN:
00668             idx=3;
00669             break;
00670         case BINARY:
00671             idx=4;
00672             break;
00673         case ALPHANUM:
00674             idx=5;
00675             break;
00676         case CUBE:
00677             idx=6;
00678             break;
00679         case RAWBYTE:
00680             idx=7;
00681             break;
00682         case IUPAC_NUCLEIC_ACID:
00683             idx=8;
00684             break;
00685         case IUPAC_AMINO_ACID:
00686             idx=9;
00687             break;
00688         case NONE:
00689             idx=10;
00690             break;
00691         case DIGIT:
00692             idx=11;
00693             break;
00694         case DIGIT2:
00695             idx=12;
00696             break;
00697         default:
00698             idx=13;
00699             break;
00700     }
00701     return alphabet_names[idx];
00702 }
00703 
00704 void CAlphabet::init()
00705 {
00706     alphabet = NONE;
00707     num_symbols = 0;
00708     num_bits = 0;
00709 
00710     memset(valid_chars, 0, sizeof (valid_chars));
00711     memset(maptable_to_bin, 0, sizeof (maptable_to_bin));
00712     memset(maptable_to_char, 0, sizeof (maptable_to_char));
00713     memset(histogram, 0, sizeof (histogram));
00714 
00715 
00716     m_parameters->add((machine_int_t*) &alphabet, "alphabet",
00717                       "Alphabet enum.");
00718     m_parameters->add(&num_symbols, "num_symbols",
00719                       "Number of symbols.");
00720     m_parameters->add(&num_bits, "num_bits",
00721                       "Number of bits.");
00722 
00723     /* We don't need to serialize the mapping tables / they can be computed
00724      * after de-serializing. Lets not serialize the histogram for now. Doesn't
00725      * really make sense.  */
00726 
00727     /* m_parameters->add_histogram(&histogram, sizeof(histogram),
00728             "histogram",
00729             "Histogram."); */
00730 }
00731 
00732 void CAlphabet::load_serializable_post() throw (ShogunException)
00733 {
00734     CSGObject::load_serializable_post();
00735 
00736     init_map_table();
00737 }
00738 
00739 
00740 namespace shogun
00741 {
00742 template <class ST>
00743 void CAlphabet::translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00744 {
00745     int32_t i,j;
00746     ST value=0;
00747 
00748     for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
00749     {
00750         value=0;
00751         for (j=i; j>=i-p_order+1; j--)
00752             value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00753 
00754         obs[i]= (ST) value;
00755     }
00756 
00757     for (i=p_order-2;i>=0;i--)
00758     {
00759         if (i>=sequence_length)
00760             continue;
00761 
00762         value=0;
00763         for (j=i; j>=i-p_order+1; j--)
00764         {
00765             value= (value >> max_val);
00766             if (j>=0 && j<sequence_length)
00767                 value|=obs[j] << (max_val * (p_order-1));
00768         }
00769         obs[i]=value;
00770     }
00771 
00772     // TODO we should get rid of this loop!
00773     if (start>0)
00774     {
00775         for (i=start; i<sequence_length; i++)
00776             obs[i-start]=obs[i];
00777     }
00778 }
00779 
00780 template <class ST>
00781 void CAlphabet::translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00782 {
00783     int32_t i,j;
00784     ST value=0;
00785 
00786     for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
00787     {
00788         value=0;
00789         for (j=i; j>=i-p_order+1; j--)
00790             value= (value << max_val) | obs[j];
00791 
00792         obs[i]= (ST) value;
00793     }
00794 
00795     for (i=p_order-2;i>=0;i--)
00796     {
00797         if (i>=sequence_length)
00798             continue;
00799 
00800         value=0;
00801         for (j=i; j>=i-p_order+1; j--)
00802         {
00803             value= (value << max_val);
00804             if (j>=0 && j<sequence_length)
00805                 value|=obs[j];
00806         }
00807         obs[i]=value;
00808     }
00809 
00810     // TODO we should get rid of this loop!
00811     if (start>0)
00812     {
00813         for (i=start; i<sequence_length; i++)
00814             obs[i-start]=obs[i];
00815     }
00816 }
00817 
00818 template <class ST>
00819 void CAlphabet::translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00820 {
00821     ASSERT(gap>=0);
00822 
00823     const int32_t start_gap=(p_order-gap)/2;
00824     const int32_t end_gap=start_gap+gap;
00825 
00826     int32_t i,j;
00827     ST value=0;
00828 
00829     // almost all positions
00830     for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
00831     {
00832         value=0;
00833         for (j=i; j>=i-p_order+1; j--)
00834         {
00835             if (i-j<start_gap)
00836             {
00837                 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00838             }
00839             else if (i-j>=end_gap)
00840             {
00841                 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00842             }
00843         }
00844         obs[i]= (ST) value;
00845     }
00846 
00847     // the remaining `order` positions
00848     for (i=p_order-2;i>=0;i--)
00849     {
00850         if (i>=sequence_length)
00851             continue;
00852 
00853         value=0;
00854         for (j=i; j>=i-p_order+1; j--)
00855         {
00856             if (i-j<start_gap)
00857             {
00858                 value= (value >> max_val);
00859                 if (j>=0 && j<sequence_length)
00860                     value|=obs[j] << (max_val * (p_order-1-gap));
00861             }
00862             else if (i-j>=end_gap)
00863             {
00864                 value= (value >> max_val);
00865                 if (j>=0 && j<sequence_length)
00866                     value|=obs[j] << (max_val * (p_order-1-gap));
00867             }
00868         }
00869         obs[i]=value;
00870     }
00871 
00872     // TODO we should get rid of this loop!
00873     if (start>0)
00874     {
00875         for (i=start; i<sequence_length; i++)
00876             obs[i-start]=obs[i];
00877     }
00878 }
00879 
00880 template <class ST>
00881 void CAlphabet::translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00882 {
00883     ASSERT(gap>=0);
00884 
00885     const int32_t start_gap=(p_order-gap)/2;
00886     const int32_t end_gap=start_gap+gap;
00887 
00888     int32_t i,j;
00889     ST value=0;
00890 
00891     // almost all positions
00892     for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
00893     {
00894         value=0;
00895         for (j=i; j>=i-p_order+1; j--)
00896         {
00897             if (i-j<start_gap)
00898                 value= (value << max_val) | obs[j];
00899             else if (i-j>=end_gap)
00900                 value= (value << max_val) | obs[j];
00901         }
00902         obs[i]= (ST) value;
00903     }
00904 
00905     // the remaining `order` positions
00906     for (i=p_order-2;i>=0;i--)
00907     {
00908         if (i>=sequence_length)
00909             continue;
00910 
00911         value=0;
00912         for (j=i; j>=i-p_order+1; j--)
00913         {
00914             if (i-j<start_gap)
00915             {
00916                 value= value << max_val;
00917                 if (j>=0 && j<sequence_length)
00918                     value|=obs[j];
00919             }
00920             else if (i-j>=end_gap)
00921             {
00922                 value= value << max_val;
00923                 if (j>=0 && j<sequence_length)
00924                     value|=obs[j];
00925             }
00926         }
00927         obs[i]=value;
00928     }
00929 
00930     // TODO we should get rid of this loop!
00931     if (start>0)
00932     {
00933         for (i=start; i<sequence_length; i++)
00934             obs[i-start]=obs[i];
00935     }
00936 }
00937 
00938 template<> void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00939 {
00940 }
00941 
00942 template<> void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00943 {
00944 }
00945 
00946 template<> void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00947 {
00948 }
00949 
00950 template<> void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00951 {
00952 }
00953 
00954 template<> void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00955 {
00956 }
00957 
00958 template<> void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00959 {
00960 }
00961 
00962 template void CAlphabet::translate_from_single_order<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00963 template void CAlphabet::translate_from_single_order<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00964 template void CAlphabet::translate_from_single_order<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00965 template void CAlphabet::translate_from_single_order<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00966 template void CAlphabet::translate_from_single_order<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00967 template void CAlphabet::translate_from_single_order<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00968 template void CAlphabet::translate_from_single_order<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00969 template void CAlphabet::translate_from_single_order<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00970 template void CAlphabet::translate_from_single_order<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00971 template void CAlphabet::translate_from_single_order<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00972 
00973 template void CAlphabet::translate_from_single_order<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00974 template void CAlphabet::translate_from_single_order<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00975 template void CAlphabet::translate_from_single_order<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00976 template void CAlphabet::translate_from_single_order<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00977 template void CAlphabet::translate_from_single_order<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00978 template void CAlphabet::translate_from_single_order<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00979 template void CAlphabet::translate_from_single_order<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00980 template void CAlphabet::translate_from_single_order<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00981 template void CAlphabet::translate_from_single_order<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00982 template void CAlphabet::translate_from_single_order<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00983 
00984 template void CAlphabet::translate_from_single_order_reversed<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00985 template void CAlphabet::translate_from_single_order_reversed<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00986 template void CAlphabet::translate_from_single_order_reversed<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00987 template void CAlphabet::translate_from_single_order_reversed<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00988 template void CAlphabet::translate_from_single_order_reversed<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00989 template void CAlphabet::translate_from_single_order_reversed<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00990 template void CAlphabet::translate_from_single_order_reversed<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00991 template void CAlphabet::translate_from_single_order_reversed<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00992 template void CAlphabet::translate_from_single_order_reversed<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00993 template void CAlphabet::translate_from_single_order_reversed<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00994 
00995 template void CAlphabet::translate_from_single_order_reversed<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00996 template void CAlphabet::translate_from_single_order_reversed<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00997 template void CAlphabet::translate_from_single_order_reversed<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00998 template void CAlphabet::translate_from_single_order_reversed<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00999 template void CAlphabet::translate_from_single_order_reversed<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01000 template void CAlphabet::translate_from_single_order_reversed<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01001 template void CAlphabet::translate_from_single_order_reversed<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01002 template void CAlphabet::translate_from_single_order_reversed<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01003 template void CAlphabet::translate_from_single_order_reversed<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01004 template void CAlphabet::translate_from_single_order_reversed<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01005 template void CAlphabet::translate_from_single_order_reversed<float32_t>(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01006 template void CAlphabet::translate_from_single_order_reversed<float64_t>(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01007 template void CAlphabet::translate_from_single_order_reversed<floatmax_t>(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01008 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation