00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 #include <string.h>
00012 #include <math.h>
00013 
00014 #include <shogun/features/Alphabet.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/base/Parameter.h>
00017 
00018 using namespace shogun;
00019 
00020 
00021 const uint8_t CAlphabet::B_A=0;
00022 const uint8_t CAlphabet::B_C=1;
00023 const uint8_t CAlphabet::B_G=2;
00024 const uint8_t CAlphabet::B_T=3;
00025 const uint8_t CAlphabet::B_0=4;
00026 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff;
00027 const char* CAlphabet::alphabet_names[18]={
00028     "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM",
00029     "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID",
00030     "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN",
00031     "SNP", "RAWSNP"};
00032 
00033 
00034 CAlphabet::CAlphabet()
00035   : CSGObject()
00036 {
00037     init();
00038 }
00039 
00040 CAlphabet::CAlphabet(char* al, int32_t len)
00041 : CSGObject()
00042 {
00043     init();
00044 
00045     EAlphabet alpha=NONE;
00046 
00047     if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA")))
00048         alpha = DNA;
00049     else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA")))
00050         alpha = RAWDNA;
00051     else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA")))
00052         alpha = RNA;
00053     else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN")))
00054         alpha = PROTEIN;
00055     else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY")))
00056         alpha = BINARY;
00057     else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM")))
00058         alpha = ALPHANUM;
00059     else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE")))
00060         alpha = CUBE;
00061     else if (len>=(int32_t) strlen("DIGIT2") && !strncmp(al, "DIGIT2", strlen("DIGIT2")))
00062         alpha = DIGIT2;
00063     else if (len>=(int32_t) strlen("DIGIT") && !strncmp(al, "DIGIT", strlen("DIGIT")))
00064         alpha = DIGIT;
00065     else if (len>=(int32_t) strlen("RAWDIGIT2") && !strncmp(al, "RAWDIGIT2", strlen("RAWDIGIT2")))
00066         alpha = RAWDIGIT2;
00067     else if (len>=(int32_t) strlen("RAWDIGIT") && !strncmp(al, "RAWDIGIT", strlen("RAWDIGIT")))
00068         alpha = RAWDIGIT;
00069     else if (len>=(int32_t) strlen("SNP") && !strncmp(al, "SNP", strlen("SNP")))
00070         alpha = SNP;
00071     else if (len>=(int32_t) strlen("RAWSNP") && !strncmp(al, "RAWSNP", strlen("RAWSNP")))
00072         alpha = RAWSNP;
00073     else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) ||
00074             (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW"))))
00075         alpha = RAWBYTE;
00076     else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID")))
00077         alpha = IUPAC_NUCLEIC_ACID;
00078     else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID")))
00079         alpha = IUPAC_AMINO_ACID;
00080     else {
00081       SG_ERROR( "unknown alphabet %s\n", al);
00082    }
00083 
00084     set_alphabet(alpha);
00085 }
00086 
00087 CAlphabet::CAlphabet(EAlphabet alpha)
00088 : CSGObject()
00089 {
00090     init();
00091     set_alphabet(alpha);
00092 }
00093 
00094 CAlphabet::CAlphabet(CAlphabet* a)
00095 : CSGObject()
00096 {
00097     init();
00098     ASSERT(a);
00099     set_alphabet(a->get_alphabet());
00100     copy_histogram(a);
00101 }
00102 
00103 CAlphabet::~CAlphabet()
00104 {
00105 }
00106 
00107 bool CAlphabet::set_alphabet(EAlphabet alpha)
00108 {
00109     bool result=true;
00110     alphabet=alpha;
00111 
00112     switch (alphabet)
00113     {
00114         case DNA:
00115         case RAWDNA:
00116             num_symbols = 4;
00117             break;
00118         case RNA:
00119             num_symbols = 4;
00120             break;
00121         case PROTEIN:
00122             num_symbols = 26;
00123             break;
00124         case BINARY:
00125             num_symbols = 2;
00126             break;
00127         case ALPHANUM:
00128             num_symbols = 36;
00129             break;
00130         case CUBE:
00131             num_symbols = 6;
00132             break;
00133         case RAWBYTE:
00134             num_symbols = 256;
00135             break;
00136         case IUPAC_NUCLEIC_ACID:
00137             num_symbols = 16;
00138             break;
00139         case IUPAC_AMINO_ACID:
00140             num_symbols = 23;
00141             break;
00142         case NONE:
00143             num_symbols = 0;
00144             break;
00145         case DIGIT2:
00146             num_symbols = 3;
00147             break;
00148         case DIGIT:
00149             num_symbols = 10;
00150             break;
00151         case RAWDIGIT2:
00152             num_symbols = 3;
00153             break;
00154         case RAWDIGIT:
00155             num_symbols = 10;
00156             break;
00157         case SNP:
00158             num_symbols = 5;
00159             break;
00160         case RAWSNP:
00161             num_symbols = 5;
00162             break;
00163         default:
00164             num_symbols = 0;
00165             result=false;
00166             break;
00167     }
00168 
00169     num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2));
00170     init_map_table();
00171     clear_histogram();
00172 
00173     SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet));
00174 
00175     return result;
00176 }
00177 
00178 void CAlphabet::init_map_table()
00179 {
00180     for (int32_t i=0; i<(1<<(8*sizeof(uint8_t))); i++)
00181     {
00182         maptable_to_bin[i] = MAPTABLE_UNDEF;
00183         maptable_to_char[i] = MAPTABLE_UNDEF;
00184         valid_chars[i] = false;
00185     }
00186 
00187     switch (alphabet)
00188     {
00189         case RAWDIGIT:
00190             for (uint8_t i=0; i<=9; i++)
00191             {
00192                 valid_chars[i]=true;
00193                 maptable_to_bin[i]=i;
00194                 maptable_to_char[i]=i;
00195             }
00196             break;
00197 
00198         case RAWDIGIT2:
00199             for (uint8_t i=0; i<=2; i++)
00200             {
00201                 valid_chars[i]=true;
00202                 maptable_to_bin[i]=i;
00203                 maptable_to_char[i]=i;
00204             }
00205             break;
00206 
00207         case DIGIT:
00208             valid_chars[(uint8_t) '0']=true;
00209             valid_chars[(uint8_t) '1']=true;
00210             valid_chars[(uint8_t) '2']=true;
00211             valid_chars[(uint8_t) '3']=true;
00212             valid_chars[(uint8_t) '4']=true;
00213             valid_chars[(uint8_t) '5']=true;
00214             valid_chars[(uint8_t) '6']=true;
00215             valid_chars[(uint8_t) '7']=true;
00216             valid_chars[(uint8_t) '8']=true;
00217             valid_chars[(uint8_t) '9']=true;    
00218 
00219             maptable_to_bin[(uint8_t) '0']=0;
00220             maptable_to_bin[(uint8_t) '1']=1;
00221             maptable_to_bin[(uint8_t) '2']=2;
00222             maptable_to_bin[(uint8_t) '3']=3;
00223             maptable_to_bin[(uint8_t) '4']=4;
00224             maptable_to_bin[(uint8_t) '5']=5;
00225             maptable_to_bin[(uint8_t) '6']=6;
00226             maptable_to_bin[(uint8_t) '7']=7;
00227             maptable_to_bin[(uint8_t) '8']=8;
00228             maptable_to_bin[(uint8_t) '9']=9;   
00229 
00230             maptable_to_char[(uint8_t) 0]='0';
00231             maptable_to_char[(uint8_t) 1]='1';
00232             maptable_to_char[(uint8_t) 2]='2';
00233             maptable_to_char[(uint8_t) 3]='3';
00234             maptable_to_char[(uint8_t) 4]='4';
00235             maptable_to_char[(uint8_t) 5]='5';
00236             maptable_to_char[(uint8_t) 6]='6';
00237             maptable_to_char[(uint8_t) 7]='7';
00238             maptable_to_char[(uint8_t) 8]='8';
00239             maptable_to_char[(uint8_t) 9]='9';  
00240             break;
00241 
00242         case DIGIT2:
00243             valid_chars[(uint8_t) '0']=true;
00244             valid_chars[(uint8_t) '1']=true;
00245             valid_chars[(uint8_t) '2']=true; 
00246 
00247             maptable_to_bin[(uint8_t) '0']=0;
00248             maptable_to_bin[(uint8_t) '1']=1;
00249             maptable_to_bin[(uint8_t) '2']=2;   
00250 
00251             maptable_to_char[(uint8_t) 0]='0';
00252             maptable_to_char[(uint8_t) 1]='1';
00253             maptable_to_char[(uint8_t) 2]='2'; 
00254             break;
00255 
00256         case CUBE:
00257             valid_chars[(uint8_t) '1']=true;
00258             valid_chars[(uint8_t) '2']=true;
00259             valid_chars[(uint8_t) '3']=true;
00260             valid_chars[(uint8_t) '4']=true;
00261             valid_chars[(uint8_t) '5']=true;
00262             valid_chars[(uint8_t) '6']=true;    
00263 
00264             maptable_to_bin[(uint8_t) '1']=0;
00265             maptable_to_bin[(uint8_t) '2']=1;
00266             maptable_to_bin[(uint8_t) '3']=2;
00267             maptable_to_bin[(uint8_t) '4']=3;
00268             maptable_to_bin[(uint8_t) '5']=4;
00269             maptable_to_bin[(uint8_t) '6']=5;   
00270 
00271             maptable_to_char[(uint8_t) 0]='1';
00272             maptable_to_char[(uint8_t) 1]='2';
00273             maptable_to_char[(uint8_t) 2]='3';
00274             maptable_to_char[(uint8_t) 3]='4';
00275             maptable_to_char[(uint8_t) 4]='5';
00276             maptable_to_char[(uint8_t) 5]='6';  
00277             break;
00278 
00279         case PROTEIN:
00280             {
00281                 int32_t skip=0 ;
00282                 for (int32_t i=0; i<21; i++)
00283                 {
00284                     if (i==1) skip++ ;
00285                     if (i==8) skip++ ;
00286                     if (i==12) skip++ ;
00287                     if (i==17) skip++ ;
00288                     valid_chars['A'+i+skip]=true;
00289                     maptable_to_bin['A'+i+skip]=i ;
00290                     maptable_to_char[i]='A'+i+skip ;
00291                 } ;                   
00292             } ;
00293             break;
00294 
00295         case BINARY:
00296             valid_chars[(uint8_t) '0']=true;
00297             valid_chars[(uint8_t) '1']=true;
00298 
00299             maptable_to_bin[(uint8_t) '0']=0;
00300             maptable_to_bin[(uint8_t) '1']=1;
00301 
00302             maptable_to_char[0]=(uint8_t) '0';
00303             maptable_to_char[1]=(uint8_t) '1';
00304             break;
00305 
00306         case ALPHANUM:
00307             {
00308                 for (int32_t i=0; i<26; i++)
00309                 {
00310                     valid_chars['A'+i]=true;
00311                     maptable_to_bin['A'+i]=i ;
00312                     maptable_to_char[i]='A'+i ;
00313                 } ;
00314                 for (int32_t i=0; i<10; i++)
00315                 {
00316                     valid_chars['0'+i]=true;
00317                     maptable_to_bin['0'+i]=26+i ;
00318                     maptable_to_char[26+i]='0'+i ;
00319                 } ;        
00320             } ;
00321             break;
00322 
00323         case RAWBYTE:
00324             {
00325                 
00326                 for (int32_t i=0; i<256; i++)
00327                 {
00328                     valid_chars[i]=true;
00329                     maptable_to_bin[i]=i;
00330                     maptable_to_char[i]=i;
00331                 }
00332             }
00333             break;
00334 
00335         case DNA:
00336             valid_chars[(uint8_t) 'A']=true;
00337             valid_chars[(uint8_t) 'C']=true;
00338             valid_chars[(uint8_t) 'G']=true;
00339             valid_chars[(uint8_t) 'T']=true;
00340 
00341             maptable_to_bin[(uint8_t) 'A']=B_A;
00342             maptable_to_bin[(uint8_t) 'C']=B_C;
00343             maptable_to_bin[(uint8_t) 'G']=B_G;
00344             maptable_to_bin[(uint8_t) 'T']=B_T;
00345 
00346             maptable_to_char[B_A]='A';
00347             maptable_to_char[B_C]='C';
00348             maptable_to_char[B_G]='G';
00349             maptable_to_char[B_T]='T';
00350             break;
00351         case RAWDNA:
00352             {
00353                 
00354                 for (int32_t i=0; i<4; i++)
00355                 {
00356                     valid_chars[i]=true;
00357                     maptable_to_bin[i]=i;
00358                     maptable_to_char[i]=i;
00359                 }
00360             }
00361             break;
00362 
00363         case SNP:
00364             valid_chars[(uint8_t) 'A']=true;
00365             valid_chars[(uint8_t) 'C']=true;
00366             valid_chars[(uint8_t) 'G']=true;
00367             valid_chars[(uint8_t) 'T']=true;
00368             valid_chars[(uint8_t) '0']=true;
00369 
00370             maptable_to_bin[(uint8_t) 'A']=B_A;
00371             maptable_to_bin[(uint8_t) 'C']=B_C;
00372             maptable_to_bin[(uint8_t) 'G']=B_G;
00373             maptable_to_bin[(uint8_t) 'T']=B_T;
00374             maptable_to_bin[(uint8_t) '0']=B_0;
00375 
00376             maptable_to_char[B_A]='A';
00377             maptable_to_char[B_C]='C';
00378             maptable_to_char[B_G]='G';
00379             maptable_to_char[B_T]='T';
00380             maptable_to_char[B_0]='0';
00381             break;
00382         case RAWSNP:
00383             {
00384                 
00385                 for (int32_t i=0; i<5; i++)
00386                 {
00387                     valid_chars[i]=true;
00388                     maptable_to_bin[i]=i;
00389                     maptable_to_char[i]=i;
00390                 }
00391             }
00392             break;
00393 
00394         case RNA:
00395             valid_chars[(uint8_t) 'A']=true;
00396             valid_chars[(uint8_t) 'C']=true;
00397             valid_chars[(uint8_t) 'G']=true;
00398             valid_chars[(uint8_t) 'U']=true;
00399 
00400             maptable_to_bin[(uint8_t) 'A']=B_A;
00401             maptable_to_bin[(uint8_t) 'C']=B_C;
00402             maptable_to_bin[(uint8_t) 'G']=B_G;
00403             maptable_to_bin[(uint8_t) 'U']=B_T;
00404 
00405             maptable_to_char[B_A]='A';
00406             maptable_to_char[B_C]='C';
00407             maptable_to_char[B_G]='G';
00408             maptable_to_char[B_T]='U';
00409             break;
00410 
00411         case IUPAC_NUCLEIC_ACID:
00412             valid_chars[(uint8_t) 'A']=true; 
00413             valid_chars[(uint8_t) 'C']=true; 
00414             valid_chars[(uint8_t) 'G']=true; 
00415             valid_chars[(uint8_t) 'T']=true; 
00416             valid_chars[(uint8_t) 'U']=true; 
00417             valid_chars[(uint8_t) 'R']=true; 
00418             valid_chars[(uint8_t) 'Y']=true; 
00419             valid_chars[(uint8_t) 'M']=true; 
00420             valid_chars[(uint8_t) 'K']=true; 
00421             valid_chars[(uint8_t) 'W']=true; 
00422             valid_chars[(uint8_t) 'S']=true; 
00423             valid_chars[(uint8_t) 'B']=true; 
00424             valid_chars[(uint8_t) 'D']=true; 
00425             valid_chars[(uint8_t) 'H']=true; 
00426             valid_chars[(uint8_t) 'V']=true; 
00427             valid_chars[(uint8_t) 'N']=true; 
00428 
00429             maptable_to_bin[(uint8_t) 'A']=0; 
00430             maptable_to_bin[(uint8_t) 'C']=1; 
00431             maptable_to_bin[(uint8_t) 'G']=2; 
00432             maptable_to_bin[(uint8_t) 'T']=3; 
00433             maptable_to_bin[(uint8_t) 'U']=4; 
00434             maptable_to_bin[(uint8_t) 'R']=5; 
00435             maptable_to_bin[(uint8_t) 'Y']=6; 
00436             maptable_to_bin[(uint8_t) 'M']=7; 
00437             maptable_to_bin[(uint8_t) 'K']=8; 
00438             maptable_to_bin[(uint8_t) 'W']=9; 
00439             maptable_to_bin[(uint8_t) 'S']=10; 
00440             maptable_to_bin[(uint8_t) 'B']=11; 
00441             maptable_to_bin[(uint8_t) 'D']=12; 
00442             maptable_to_bin[(uint8_t) 'H']=13; 
00443             maptable_to_bin[(uint8_t) 'V']=14; 
00444             maptable_to_bin[(uint8_t) 'N']=15; 
00445 
00446             maptable_to_char[0]=(uint8_t) 'A'; 
00447             maptable_to_char[1]=(uint8_t) 'C'; 
00448             maptable_to_char[2]=(uint8_t) 'G'; 
00449             maptable_to_char[3]=(uint8_t) 'T'; 
00450             maptable_to_char[4]=(uint8_t) 'U'; 
00451             maptable_to_char[5]=(uint8_t) 'R'; 
00452             maptable_to_char[6]=(uint8_t) 'Y'; 
00453             maptable_to_char[7]=(uint8_t) 'M'; 
00454             maptable_to_char[8]=(uint8_t) 'K'; 
00455             maptable_to_char[9]=(uint8_t) 'W'; 
00456             maptable_to_char[10]=(uint8_t) 'S'; 
00457             maptable_to_char[11]=(uint8_t) 'B'; 
00458             maptable_to_char[12]=(uint8_t) 'D'; 
00459             maptable_to_char[13]=(uint8_t) 'H'; 
00460             maptable_to_char[14]=(uint8_t) 'V'; 
00461             maptable_to_char[15]=(uint8_t) 'N'; 
00462             break;
00463 
00464         case IUPAC_AMINO_ACID:
00465             valid_chars[(uint8_t) 'A']=true; 
00466             valid_chars[(uint8_t) 'R']=true; 
00467             valid_chars[(uint8_t) 'N']=true; 
00468             valid_chars[(uint8_t) 'D']=true; 
00469             valid_chars[(uint8_t) 'C']=true; 
00470             valid_chars[(uint8_t) 'Q']=true; 
00471             valid_chars[(uint8_t) 'E']=true; 
00472             valid_chars[(uint8_t) 'G']=true; 
00473             valid_chars[(uint8_t) 'H']=true; 
00474             valid_chars[(uint8_t) 'I']=true; 
00475             valid_chars[(uint8_t) 'L']=true; 
00476             valid_chars[(uint8_t) 'K']=true; 
00477             valid_chars[(uint8_t) 'M']=true; 
00478             valid_chars[(uint8_t) 'F']=true; 
00479             valid_chars[(uint8_t) 'P']=true; 
00480             valid_chars[(uint8_t) 'S']=true; 
00481             valid_chars[(uint8_t) 'T']=true; 
00482             valid_chars[(uint8_t) 'W']=true; 
00483             valid_chars[(uint8_t) 'Y']=true; 
00484             valid_chars[(uint8_t) 'V']=true; 
00485             valid_chars[(uint8_t) 'B']=true; 
00486             valid_chars[(uint8_t) 'Z']=true; 
00487             valid_chars[(uint8_t) 'X']=true; 
00488 
00489             maptable_to_bin[(uint8_t) 'A']=0;  
00490             maptable_to_bin[(uint8_t) 'R']=1;  
00491             maptable_to_bin[(uint8_t) 'N']=2;  
00492             maptable_to_bin[(uint8_t) 'D']=3;  
00493             maptable_to_bin[(uint8_t) 'C']=4;  
00494             maptable_to_bin[(uint8_t) 'Q']=5;  
00495             maptable_to_bin[(uint8_t) 'E']=6;  
00496             maptable_to_bin[(uint8_t) 'G']=7;  
00497             maptable_to_bin[(uint8_t) 'H']=8;  
00498             maptable_to_bin[(uint8_t) 'I']=9;  
00499             maptable_to_bin[(uint8_t) 'L']=10; 
00500             maptable_to_bin[(uint8_t) 'K']=11; 
00501             maptable_to_bin[(uint8_t) 'M']=12; 
00502             maptable_to_bin[(uint8_t) 'F']=13; 
00503             maptable_to_bin[(uint8_t) 'P']=14; 
00504             maptable_to_bin[(uint8_t) 'S']=15; 
00505             maptable_to_bin[(uint8_t) 'T']=16; 
00506             maptable_to_bin[(uint8_t) 'W']=17; 
00507             maptable_to_bin[(uint8_t) 'Y']=18; 
00508             maptable_to_bin[(uint8_t) 'V']=19; 
00509             maptable_to_bin[(uint8_t) 'B']=20; 
00510             maptable_to_bin[(uint8_t) 'Z']=21; 
00511             maptable_to_bin[(uint8_t) 'X']=22; 
00512 
00513             maptable_to_char[0]=(uint8_t) 'A';  
00514             maptable_to_char[1]=(uint8_t) 'R';  
00515             maptable_to_char[2]=(uint8_t) 'N';  
00516             maptable_to_char[3]=(uint8_t) 'D';  
00517             maptable_to_char[4]=(uint8_t) 'C';  
00518             maptable_to_char[5]=(uint8_t) 'Q';  
00519             maptable_to_char[6]=(uint8_t) 'E';  
00520             maptable_to_char[7]=(uint8_t) 'G';  
00521             maptable_to_char[8]=(uint8_t) 'H';  
00522             maptable_to_char[9]=(uint8_t) 'I';  
00523             maptable_to_char[10]=(uint8_t) 'L'; 
00524             maptable_to_char[11]=(uint8_t) 'K'; 
00525             maptable_to_char[12]=(uint8_t) 'M'; 
00526             maptable_to_char[13]=(uint8_t) 'F'; 
00527             maptable_to_char[14]=(uint8_t) 'P'; 
00528             maptable_to_char[15]=(uint8_t) 'S'; 
00529             maptable_to_char[16]=(uint8_t) 'T'; 
00530             maptable_to_char[17]=(uint8_t) 'W'; 
00531             maptable_to_char[18]=(uint8_t) 'Y'; 
00532             maptable_to_char[19]=(uint8_t) 'V'; 
00533             maptable_to_char[20]=(uint8_t) 'B'; 
00534             maptable_to_char[21]=(uint8_t) 'Z'; 
00535             maptable_to_char[22]=(uint8_t) 'X'; 
00536             break;
00537 
00538         default:
00539             break; 
00540     };
00541 }
00542 
00543 void CAlphabet::clear_histogram()
00544 {
00545     memset(histogram, 0, sizeof(histogram));
00546     print_histogram();
00547 }
00548 
00549 int32_t CAlphabet::get_max_value_in_histogram()
00550 {
00551     int32_t max_sym=-1;
00552     for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--)
00553     {
00554         if (histogram[i])
00555         {
00556             max_sym=i;
00557             break;
00558         }
00559     }
00560 
00561     return max_sym;
00562 }
00563 
00564 int32_t CAlphabet::get_num_symbols_in_histogram()
00565 {
00566     int32_t num_sym=0;
00567     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00568     {
00569         if (histogram[i])
00570             num_sym++;
00571     }
00572 
00573     return num_sym;
00574 }
00575 
00576 int32_t CAlphabet::get_num_bits_in_histogram()
00577 {
00578     int32_t num_sym=get_num_symbols_in_histogram();
00579     if (num_sym>0)
00580         return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2));
00581     else
00582         return 0;
00583 }
00584 
00585 void CAlphabet::print_histogram()
00586 {
00587     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00588     {
00589         if (histogram[i])
00590             SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]);
00591     }
00592 }
00593 
00594 bool CAlphabet::check_alphabet(bool print_error)
00595 {
00596     bool result = true;
00597 
00598     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00599     {
00600         if (histogram[i]>0 && valid_chars[i]==0)
00601         {
00602             result=false;
00603             break;
00604         }
00605     }
00606 
00607     if (!result && print_error)
00608     {
00609         print_histogram();
00610         SG_ERROR( "ALPHABET does not contain all symbols in histogram\n");
00611     }
00612 
00613     return result;
00614 }
00615 
00616 bool CAlphabet::check_alphabet_size(bool print_error)
00617 {
00618     if (get_num_bits_in_histogram() > get_num_bits())
00619     {
00620         if (print_error)
00621         {
00622             print_histogram();
00623             fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ;
00624          SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n");
00625         }
00626         return false;
00627     }
00628     else
00629         return true;
00630 
00631 }
00632 
00633 void CAlphabet::copy_histogram(CAlphabet* a)
00634 {
00635     memcpy(histogram, a->get_histogram(), sizeof(histogram));
00636 }
00637 
00638 const char* CAlphabet::get_alphabet_name(EAlphabet alphabet)
00639 {
00640 
00641     int32_t idx;
00642     switch (alphabet)
00643     {
00644         case DNA:
00645             idx=0;
00646             break;
00647         case RAWDNA:
00648             idx=1;
00649             break;
00650         case RNA:
00651             idx=2;
00652             break;
00653         case PROTEIN:
00654             idx=3;
00655             break;
00656         case BINARY:
00657             idx=4;
00658             break;
00659         case ALPHANUM:
00660             idx=5;
00661             break;
00662         case CUBE:
00663             idx=6;
00664             break;
00665         case RAWBYTE:
00666             idx=7;
00667             break;
00668         case IUPAC_NUCLEIC_ACID:
00669             idx=8;
00670             break;
00671         case IUPAC_AMINO_ACID:
00672             idx=9;
00673             break;
00674         case NONE:
00675             idx=10;
00676             break;
00677         case DIGIT:
00678             idx=11;
00679             break;
00680         case DIGIT2:
00681             idx=12;
00682             break;
00683         default:
00684             idx=13;
00685             break;
00686     }
00687     return alphabet_names[idx];
00688 }
00689 
00690 void CAlphabet::init()
00691 {
00692     alphabet = NONE;
00693     num_symbols = 0;
00694     num_bits = 0;
00695 
00696     memset(valid_chars, 0, sizeof (valid_chars));
00697     memset(maptable_to_bin, 0, sizeof (maptable_to_bin));
00698     memset(maptable_to_char, 0, sizeof (maptable_to_char));
00699     memset(histogram, 0, sizeof (histogram));
00700 
00701 
00702     m_parameters->add((machine_int_t*) &alphabet, "alphabet",
00703                       "Alphabet enum.");
00704     m_parameters->add(&num_symbols, "num_symbols",
00705                       "Number of symbols.");
00706     m_parameters->add(&num_bits, "num_bits",
00707                       "Number of bits.");
00708 
00709     
00710 
00711 
00712 
00713     
00714 
00715 
00716 }
00717 
00718 void CAlphabet::load_serializable_post(void) throw (ShogunException)
00719 {
00720     CSGObject::load_serializable_post();
00721 
00722     init_map_table();
00723 }