00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <string.h>
00012 #include <math.h>
00013
00014 #include <shogun/features/Alphabet.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/base/Parameter.h>
00017
00018 using namespace shogun;
00019
00020
00021 const uint8_t CAlphabet::B_A=0;
00022 const uint8_t CAlphabet::B_C=1;
00023 const uint8_t CAlphabet::B_G=2;
00024 const uint8_t CAlphabet::B_T=3;
00025 const uint8_t CAlphabet::B_0=4;
00026 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff;
00027 const char* CAlphabet::alphabet_names[18]={
00028 "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM",
00029 "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID",
00030 "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN",
00031 "SNP", "RAWSNP"};
00032
00033
00034 CAlphabet::CAlphabet()
00035 : CSGObject()
00036 {
00037 init();
00038 }
00039
00040 CAlphabet::CAlphabet(char* al, int32_t len)
00041 : CSGObject()
00042 {
00043 init();
00044
00045 EAlphabet alpha=NONE;
00046
00047 if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA")))
00048 alpha = DNA;
00049 else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA")))
00050 alpha = RAWDNA;
00051 else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA")))
00052 alpha = RNA;
00053 else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN")))
00054 alpha = PROTEIN;
00055 else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY")))
00056 alpha = BINARY;
00057 else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM")))
00058 alpha = ALPHANUM;
00059 else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE")))
00060 alpha = CUBE;
00061 else if (len>=(int32_t) strlen("DIGIT2") && !strncmp(al, "DIGIT2", strlen("DIGIT2")))
00062 alpha = DIGIT2;
00063 else if (len>=(int32_t) strlen("DIGIT") && !strncmp(al, "DIGIT", strlen("DIGIT")))
00064 alpha = DIGIT;
00065 else if (len>=(int32_t) strlen("RAWDIGIT2") && !strncmp(al, "RAWDIGIT2", strlen("RAWDIGIT2")))
00066 alpha = RAWDIGIT2;
00067 else if (len>=(int32_t) strlen("RAWDIGIT") && !strncmp(al, "RAWDIGIT", strlen("RAWDIGIT")))
00068 alpha = RAWDIGIT;
00069 else if (len>=(int32_t) strlen("SNP") && !strncmp(al, "SNP", strlen("SNP")))
00070 alpha = SNP;
00071 else if (len>=(int32_t) strlen("RAWSNP") && !strncmp(al, "RAWSNP", strlen("RAWSNP")))
00072 alpha = RAWSNP;
00073 else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) ||
00074 (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW"))))
00075 alpha = RAWBYTE;
00076 else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID")))
00077 alpha = IUPAC_NUCLEIC_ACID;
00078 else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID")))
00079 alpha = IUPAC_AMINO_ACID;
00080 else {
00081 SG_ERROR( "unknown alphabet %s\n", al);
00082 }
00083
00084 set_alphabet(alpha);
00085 }
00086
00087 CAlphabet::CAlphabet(EAlphabet alpha)
00088 : CSGObject()
00089 {
00090 init();
00091 set_alphabet(alpha);
00092 }
00093
00094 CAlphabet::CAlphabet(CAlphabet* a)
00095 : CSGObject()
00096 {
00097 init();
00098 ASSERT(a);
00099 set_alphabet(a->get_alphabet());
00100 copy_histogram(a);
00101 }
00102
00103 CAlphabet::~CAlphabet()
00104 {
00105 }
00106
00107 bool CAlphabet::set_alphabet(EAlphabet alpha)
00108 {
00109 bool result=true;
00110 alphabet=alpha;
00111
00112 switch (alphabet)
00113 {
00114 case DNA:
00115 case RAWDNA:
00116 num_symbols = 4;
00117 break;
00118 case RNA:
00119 num_symbols = 4;
00120 break;
00121 case PROTEIN:
00122 num_symbols = 26;
00123 break;
00124 case BINARY:
00125 num_symbols = 2;
00126 break;
00127 case ALPHANUM:
00128 num_symbols = 36;
00129 break;
00130 case CUBE:
00131 num_symbols = 6;
00132 break;
00133 case RAWBYTE:
00134 num_symbols = 256;
00135 break;
00136 case IUPAC_NUCLEIC_ACID:
00137 num_symbols = 16;
00138 break;
00139 case IUPAC_AMINO_ACID:
00140 num_symbols = 23;
00141 break;
00142 case NONE:
00143 num_symbols = 0;
00144 break;
00145 case DIGIT2:
00146 num_symbols = 3;
00147 break;
00148 case DIGIT:
00149 num_symbols = 10;
00150 break;
00151 case RAWDIGIT2:
00152 num_symbols = 3;
00153 break;
00154 case RAWDIGIT:
00155 num_symbols = 10;
00156 break;
00157 case SNP:
00158 num_symbols = 5;
00159 break;
00160 case RAWSNP:
00161 num_symbols = 5;
00162 break;
00163 default:
00164 num_symbols = 0;
00165 result=false;
00166 break;
00167 }
00168
00169 num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2));
00170 init_map_table();
00171 clear_histogram();
00172
00173 SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet));
00174
00175 return result;
00176 }
00177
00178 void CAlphabet::init_map_table()
00179 {
00180 for (int32_t i=0; i<(1<<(8*sizeof(uint8_t))); i++)
00181 {
00182 maptable_to_bin[i] = MAPTABLE_UNDEF;
00183 maptable_to_char[i] = MAPTABLE_UNDEF;
00184 valid_chars[i] = false;
00185 }
00186
00187 switch (alphabet)
00188 {
00189 case RAWDIGIT:
00190 for (uint8_t i=0; i<=9; i++)
00191 {
00192 valid_chars[i]=true;
00193 maptable_to_bin[i]=i;
00194 maptable_to_char[i]=i;
00195 }
00196 break;
00197
00198 case RAWDIGIT2:
00199 for (uint8_t i=0; i<=2; i++)
00200 {
00201 valid_chars[i]=true;
00202 maptable_to_bin[i]=i;
00203 maptable_to_char[i]=i;
00204 }
00205 break;
00206
00207 case DIGIT:
00208 valid_chars[(uint8_t) '0']=true;
00209 valid_chars[(uint8_t) '1']=true;
00210 valid_chars[(uint8_t) '2']=true;
00211 valid_chars[(uint8_t) '3']=true;
00212 valid_chars[(uint8_t) '4']=true;
00213 valid_chars[(uint8_t) '5']=true;
00214 valid_chars[(uint8_t) '6']=true;
00215 valid_chars[(uint8_t) '7']=true;
00216 valid_chars[(uint8_t) '8']=true;
00217 valid_chars[(uint8_t) '9']=true;
00218
00219 maptable_to_bin[(uint8_t) '0']=0;
00220 maptable_to_bin[(uint8_t) '1']=1;
00221 maptable_to_bin[(uint8_t) '2']=2;
00222 maptable_to_bin[(uint8_t) '3']=3;
00223 maptable_to_bin[(uint8_t) '4']=4;
00224 maptable_to_bin[(uint8_t) '5']=5;
00225 maptable_to_bin[(uint8_t) '6']=6;
00226 maptable_to_bin[(uint8_t) '7']=7;
00227 maptable_to_bin[(uint8_t) '8']=8;
00228 maptable_to_bin[(uint8_t) '9']=9;
00229
00230 maptable_to_char[(uint8_t) 0]='0';
00231 maptable_to_char[(uint8_t) 1]='1';
00232 maptable_to_char[(uint8_t) 2]='2';
00233 maptable_to_char[(uint8_t) 3]='3';
00234 maptable_to_char[(uint8_t) 4]='4';
00235 maptable_to_char[(uint8_t) 5]='5';
00236 maptable_to_char[(uint8_t) 6]='6';
00237 maptable_to_char[(uint8_t) 7]='7';
00238 maptable_to_char[(uint8_t) 8]='8';
00239 maptable_to_char[(uint8_t) 9]='9';
00240 break;
00241
00242 case DIGIT2:
00243 valid_chars[(uint8_t) '0']=true;
00244 valid_chars[(uint8_t) '1']=true;
00245 valid_chars[(uint8_t) '2']=true;
00246
00247 maptable_to_bin[(uint8_t) '0']=0;
00248 maptable_to_bin[(uint8_t) '1']=1;
00249 maptable_to_bin[(uint8_t) '2']=2;
00250
00251 maptable_to_char[(uint8_t) 0]='0';
00252 maptable_to_char[(uint8_t) 1]='1';
00253 maptable_to_char[(uint8_t) 2]='2';
00254 break;
00255
00256 case CUBE:
00257 valid_chars[(uint8_t) '1']=true;
00258 valid_chars[(uint8_t) '2']=true;
00259 valid_chars[(uint8_t) '3']=true;
00260 valid_chars[(uint8_t) '4']=true;
00261 valid_chars[(uint8_t) '5']=true;
00262 valid_chars[(uint8_t) '6']=true;
00263
00264 maptable_to_bin[(uint8_t) '1']=0;
00265 maptable_to_bin[(uint8_t) '2']=1;
00266 maptable_to_bin[(uint8_t) '3']=2;
00267 maptable_to_bin[(uint8_t) '4']=3;
00268 maptable_to_bin[(uint8_t) '5']=4;
00269 maptable_to_bin[(uint8_t) '6']=5;
00270
00271 maptable_to_char[(uint8_t) 0]='1';
00272 maptable_to_char[(uint8_t) 1]='2';
00273 maptable_to_char[(uint8_t) 2]='3';
00274 maptable_to_char[(uint8_t) 3]='4';
00275 maptable_to_char[(uint8_t) 4]='5';
00276 maptable_to_char[(uint8_t) 5]='6';
00277 break;
00278
00279 case PROTEIN:
00280 {
00281 int32_t skip=0 ;
00282 for (int32_t i=0; i<21; i++)
00283 {
00284 if (i==1) skip++ ;
00285 if (i==8) skip++ ;
00286 if (i==12) skip++ ;
00287 if (i==17) skip++ ;
00288 valid_chars['A'+i+skip]=true;
00289 maptable_to_bin['A'+i+skip]=i ;
00290 maptable_to_char[i]='A'+i+skip ;
00291 } ;
00292 } ;
00293 break;
00294
00295 case BINARY:
00296 valid_chars[(uint8_t) '0']=true;
00297 valid_chars[(uint8_t) '1']=true;
00298
00299 maptable_to_bin[(uint8_t) '0']=0;
00300 maptable_to_bin[(uint8_t) '1']=1;
00301
00302 maptable_to_char[0]=(uint8_t) '0';
00303 maptable_to_char[1]=(uint8_t) '1';
00304 break;
00305
00306 case ALPHANUM:
00307 {
00308 for (int32_t i=0; i<26; i++)
00309 {
00310 valid_chars[(uint8_t) 'A'+i]=true;
00311 maptable_to_bin[(uint8_t) 'A'+i]=i ;
00312 maptable_to_char[i]='A'+i ;
00313 } ;
00314 for (int32_t i=0; i<10; i++)
00315 {
00316 valid_chars[(uint8_t) '0'+i]=true;
00317 maptable_to_bin[(uint8_t) '0'+i]=26+i ;
00318 maptable_to_char[26+i]='0'+i ;
00319 } ;
00320 } ;
00321 break;
00322
00323 case RAWBYTE:
00324 {
00325
00326 for (int32_t i=0; i<256; i++)
00327 {
00328 valid_chars[i]=true;
00329 maptable_to_bin[i]=i;
00330 maptable_to_char[i]=i;
00331 }
00332 }
00333 break;
00334
00335 case DNA:
00336 valid_chars[(uint8_t) 'A']=true;
00337 valid_chars[(uint8_t) 'C']=true;
00338 valid_chars[(uint8_t) 'G']=true;
00339 valid_chars[(uint8_t) 'T']=true;
00340
00341 maptable_to_bin[(uint8_t) 'A']=B_A;
00342 maptable_to_bin[(uint8_t) 'C']=B_C;
00343 maptable_to_bin[(uint8_t) 'G']=B_G;
00344 maptable_to_bin[(uint8_t) 'T']=B_T;
00345
00346 maptable_to_char[B_A]='A';
00347 maptable_to_char[B_C]='C';
00348 maptable_to_char[B_G]='G';
00349 maptable_to_char[B_T]='T';
00350 break;
00351 case RAWDNA:
00352 {
00353
00354 for (int32_t i=0; i<4; i++)
00355 {
00356 valid_chars[i]=true;
00357 maptable_to_bin[i]=i;
00358 maptable_to_char[i]=i;
00359 }
00360 }
00361 break;
00362
00363 case SNP:
00364 valid_chars[(uint8_t) 'A']=true;
00365 valid_chars[(uint8_t) 'C']=true;
00366 valid_chars[(uint8_t) 'G']=true;
00367 valid_chars[(uint8_t) 'T']=true;
00368 valid_chars[(uint8_t) '0']=true;
00369
00370 maptable_to_bin[(uint8_t) 'A']=B_A;
00371 maptable_to_bin[(uint8_t) 'C']=B_C;
00372 maptable_to_bin[(uint8_t) 'G']=B_G;
00373 maptable_to_bin[(uint8_t) 'T']=B_T;
00374 maptable_to_bin[(uint8_t) '0']=B_0;
00375
00376 maptable_to_char[B_A]='A';
00377 maptable_to_char[B_C]='C';
00378 maptable_to_char[B_G]='G';
00379 maptable_to_char[B_T]='T';
00380 maptable_to_char[B_0]='0';
00381 break;
00382 case RAWSNP:
00383 {
00384
00385 for (int32_t i=0; i<5; i++)
00386 {
00387 valid_chars[i]=true;
00388 maptable_to_bin[i]=i;
00389 maptable_to_char[i]=i;
00390 }
00391 }
00392 break;
00393
00394 case RNA:
00395 valid_chars[(uint8_t) 'A']=true;
00396 valid_chars[(uint8_t) 'C']=true;
00397 valid_chars[(uint8_t) 'G']=true;
00398 valid_chars[(uint8_t) 'U']=true;
00399
00400 maptable_to_bin[(uint8_t) 'A']=B_A;
00401 maptable_to_bin[(uint8_t) 'C']=B_C;
00402 maptable_to_bin[(uint8_t) 'G']=B_G;
00403 maptable_to_bin[(uint8_t) 'U']=B_T;
00404
00405 maptable_to_char[B_A]='A';
00406 maptable_to_char[B_C]='C';
00407 maptable_to_char[B_G]='G';
00408 maptable_to_char[B_T]='U';
00409 break;
00410
00411 case IUPAC_NUCLEIC_ACID:
00412 valid_chars[(uint8_t) 'A']=true;
00413 valid_chars[(uint8_t) 'C']=true;
00414 valid_chars[(uint8_t) 'G']=true;
00415 valid_chars[(uint8_t) 'T']=true;
00416 valid_chars[(uint8_t) 'U']=true;
00417 valid_chars[(uint8_t) 'R']=true;
00418 valid_chars[(uint8_t) 'Y']=true;
00419 valid_chars[(uint8_t) 'M']=true;
00420 valid_chars[(uint8_t) 'K']=true;
00421 valid_chars[(uint8_t) 'W']=true;
00422 valid_chars[(uint8_t) 'S']=true;
00423 valid_chars[(uint8_t) 'B']=true;
00424 valid_chars[(uint8_t) 'D']=true;
00425 valid_chars[(uint8_t) 'H']=true;
00426 valid_chars[(uint8_t) 'V']=true;
00427 valid_chars[(uint8_t) 'N']=true;
00428
00429 maptable_to_bin[(uint8_t) 'A']=0;
00430 maptable_to_bin[(uint8_t) 'C']=1;
00431 maptable_to_bin[(uint8_t) 'G']=2;
00432 maptable_to_bin[(uint8_t) 'T']=3;
00433 maptable_to_bin[(uint8_t) 'U']=4;
00434 maptable_to_bin[(uint8_t) 'R']=5;
00435 maptable_to_bin[(uint8_t) 'Y']=6;
00436 maptable_to_bin[(uint8_t) 'M']=7;
00437 maptable_to_bin[(uint8_t) 'K']=8;
00438 maptable_to_bin[(uint8_t) 'W']=9;
00439 maptable_to_bin[(uint8_t) 'S']=10;
00440 maptable_to_bin[(uint8_t) 'B']=11;
00441 maptable_to_bin[(uint8_t) 'D']=12;
00442 maptable_to_bin[(uint8_t) 'H']=13;
00443 maptable_to_bin[(uint8_t) 'V']=14;
00444 maptable_to_bin[(uint8_t) 'N']=15;
00445
00446 maptable_to_char[0]=(uint8_t) 'A';
00447 maptable_to_char[1]=(uint8_t) 'C';
00448 maptable_to_char[2]=(uint8_t) 'G';
00449 maptable_to_char[3]=(uint8_t) 'T';
00450 maptable_to_char[4]=(uint8_t) 'U';
00451 maptable_to_char[5]=(uint8_t) 'R';
00452 maptable_to_char[6]=(uint8_t) 'Y';
00453 maptable_to_char[7]=(uint8_t) 'M';
00454 maptable_to_char[8]=(uint8_t) 'K';
00455 maptable_to_char[9]=(uint8_t) 'W';
00456 maptable_to_char[10]=(uint8_t) 'S';
00457 maptable_to_char[11]=(uint8_t) 'B';
00458 maptable_to_char[12]=(uint8_t) 'D';
00459 maptable_to_char[13]=(uint8_t) 'H';
00460 maptable_to_char[14]=(uint8_t) 'V';
00461 maptable_to_char[15]=(uint8_t) 'N';
00462 break;
00463
00464 case IUPAC_AMINO_ACID:
00465 valid_chars[(uint8_t) 'A']=true;
00466 valid_chars[(uint8_t) 'R']=true;
00467 valid_chars[(uint8_t) 'N']=true;
00468 valid_chars[(uint8_t) 'D']=true;
00469 valid_chars[(uint8_t) 'C']=true;
00470 valid_chars[(uint8_t) 'Q']=true;
00471 valid_chars[(uint8_t) 'E']=true;
00472 valid_chars[(uint8_t) 'G']=true;
00473 valid_chars[(uint8_t) 'H']=true;
00474 valid_chars[(uint8_t) 'I']=true;
00475 valid_chars[(uint8_t) 'L']=true;
00476 valid_chars[(uint8_t) 'K']=true;
00477 valid_chars[(uint8_t) 'M']=true;
00478 valid_chars[(uint8_t) 'F']=true;
00479 valid_chars[(uint8_t) 'P']=true;
00480 valid_chars[(uint8_t) 'S']=true;
00481 valid_chars[(uint8_t) 'T']=true;
00482 valid_chars[(uint8_t) 'W']=true;
00483 valid_chars[(uint8_t) 'Y']=true;
00484 valid_chars[(uint8_t) 'V']=true;
00485 valid_chars[(uint8_t) 'B']=true;
00486 valid_chars[(uint8_t) 'Z']=true;
00487 valid_chars[(uint8_t) 'X']=true;
00488
00489 maptable_to_bin[(uint8_t) 'A']=0;
00490 maptable_to_bin[(uint8_t) 'R']=1;
00491 maptable_to_bin[(uint8_t) 'N']=2;
00492 maptable_to_bin[(uint8_t) 'D']=3;
00493 maptable_to_bin[(uint8_t) 'C']=4;
00494 maptable_to_bin[(uint8_t) 'Q']=5;
00495 maptable_to_bin[(uint8_t) 'E']=6;
00496 maptable_to_bin[(uint8_t) 'G']=7;
00497 maptable_to_bin[(uint8_t) 'H']=8;
00498 maptable_to_bin[(uint8_t) 'I']=9;
00499 maptable_to_bin[(uint8_t) 'L']=10;
00500 maptable_to_bin[(uint8_t) 'K']=11;
00501 maptable_to_bin[(uint8_t) 'M']=12;
00502 maptable_to_bin[(uint8_t) 'F']=13;
00503 maptable_to_bin[(uint8_t) 'P']=14;
00504 maptable_to_bin[(uint8_t) 'S']=15;
00505 maptable_to_bin[(uint8_t) 'T']=16;
00506 maptable_to_bin[(uint8_t) 'W']=17;
00507 maptable_to_bin[(uint8_t) 'Y']=18;
00508 maptable_to_bin[(uint8_t) 'V']=19;
00509 maptable_to_bin[(uint8_t) 'B']=20;
00510 maptable_to_bin[(uint8_t) 'Z']=21;
00511 maptable_to_bin[(uint8_t) 'X']=22;
00512
00513 maptable_to_char[0]=(uint8_t) 'A';
00514 maptable_to_char[1]=(uint8_t) 'R';
00515 maptable_to_char[2]=(uint8_t) 'N';
00516 maptable_to_char[3]=(uint8_t) 'D';
00517 maptable_to_char[4]=(uint8_t) 'C';
00518 maptable_to_char[5]=(uint8_t) 'Q';
00519 maptable_to_char[6]=(uint8_t) 'E';
00520 maptable_to_char[7]=(uint8_t) 'G';
00521 maptable_to_char[8]=(uint8_t) 'H';
00522 maptable_to_char[9]=(uint8_t) 'I';
00523 maptable_to_char[10]=(uint8_t) 'L';
00524 maptable_to_char[11]=(uint8_t) 'K';
00525 maptable_to_char[12]=(uint8_t) 'M';
00526 maptable_to_char[13]=(uint8_t) 'F';
00527 maptable_to_char[14]=(uint8_t) 'P';
00528 maptable_to_char[15]=(uint8_t) 'S';
00529 maptable_to_char[16]=(uint8_t) 'T';
00530 maptable_to_char[17]=(uint8_t) 'W';
00531 maptable_to_char[18]=(uint8_t) 'Y';
00532 maptable_to_char[19]=(uint8_t) 'V';
00533 maptable_to_char[20]=(uint8_t) 'B';
00534 maptable_to_char[21]=(uint8_t) 'Z';
00535 maptable_to_char[22]=(uint8_t) 'X';
00536 break;
00537
00538 default:
00539 break;
00540 };
00541 }
00542
00543 void CAlphabet::clear_histogram()
00544 {
00545 memset(histogram, 0, sizeof(histogram));
00546 print_histogram();
00547 }
00548
00549 int32_t CAlphabet::get_max_value_in_histogram()
00550 {
00551 int32_t max_sym=-1;
00552 for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--)
00553 {
00554 if (histogram[i])
00555 {
00556 max_sym=i;
00557 break;
00558 }
00559 }
00560
00561 return max_sym;
00562 }
00563
00564 int32_t CAlphabet::get_num_symbols_in_histogram()
00565 {
00566 int32_t num_sym=0;
00567 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00568 {
00569 if (histogram[i])
00570 num_sym++;
00571 }
00572
00573 return num_sym;
00574 }
00575
00576 int32_t CAlphabet::get_num_bits_in_histogram()
00577 {
00578 int32_t num_sym=get_num_symbols_in_histogram();
00579 if (num_sym>0)
00580 return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2));
00581 else
00582 return 0;
00583 }
00584
00585
00586 void CAlphabet::print_histogram()
00587 {
00588 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00589 {
00590 if (histogram[i])
00591 SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]);
00592 }
00593 }
00594
00595 SGVector<int64_t> CAlphabet::get_histogram()
00596 {
00597 return SGVector<int64_t>(&histogram[0], 1 << (sizeof(uint8_t)*8), false);
00598 }
00599
00600 bool CAlphabet::check_alphabet(bool print_error)
00601 {
00602 bool result = true;
00603
00604 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00605 {
00606 if (histogram[i]>0 && valid_chars[i]==0)
00607 {
00608 result=false;
00609 break;
00610 }
00611 }
00612
00613 if (!result && print_error)
00614 {
00615 print_histogram();
00616 SG_ERROR( "ALPHABET does not contain all symbols in histogram\n");
00617 }
00618
00619 return result;
00620 }
00621
00622 bool CAlphabet::check_alphabet_size(bool print_error)
00623 {
00624 if (get_num_bits_in_histogram() > get_num_bits())
00625 {
00626 if (print_error)
00627 {
00628 print_histogram();
00629 fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ;
00630 SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n");
00631 }
00632 return false;
00633 }
00634 else
00635 return true;
00636
00637 }
00638
00639 void CAlphabet::copy_histogram(CAlphabet* a)
00640 {
00641 SGVector<int64_t> h=a->get_histogram();
00642
00643 if (h.vlen != sizeof(histogram)/sizeof(histogram[0]))
00644 {
00645 SG_ERROR("Histogram has %d elements, but %d elements where expected\n",
00646 h.vlen, sizeof(histogram)/sizeof(histogram[0]));
00647 }
00648
00649 memcpy(histogram, h.vector, sizeof(histogram));
00650 }
00651
00652 const char* CAlphabet::get_alphabet_name(EAlphabet alphabet)
00653 {
00654
00655 int32_t idx;
00656 switch (alphabet)
00657 {
00658 case DNA:
00659 idx=0;
00660 break;
00661 case RAWDNA:
00662 idx=1;
00663 break;
00664 case RNA:
00665 idx=2;
00666 break;
00667 case PROTEIN:
00668 idx=3;
00669 break;
00670 case BINARY:
00671 idx=4;
00672 break;
00673 case ALPHANUM:
00674 idx=5;
00675 break;
00676 case CUBE:
00677 idx=6;
00678 break;
00679 case RAWBYTE:
00680 idx=7;
00681 break;
00682 case IUPAC_NUCLEIC_ACID:
00683 idx=8;
00684 break;
00685 case IUPAC_AMINO_ACID:
00686 idx=9;
00687 break;
00688 case NONE:
00689 idx=10;
00690 break;
00691 case DIGIT:
00692 idx=11;
00693 break;
00694 case DIGIT2:
00695 idx=12;
00696 break;
00697 default:
00698 idx=13;
00699 break;
00700 }
00701 return alphabet_names[idx];
00702 }
00703
00704 void CAlphabet::init()
00705 {
00706 alphabet = NONE;
00707 num_symbols = 0;
00708 num_bits = 0;
00709
00710 memset(valid_chars, 0, sizeof (valid_chars));
00711 memset(maptable_to_bin, 0, sizeof (maptable_to_bin));
00712 memset(maptable_to_char, 0, sizeof (maptable_to_char));
00713 memset(histogram, 0, sizeof (histogram));
00714
00715
00716 m_parameters->add((machine_int_t*) &alphabet, "alphabet",
00717 "Alphabet enum.");
00718 m_parameters->add(&num_symbols, "num_symbols",
00719 "Number of symbols.");
00720 m_parameters->add(&num_bits, "num_bits",
00721 "Number of bits.");
00722
00723
00724
00725
00726
00727
00728
00729
00730 }
00731
00732 void CAlphabet::load_serializable_post() throw (ShogunException)
00733 {
00734 CSGObject::load_serializable_post();
00735
00736 init_map_table();
00737 }
00738
00739
00740 namespace shogun
00741 {
00742 template <class ST>
00743 void CAlphabet::translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00744 {
00745 int32_t i,j;
00746 ST value=0;
00747
00748 for (i=sequence_length-1; i>= p_order-1; i--)
00749 {
00750 value=0;
00751 for (j=i; j>=i-p_order+1; j--)
00752 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00753
00754 obs[i]= (ST) value;
00755 }
00756
00757 for (i=p_order-2;i>=0;i--)
00758 {
00759 if (i>=sequence_length)
00760 continue;
00761
00762 value=0;
00763 for (j=i; j>=i-p_order+1; j--)
00764 {
00765 value= (value >> max_val);
00766 if (j>=0 && j<sequence_length)
00767 value|=obs[j] << (max_val * (p_order-1));
00768 }
00769 obs[i]=value;
00770 }
00771
00772
00773 if (start>0)
00774 {
00775 for (i=start; i<sequence_length; i++)
00776 obs[i-start]=obs[i];
00777 }
00778 }
00779
00780 template <class ST>
00781 void CAlphabet::translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00782 {
00783 int32_t i,j;
00784 ST value=0;
00785
00786 for (i=sequence_length-1; i>= p_order-1; i--)
00787 {
00788 value=0;
00789 for (j=i; j>=i-p_order+1; j--)
00790 value= (value << max_val) | obs[j];
00791
00792 obs[i]= (ST) value;
00793 }
00794
00795 for (i=p_order-2;i>=0;i--)
00796 {
00797 if (i>=sequence_length)
00798 continue;
00799
00800 value=0;
00801 for (j=i; j>=i-p_order+1; j--)
00802 {
00803 value= (value << max_val);
00804 if (j>=0 && j<sequence_length)
00805 value|=obs[j];
00806 }
00807 obs[i]=value;
00808 }
00809
00810
00811 if (start>0)
00812 {
00813 for (i=start; i<sequence_length; i++)
00814 obs[i-start]=obs[i];
00815 }
00816 }
00817
00818 template <class ST>
00819 void CAlphabet::translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00820 {
00821 ASSERT(gap>=0);
00822
00823 const int32_t start_gap=(p_order-gap)/2;
00824 const int32_t end_gap=start_gap+gap;
00825
00826 int32_t i,j;
00827 ST value=0;
00828
00829
00830 for (i=sequence_length-1; i>=p_order-1; i--)
00831 {
00832 value=0;
00833 for (j=i; j>=i-p_order+1; j--)
00834 {
00835 if (i-j<start_gap)
00836 {
00837 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00838 }
00839 else if (i-j>=end_gap)
00840 {
00841 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00842 }
00843 }
00844 obs[i]= (ST) value;
00845 }
00846
00847
00848 for (i=p_order-2;i>=0;i--)
00849 {
00850 if (i>=sequence_length)
00851 continue;
00852
00853 value=0;
00854 for (j=i; j>=i-p_order+1; j--)
00855 {
00856 if (i-j<start_gap)
00857 {
00858 value= (value >> max_val);
00859 if (j>=0 && j<sequence_length)
00860 value|=obs[j] << (max_val * (p_order-1-gap));
00861 }
00862 else if (i-j>=end_gap)
00863 {
00864 value= (value >> max_val);
00865 if (j>=0 && j<sequence_length)
00866 value|=obs[j] << (max_val * (p_order-1-gap));
00867 }
00868 }
00869 obs[i]=value;
00870 }
00871
00872
00873 if (start>0)
00874 {
00875 for (i=start; i<sequence_length; i++)
00876 obs[i-start]=obs[i];
00877 }
00878 }
00879
00880 template <class ST>
00881 void CAlphabet::translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00882 {
00883 ASSERT(gap>=0);
00884
00885 const int32_t start_gap=(p_order-gap)/2;
00886 const int32_t end_gap=start_gap+gap;
00887
00888 int32_t i,j;
00889 ST value=0;
00890
00891
00892 for (i=sequence_length-1; i>=p_order-1; i--)
00893 {
00894 value=0;
00895 for (j=i; j>=i-p_order+1; j--)
00896 {
00897 if (i-j<start_gap)
00898 value= (value << max_val) | obs[j];
00899 else if (i-j>=end_gap)
00900 value= (value << max_val) | obs[j];
00901 }
00902 obs[i]= (ST) value;
00903 }
00904
00905
00906 for (i=p_order-2;i>=0;i--)
00907 {
00908 if (i>=sequence_length)
00909 continue;
00910
00911 value=0;
00912 for (j=i; j>=i-p_order+1; j--)
00913 {
00914 if (i-j<start_gap)
00915 {
00916 value= value << max_val;
00917 if (j>=0 && j<sequence_length)
00918 value|=obs[j];
00919 }
00920 else if (i-j>=end_gap)
00921 {
00922 value= value << max_val;
00923 if (j>=0 && j<sequence_length)
00924 value|=obs[j];
00925 }
00926 }
00927 obs[i]=value;
00928 }
00929
00930
00931 if (start>0)
00932 {
00933 for (i=start; i<sequence_length; i++)
00934 obs[i-start]=obs[i];
00935 }
00936 }
00937
00938 template<> void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00939 {
00940 }
00941
00942 template<> void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00943 {
00944 }
00945
00946 template<> void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00947 {
00948 }
00949
00950 template<> void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00951 {
00952 }
00953
00954 template<> void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00955 {
00956 }
00957
00958 template<> void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00959 {
00960 }
00961
00962 template void CAlphabet::translate_from_single_order<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00963 template void CAlphabet::translate_from_single_order<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00964 template void CAlphabet::translate_from_single_order<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00965 template void CAlphabet::translate_from_single_order<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00966 template void CAlphabet::translate_from_single_order<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00967 template void CAlphabet::translate_from_single_order<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00968 template void CAlphabet::translate_from_single_order<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00969 template void CAlphabet::translate_from_single_order<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00970 template void CAlphabet::translate_from_single_order<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00971 template void CAlphabet::translate_from_single_order<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00972
00973 template void CAlphabet::translate_from_single_order<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00974 template void CAlphabet::translate_from_single_order<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00975 template void CAlphabet::translate_from_single_order<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00976 template void CAlphabet::translate_from_single_order<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00977 template void CAlphabet::translate_from_single_order<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00978 template void CAlphabet::translate_from_single_order<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00979 template void CAlphabet::translate_from_single_order<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00980 template void CAlphabet::translate_from_single_order<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00981 template void CAlphabet::translate_from_single_order<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00982 template void CAlphabet::translate_from_single_order<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00983
00984 template void CAlphabet::translate_from_single_order_reversed<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00985 template void CAlphabet::translate_from_single_order_reversed<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00986 template void CAlphabet::translate_from_single_order_reversed<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00987 template void CAlphabet::translate_from_single_order_reversed<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00988 template void CAlphabet::translate_from_single_order_reversed<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00989 template void CAlphabet::translate_from_single_order_reversed<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00990 template void CAlphabet::translate_from_single_order_reversed<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00991 template void CAlphabet::translate_from_single_order_reversed<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00992 template void CAlphabet::translate_from_single_order_reversed<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00993 template void CAlphabet::translate_from_single_order_reversed<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00994
00995 template void CAlphabet::translate_from_single_order_reversed<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00996 template void CAlphabet::translate_from_single_order_reversed<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00997 template void CAlphabet::translate_from_single_order_reversed<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00998 template void CAlphabet::translate_from_single_order_reversed<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00999 template void CAlphabet::translate_from_single_order_reversed<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01000 template void CAlphabet::translate_from_single_order_reversed<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01001 template void CAlphabet::translate_from_single_order_reversed<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01002 template void CAlphabet::translate_from_single_order_reversed<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01003 template void CAlphabet::translate_from_single_order_reversed<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01004 template void CAlphabet::translate_from_single_order_reversed<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01005 template void CAlphabet::translate_from_single_order_reversed<float32_t>(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01006 template void CAlphabet::translate_from_single_order_reversed<float64_t>(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01007 template void CAlphabet::translate_from_single_order_reversed<floatmax_t>(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
01008 }