SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Alphabet.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2006-2009 Soeren Sonnenburg
8  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
11 #include <string.h>
12 #include <math.h>
13 
15 #include <shogun/io/SGIO.h>
16 #include <shogun/base/Parameter.h>
17 
18 using namespace shogun;
19 
20 //define numbers for the bases
21 const uint8_t CAlphabet::B_A=0;
22 const uint8_t CAlphabet::B_C=1;
23 const uint8_t CAlphabet::B_G=2;
24 const uint8_t CAlphabet::B_T=3;
25 const uint8_t CAlphabet::B_0=4;
26 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff;
27 const char* CAlphabet::alphabet_names[18]={
28  "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM",
29  "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID",
30  "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN",
31  "SNP", "RAWSNP"};
32 
33 
35  : CSGObject()
36 {
37  init();
38 }
39 
40 CAlphabet::CAlphabet(char* al, int32_t len)
41 : CSGObject()
42 {
43  init();
44 
45  EAlphabet alpha=NONE;
46 
47  if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA")))
48  alpha = DNA;
49  else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA")))
50  alpha = RAWDNA;
51  else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA")))
52  alpha = RNA;
53  else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN")))
54  alpha = PROTEIN;
55  else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY")))
56  alpha = BINARY;
57  else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM")))
58  alpha = ALPHANUM;
59  else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE")))
60  alpha = CUBE;
61  else if (len>=(int32_t) strlen("DIGIT2") && !strncmp(al, "DIGIT2", strlen("DIGIT2")))
62  alpha = DIGIT2;
63  else if (len>=(int32_t) strlen("DIGIT") && !strncmp(al, "DIGIT", strlen("DIGIT")))
64  alpha = DIGIT;
65  else if (len>=(int32_t) strlen("RAWDIGIT2") && !strncmp(al, "RAWDIGIT2", strlen("RAWDIGIT2")))
66  alpha = RAWDIGIT2;
67  else if (len>=(int32_t) strlen("RAWDIGIT") && !strncmp(al, "RAWDIGIT", strlen("RAWDIGIT")))
68  alpha = RAWDIGIT;
69  else if (len>=(int32_t) strlen("SNP") && !strncmp(al, "SNP", strlen("SNP")))
70  alpha = SNP;
71  else if (len>=(int32_t) strlen("RAWSNP") && !strncmp(al, "RAWSNP", strlen("RAWSNP")))
72  alpha = RAWSNP;
73  else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) ||
74  (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW"))))
75  alpha = RAWBYTE;
76  else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID")))
77  alpha = IUPAC_NUCLEIC_ACID;
78  else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID")))
79  alpha = IUPAC_AMINO_ACID;
80  else {
81  SG_ERROR( "unknown alphabet %s\n", al);
82  }
83 
84  set_alphabet(alpha);
85 }
86 
88 : CSGObject()
89 {
90  init();
91  set_alphabet(alpha);
92 }
93 
95 : CSGObject()
96 {
97  init();
98  ASSERT(a);
100  copy_histogram(a);
101 }
102 
104 {
105 }
106 
108 {
109  bool result=true;
110  alphabet=alpha;
111 
112  switch (alphabet)
113  {
114  case DNA:
115  case RAWDNA:
116  num_symbols = 4;
117  break;
118  case RNA:
119  num_symbols = 4;
120  break;
121  case PROTEIN:
122  num_symbols = 26;
123  break;
124  case BINARY:
125  num_symbols = 2;
126  break;
127  case ALPHANUM:
128  num_symbols = 36;
129  break;
130  case CUBE:
131  num_symbols = 6;
132  break;
133  case RAWBYTE:
134  num_symbols = 256;
135  break;
136  case IUPAC_NUCLEIC_ACID:
137  num_symbols = 16;
138  break;
139  case IUPAC_AMINO_ACID:
140  num_symbols = 23;
141  break;
142  case NONE:
143  num_symbols = 0;
144  break;
145  case DIGIT2:
146  num_symbols = 3;
147  break;
148  case DIGIT:
149  num_symbols = 10;
150  break;
151  case RAWDIGIT2:
152  num_symbols = 3;
153  break;
154  case RAWDIGIT:
155  num_symbols = 10;
156  break;
157  case SNP:
158  num_symbols = 5;
159  break;
160  case RAWSNP:
161  num_symbols = 5;
162  break;
163  default:
164  num_symbols = 0;
165  result=false;
166  break;
167  }
168 
169  num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2));
170  init_map_table();
171  clear_histogram();
172 
173  SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet));
174 
175  return result;
176 }
177 
179 {
180  for (int32_t i=0; i<(1<<(8*sizeof(uint8_t))); i++)
181  {
184  valid_chars[i] = false;
185  }
186 
187  switch (alphabet)
188  {
189  case RAWDIGIT:
190  for (uint8_t i=0; i<=9; i++)
191  {
192  valid_chars[i]=true;
193  maptable_to_bin[i]=i;
194  maptable_to_char[i]=i;
195  }
196  break;
197 
198  case RAWDIGIT2:
199  for (uint8_t i=0; i<=2; i++)
200  {
201  valid_chars[i]=true;
202  maptable_to_bin[i]=i;
203  maptable_to_char[i]=i;
204  }
205  break;
206 
207  case DIGIT:
208  valid_chars[(uint8_t) '0']=true;
209  valid_chars[(uint8_t) '1']=true;
210  valid_chars[(uint8_t) '2']=true;
211  valid_chars[(uint8_t) '3']=true;
212  valid_chars[(uint8_t) '4']=true;
213  valid_chars[(uint8_t) '5']=true;
214  valid_chars[(uint8_t) '6']=true;
215  valid_chars[(uint8_t) '7']=true;
216  valid_chars[(uint8_t) '8']=true;
217  valid_chars[(uint8_t) '9']=true; //Translation '0-9' -> 0-9
218 
219  maptable_to_bin[(uint8_t) '0']=0;
220  maptable_to_bin[(uint8_t) '1']=1;
221  maptable_to_bin[(uint8_t) '2']=2;
222  maptable_to_bin[(uint8_t) '3']=3;
223  maptable_to_bin[(uint8_t) '4']=4;
224  maptable_to_bin[(uint8_t) '5']=5;
225  maptable_to_bin[(uint8_t) '6']=6;
226  maptable_to_bin[(uint8_t) '7']=7;
227  maptable_to_bin[(uint8_t) '8']=8;
228  maptable_to_bin[(uint8_t) '9']=9; //Translation '0-9' -> 0-9
229 
230  maptable_to_char[(uint8_t) 0]='0';
231  maptable_to_char[(uint8_t) 1]='1';
232  maptable_to_char[(uint8_t) 2]='2';
233  maptable_to_char[(uint8_t) 3]='3';
234  maptable_to_char[(uint8_t) 4]='4';
235  maptable_to_char[(uint8_t) 5]='5';
236  maptable_to_char[(uint8_t) 6]='6';
237  maptable_to_char[(uint8_t) 7]='7';
238  maptable_to_char[(uint8_t) 8]='8';
239  maptable_to_char[(uint8_t) 9]='9'; //Translation 0-9 -> '0-9'
240  break;
241 
242  case DIGIT2:
243  valid_chars[(uint8_t) '0']=true;
244  valid_chars[(uint8_t) '1']=true;
245  valid_chars[(uint8_t) '2']=true; //Translation '0-2' -> 0-2
246 
247  maptable_to_bin[(uint8_t) '0']=0;
248  maptable_to_bin[(uint8_t) '1']=1;
249  maptable_to_bin[(uint8_t) '2']=2; //Translation '0-2' -> 0-2
250 
251  maptable_to_char[(uint8_t) 0]='0';
252  maptable_to_char[(uint8_t) 1]='1';
253  maptable_to_char[(uint8_t) 2]='2'; //Translation 0-2 -> '0-2'
254  break;
255 
256  case CUBE:
257  valid_chars[(uint8_t) '1']=true;
258  valid_chars[(uint8_t) '2']=true;
259  valid_chars[(uint8_t) '3']=true;
260  valid_chars[(uint8_t) '4']=true;
261  valid_chars[(uint8_t) '5']=true;
262  valid_chars[(uint8_t) '6']=true; //Translation '123456' -> 012345
263 
264  maptable_to_bin[(uint8_t) '1']=0;
265  maptable_to_bin[(uint8_t) '2']=1;
266  maptable_to_bin[(uint8_t) '3']=2;
267  maptable_to_bin[(uint8_t) '4']=3;
268  maptable_to_bin[(uint8_t) '5']=4;
269  maptable_to_bin[(uint8_t) '6']=5; //Translation '123456' -> 012345
270 
271  maptable_to_char[(uint8_t) 0]='1';
272  maptable_to_char[(uint8_t) 1]='2';
273  maptable_to_char[(uint8_t) 2]='3';
274  maptable_to_char[(uint8_t) 3]='4';
275  maptable_to_char[(uint8_t) 4]='5';
276  maptable_to_char[(uint8_t) 5]='6'; //Translation 012345->'123456'
277  break;
278 
279  case PROTEIN:
280  {
281  int32_t skip=0 ;
282  for (int32_t i=0; i<21; i++)
283  {
284  if (i==1) skip++ ;
285  if (i==8) skip++ ;
286  if (i==12) skip++ ;
287  if (i==17) skip++ ;
288  valid_chars['A'+i+skip]=true;
289  maptable_to_bin['A'+i+skip]=i ;
290  maptable_to_char[i]='A'+i+skip ;
291  } ; //Translation 012345->acde...xy -- the protein code
292  } ;
293  break;
294 
295  case BINARY:
296  valid_chars[(uint8_t) '0']=true;
297  valid_chars[(uint8_t) '1']=true;
298 
299  maptable_to_bin[(uint8_t) '0']=0;
300  maptable_to_bin[(uint8_t) '1']=1;
301 
302  maptable_to_char[0]=(uint8_t) '0';
303  maptable_to_char[1]=(uint8_t) '1';
304  break;
305 
306  case ALPHANUM:
307  {
308  for (int32_t i=0; i<26; i++)
309  {
310  valid_chars[(uint8_t) 'A'+i]=true;
311  maptable_to_bin[(uint8_t) 'A'+i]=i ;
312  maptable_to_char[i]='A'+i ;
313  } ;
314  for (int32_t i=0; i<10; i++)
315  {
316  valid_chars[(uint8_t) '0'+i]=true;
317  maptable_to_bin[(uint8_t) '0'+i]=26+i ;
318  maptable_to_char[26+i]='0'+i ;
319  } ; //Translation 012345->acde...xy0123456789
320  } ;
321  break;
322 
323  case RAWBYTE:
324  {
325  //identity
326  for (int32_t i=0; i<256; i++)
327  {
328  valid_chars[i]=true;
329  maptable_to_bin[i]=i;
330  maptable_to_char[i]=i;
331  }
332  }
333  break;
334 
335  case DNA:
336  valid_chars[(uint8_t) 'A']=true;
337  valid_chars[(uint8_t) 'C']=true;
338  valid_chars[(uint8_t) 'G']=true;
339  valid_chars[(uint8_t) 'T']=true;
340 
341  maptable_to_bin[(uint8_t) 'A']=B_A;
342  maptable_to_bin[(uint8_t) 'C']=B_C;
343  maptable_to_bin[(uint8_t) 'G']=B_G;
344  maptable_to_bin[(uint8_t) 'T']=B_T;
345 
346  maptable_to_char[B_A]='A';
347  maptable_to_char[B_C]='C';
348  maptable_to_char[B_G]='G';
349  maptable_to_char[B_T]='T';
350  break;
351  case RAWDNA:
352  {
353  //identity
354  for (int32_t i=0; i<4; i++)
355  {
356  valid_chars[i]=true;
357  maptable_to_bin[i]=i;
358  maptable_to_char[i]=i;
359  }
360  }
361  break;
362 
363  case SNP:
364  valid_chars[(uint8_t) 'A']=true;
365  valid_chars[(uint8_t) 'C']=true;
366  valid_chars[(uint8_t) 'G']=true;
367  valid_chars[(uint8_t) 'T']=true;
368  valid_chars[(uint8_t) '0']=true;
369 
370  maptable_to_bin[(uint8_t) 'A']=B_A;
371  maptable_to_bin[(uint8_t) 'C']=B_C;
372  maptable_to_bin[(uint8_t) 'G']=B_G;
373  maptable_to_bin[(uint8_t) 'T']=B_T;
374  maptable_to_bin[(uint8_t) '0']=B_0;
375 
376  maptable_to_char[B_A]='A';
377  maptable_to_char[B_C]='C';
378  maptable_to_char[B_G]='G';
379  maptable_to_char[B_T]='T';
380  maptable_to_char[B_0]='0';
381  break;
382  case RAWSNP:
383  {
384  //identity
385  for (int32_t i=0; i<5; i++)
386  {
387  valid_chars[i]=true;
388  maptable_to_bin[i]=i;
389  maptable_to_char[i]=i;
390  }
391  }
392  break;
393 
394  case RNA:
395  valid_chars[(uint8_t) 'A']=true;
396  valid_chars[(uint8_t) 'C']=true;
397  valid_chars[(uint8_t) 'G']=true;
398  valid_chars[(uint8_t) 'U']=true;
399 
400  maptable_to_bin[(uint8_t) 'A']=B_A;
401  maptable_to_bin[(uint8_t) 'C']=B_C;
402  maptable_to_bin[(uint8_t) 'G']=B_G;
403  maptable_to_bin[(uint8_t) 'U']=B_T;
404 
405  maptable_to_char[B_A]='A';
406  maptable_to_char[B_C]='C';
407  maptable_to_char[B_G]='G';
408  maptable_to_char[B_T]='U';
409  break;
410 
411  case IUPAC_NUCLEIC_ACID:
412  valid_chars[(uint8_t) 'A']=true; // A Adenine
413  valid_chars[(uint8_t) 'C']=true; // C Cytosine
414  valid_chars[(uint8_t) 'G']=true; // G Guanine
415  valid_chars[(uint8_t) 'T']=true; // T Thymine
416  valid_chars[(uint8_t) 'U']=true; // U Uracil
417  valid_chars[(uint8_t) 'R']=true; // R Purine (A or G)
418  valid_chars[(uint8_t) 'Y']=true; // Y Pyrimidine (C, T, or U)
419  valid_chars[(uint8_t) 'M']=true; // M C or A
420  valid_chars[(uint8_t) 'K']=true; // K T, U, or G
421  valid_chars[(uint8_t) 'W']=true; // W T, U, or A
422  valid_chars[(uint8_t) 'S']=true; // S C or G
423  valid_chars[(uint8_t) 'B']=true; // B C, T, U, or G (not A)
424  valid_chars[(uint8_t) 'D']=true; // D A, T, U, or G (not C)
425  valid_chars[(uint8_t) 'H']=true; // H A, T, U, or C (not G)
426  valid_chars[(uint8_t) 'V']=true; // V A, C, or G (not T, not U)
427  valid_chars[(uint8_t) 'N']=true; // N Any base (A, C, G, T, or U)
428 
429  maptable_to_bin[(uint8_t) 'A']=0; // A Adenine
430  maptable_to_bin[(uint8_t) 'C']=1; // C Cytosine
431  maptable_to_bin[(uint8_t) 'G']=2; // G Guanine
432  maptable_to_bin[(uint8_t) 'T']=3; // T Thymine
433  maptable_to_bin[(uint8_t) 'U']=4; // U Uracil
434  maptable_to_bin[(uint8_t) 'R']=5; // R Purine (A or G)
435  maptable_to_bin[(uint8_t) 'Y']=6; // Y Pyrimidine (C, T, or U)
436  maptable_to_bin[(uint8_t) 'M']=7; // M C or A
437  maptable_to_bin[(uint8_t) 'K']=8; // K T, U, or G
438  maptable_to_bin[(uint8_t) 'W']=9; // W T, U, or A
439  maptable_to_bin[(uint8_t) 'S']=10; // S C or G
440  maptable_to_bin[(uint8_t) 'B']=11; // B C, T, U, or G (not A)
441  maptable_to_bin[(uint8_t) 'D']=12; // D A, T, U, or G (not C)
442  maptable_to_bin[(uint8_t) 'H']=13; // H A, T, U, or C (not G)
443  maptable_to_bin[(uint8_t) 'V']=14; // V A, C, or G (not T, not U)
444  maptable_to_bin[(uint8_t) 'N']=15; // N Any base (A, C, G, T, or U)
445 
446  maptable_to_char[0]=(uint8_t) 'A'; // A Adenine
447  maptable_to_char[1]=(uint8_t) 'C'; // C Cytosine
448  maptable_to_char[2]=(uint8_t) 'G'; // G Guanine
449  maptable_to_char[3]=(uint8_t) 'T'; // T Thymine
450  maptable_to_char[4]=(uint8_t) 'U'; // U Uracil
451  maptable_to_char[5]=(uint8_t) 'R'; // R Purine (A or G)
452  maptable_to_char[6]=(uint8_t) 'Y'; // Y Pyrimidine (C, T, or U)
453  maptable_to_char[7]=(uint8_t) 'M'; // M C or A
454  maptable_to_char[8]=(uint8_t) 'K'; // K T, U, or G
455  maptable_to_char[9]=(uint8_t) 'W'; // W T, U, or A
456  maptable_to_char[10]=(uint8_t) 'S'; // S C or G
457  maptable_to_char[11]=(uint8_t) 'B'; // B C, T, U, or G (not A)
458  maptable_to_char[12]=(uint8_t) 'D'; // D A, T, U, or G (not C)
459  maptable_to_char[13]=(uint8_t) 'H'; // H A, T, U, or C (not G)
460  maptable_to_char[14]=(uint8_t) 'V'; // V A, C, or G (not T, not U)
461  maptable_to_char[15]=(uint8_t) 'N'; // N Any base (A, C, G, T, or U)
462  break;
463 
464  case IUPAC_AMINO_ACID:
465  valid_chars[(uint8_t) 'A']=true; //A Ala Alanine
466  valid_chars[(uint8_t) 'R']=true; //R Arg Arginine
467  valid_chars[(uint8_t) 'N']=true; //N Asn Asparagine
468  valid_chars[(uint8_t) 'D']=true; //D Asp Aspartic acid
469  valid_chars[(uint8_t) 'C']=true; //C Cys Cysteine
470  valid_chars[(uint8_t) 'Q']=true; //Q Gln Glutamine
471  valid_chars[(uint8_t) 'E']=true; //E Glu Glutamic acid
472  valid_chars[(uint8_t) 'G']=true; //G Gly Glycine
473  valid_chars[(uint8_t) 'H']=true; //H His Histidine
474  valid_chars[(uint8_t) 'I']=true; //I Ile Isoleucine
475  valid_chars[(uint8_t) 'L']=true; //L Leu Leucine
476  valid_chars[(uint8_t) 'K']=true; //K Lys Lysine
477  valid_chars[(uint8_t) 'M']=true; //M Met Methionine
478  valid_chars[(uint8_t) 'F']=true; //F Phe Phenylalanine
479  valid_chars[(uint8_t) 'P']=true; //P Pro Proline
480  valid_chars[(uint8_t) 'S']=true; //S Ser Serine
481  valid_chars[(uint8_t) 'T']=true; //T Thr Threonine
482  valid_chars[(uint8_t) 'W']=true; //W Trp Tryptophan
483  valid_chars[(uint8_t) 'Y']=true; //Y Tyr Tyrosine
484  valid_chars[(uint8_t) 'V']=true; //V Val Valine
485  valid_chars[(uint8_t) 'B']=true; //B Asx Aspartic acid or Asparagine
486  valid_chars[(uint8_t) 'Z']=true; //Z Glx Glutamine or Glutamic acid
487  valid_chars[(uint8_t) 'X']=true; //X Xaa Any amino acid
488 
489  maptable_to_bin[(uint8_t) 'A']=0; //A Ala Alanine
490  maptable_to_bin[(uint8_t) 'R']=1; //R Arg Arginine
491  maptable_to_bin[(uint8_t) 'N']=2; //N Asn Asparagine
492  maptable_to_bin[(uint8_t) 'D']=3; //D Asp Aspartic acid
493  maptable_to_bin[(uint8_t) 'C']=4; //C Cys Cysteine
494  maptable_to_bin[(uint8_t) 'Q']=5; //Q Gln Glutamine
495  maptable_to_bin[(uint8_t) 'E']=6; //E Glu Glutamic acid
496  maptable_to_bin[(uint8_t) 'G']=7; //G Gly Glycine
497  maptable_to_bin[(uint8_t) 'H']=8; //H His Histidine
498  maptable_to_bin[(uint8_t) 'I']=9; //I Ile Isoleucine
499  maptable_to_bin[(uint8_t) 'L']=10; //L Leu Leucine
500  maptable_to_bin[(uint8_t) 'K']=11; //K Lys Lysine
501  maptable_to_bin[(uint8_t) 'M']=12; //M Met Methionine
502  maptable_to_bin[(uint8_t) 'F']=13; //F Phe Phenylalanine
503  maptable_to_bin[(uint8_t) 'P']=14; //P Pro Proline
504  maptable_to_bin[(uint8_t) 'S']=15; //S Ser Serine
505  maptable_to_bin[(uint8_t) 'T']=16; //T Thr Threonine
506  maptable_to_bin[(uint8_t) 'W']=17; //W Trp Tryptophan
507  maptable_to_bin[(uint8_t) 'Y']=18; //Y Tyr Tyrosine
508  maptable_to_bin[(uint8_t) 'V']=19; //V Val Valine
509  maptable_to_bin[(uint8_t) 'B']=20; //B Asx Aspartic acid or Asparagine
510  maptable_to_bin[(uint8_t) 'Z']=21; //Z Glx Glutamine or Glutamic acid
511  maptable_to_bin[(uint8_t) 'X']=22; //X Xaa Any amino acid
512 
513  maptable_to_char[0]=(uint8_t) 'A'; //A Ala Alanine
514  maptable_to_char[1]=(uint8_t) 'R'; //R Arg Arginine
515  maptable_to_char[2]=(uint8_t) 'N'; //N Asn Asparagine
516  maptable_to_char[3]=(uint8_t) 'D'; //D Asp Aspartic acid
517  maptable_to_char[4]=(uint8_t) 'C'; //C Cys Cysteine
518  maptable_to_char[5]=(uint8_t) 'Q'; //Q Gln Glutamine
519  maptable_to_char[6]=(uint8_t) 'E'; //E Glu Glutamic acid
520  maptable_to_char[7]=(uint8_t) 'G'; //G Gly Glycine
521  maptable_to_char[8]=(uint8_t) 'H'; //H His Histidine
522  maptable_to_char[9]=(uint8_t) 'I'; //I Ile Isoleucine
523  maptable_to_char[10]=(uint8_t) 'L'; //L Leu Leucine
524  maptable_to_char[11]=(uint8_t) 'K'; //K Lys Lysine
525  maptable_to_char[12]=(uint8_t) 'M'; //M Met Methionine
526  maptable_to_char[13]=(uint8_t) 'F'; //F Phe Phenylalanine
527  maptable_to_char[14]=(uint8_t) 'P'; //P Pro Proline
528  maptable_to_char[15]=(uint8_t) 'S'; //S Ser Serine
529  maptable_to_char[16]=(uint8_t) 'T'; //T Thr Threonine
530  maptable_to_char[17]=(uint8_t) 'W'; //W Trp Tryptophan
531  maptable_to_char[18]=(uint8_t) 'Y'; //Y Tyr Tyrosine
532  maptable_to_char[19]=(uint8_t) 'V'; //V Val Valine
533  maptable_to_char[20]=(uint8_t) 'B'; //B Asx Aspartic acid or Asparagine
534  maptable_to_char[21]=(uint8_t) 'Z'; //Z Glx Glutamine or Glutamic acid
535  maptable_to_char[22]=(uint8_t) 'X'; //X Xaa Any amino acid
536  break;
537 
538  default:
539  break; //leave uninitialised
540  };
541 }
542 
544 {
545  memset(histogram, 0, sizeof(histogram));
546  print_histogram();
547 }
548 
550 {
551  int32_t max_sym=-1;
552  for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--)
553  {
554  if (histogram[i])
555  {
556  max_sym=i;
557  break;
558  }
559  }
560 
561  return max_sym;
562 }
563 
565 {
566  int32_t num_sym=0;
567  for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
568  {
569  if (histogram[i])
570  num_sym++;
571  }
572 
573  return num_sym;
574 }
575 
577 {
578  int32_t num_sym=get_num_symbols_in_histogram();
579  if (num_sym>0)
580  return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2));
581  else
582  return 0;
583 }
584 
585 
587 {
588  for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
589  {
590  if (histogram[i])
591  SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]);
592  }
593 }
594 
596 {
597  return SGVector<int64_t>(&histogram[0], 1 << (sizeof(uint8_t)*8), false);
598 }
599 
600 bool CAlphabet::check_alphabet(bool print_error)
601 {
602  bool result = true;
603 
604  for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
605  {
606  if (histogram[i]>0 && valid_chars[i]==0)
607  {
608  result=false;
609  break;
610  }
611  }
612 
613  if (!result && print_error)
614  {
615  print_histogram();
616  SG_ERROR( "ALPHABET does not contain all symbols in histogram\n");
617  }
618 
619  return result;
620 }
621 
622 bool CAlphabet::check_alphabet_size(bool print_error)
623 {
625  {
626  if (print_error)
627  {
628  print_histogram();
629  fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ;
630  SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n");
631  }
632  return false;
633  }
634  else
635  return true;
636 
637 }
638 
640 {
642 
643  if (h.vlen != sizeof(histogram)/sizeof(histogram[0]))
644  {
645  SG_ERROR("Histogram has %d elements, but %d elements where expected\n",
646  h.vlen, sizeof(histogram)/sizeof(histogram[0]));
647  }
648 
649  memcpy(histogram, h.vector, sizeof(histogram));
650 }
651 
653 {
654 
655  int32_t idx;
656  switch (alphabet)
657  {
658  case DNA:
659  idx=0;
660  break;
661  case RAWDNA:
662  idx=1;
663  break;
664  case RNA:
665  idx=2;
666  break;
667  case PROTEIN:
668  idx=3;
669  break;
670  case BINARY:
671  idx=4;
672  break;
673  case ALPHANUM:
674  idx=5;
675  break;
676  case CUBE:
677  idx=6;
678  break;
679  case RAWBYTE:
680  idx=7;
681  break;
682  case IUPAC_NUCLEIC_ACID:
683  idx=8;
684  break;
685  case IUPAC_AMINO_ACID:
686  idx=9;
687  break;
688  case NONE:
689  idx=10;
690  break;
691  case DIGIT:
692  idx=11;
693  break;
694  case DIGIT2:
695  idx=12;
696  break;
697  default:
698  idx=13;
699  break;
700  }
701  return alphabet_names[idx];
702 }
703 
704 void CAlphabet::init()
705 {
706  alphabet = NONE;
707  num_symbols = 0;
708  num_bits = 0;
709 
710  memset(valid_chars, 0, sizeof (valid_chars));
711  memset(maptable_to_bin, 0, sizeof (maptable_to_bin));
712  memset(maptable_to_char, 0, sizeof (maptable_to_char));
713  memset(histogram, 0, sizeof (histogram));
714 
715 
716  m_parameters->add((machine_int_t*) &alphabet, "alphabet",
717  "Alphabet enum.");
718  m_parameters->add(&num_symbols, "num_symbols",
719  "Number of symbols.");
720  m_parameters->add(&num_bits, "num_bits",
721  "Number of bits.");
722 
723  /* We don't need to serialize the mapping tables / they can be computed
724  * after de-serializing. Lets not serialize the histogram for now. Doesn't
725  * really make sense. */
726 
727  /* m_parameters->add_histogram(&histogram, sizeof(histogram),
728  "histogram",
729  "Histogram."); */
730 }
731 
733 {
735 
736  init_map_table();
737 }
738 
739 
740 namespace shogun
741 {
742 template <class ST>
743 void CAlphabet::translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
744 {
745  int32_t i,j;
746  ST value=0;
747 
748  for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
749  {
750  value=0;
751  for (j=i; j>=i-p_order+1; j--)
752  value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
753 
754  obs[i]= (ST) value;
755  }
756 
757  for (i=p_order-2;i>=0;i--)
758  {
759  if (i>=sequence_length)
760  continue;
761 
762  value=0;
763  for (j=i; j>=i-p_order+1; j--)
764  {
765  value= (value >> max_val);
766  if (j>=0 && j<sequence_length)
767  value|=obs[j] << (max_val * (p_order-1));
768  }
769  obs[i]=value;
770  }
771 
772  // TODO we should get rid of this loop!
773  if (start>0)
774  {
775  for (i=start; i<sequence_length; i++)
776  obs[i-start]=obs[i];
777  }
778 }
779 
780 template <class ST>
781 void CAlphabet::translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
782 {
783  int32_t i,j;
784  ST value=0;
785 
786  for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
787  {
788  value=0;
789  for (j=i; j>=i-p_order+1; j--)
790  value= (value << max_val) | obs[j];
791 
792  obs[i]= (ST) value;
793  }
794 
795  for (i=p_order-2;i>=0;i--)
796  {
797  if (i>=sequence_length)
798  continue;
799 
800  value=0;
801  for (j=i; j>=i-p_order+1; j--)
802  {
803  value= (value << max_val);
804  if (j>=0 && j<sequence_length)
805  value|=obs[j];
806  }
807  obs[i]=value;
808  }
809 
810  // TODO we should get rid of this loop!
811  if (start>0)
812  {
813  for (i=start; i<sequence_length; i++)
814  obs[i-start]=obs[i];
815  }
816 }
817 
818 template <class ST>
819 void CAlphabet::translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
820 {
821  ASSERT(gap>=0);
822 
823  const int32_t start_gap=(p_order-gap)/2;
824  const int32_t end_gap=start_gap+gap;
825 
826  int32_t i,j;
827  ST value=0;
828 
829  // almost all positions
830  for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
831  {
832  value=0;
833  for (j=i; j>=i-p_order+1; j--)
834  {
835  if (i-j<start_gap)
836  {
837  value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
838  }
839  else if (i-j>=end_gap)
840  {
841  value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
842  }
843  }
844  obs[i]= (ST) value;
845  }
846 
847  // the remaining `order` positions
848  for (i=p_order-2;i>=0;i--)
849  {
850  if (i>=sequence_length)
851  continue;
852 
853  value=0;
854  for (j=i; j>=i-p_order+1; j--)
855  {
856  if (i-j<start_gap)
857  {
858  value= (value >> max_val);
859  if (j>=0 && j<sequence_length)
860  value|=obs[j] << (max_val * (p_order-1-gap));
861  }
862  else if (i-j>=end_gap)
863  {
864  value= (value >> max_val);
865  if (j>=0 && j<sequence_length)
866  value|=obs[j] << (max_val * (p_order-1-gap));
867  }
868  }
869  obs[i]=value;
870  }
871 
872  // TODO we should get rid of this loop!
873  if (start>0)
874  {
875  for (i=start; i<sequence_length; i++)
876  obs[i-start]=obs[i];
877  }
878 }
879 
880 template <class ST>
881 void CAlphabet::translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
882 {
883  ASSERT(gap>=0);
884 
885  const int32_t start_gap=(p_order-gap)/2;
886  const int32_t end_gap=start_gap+gap;
887 
888  int32_t i,j;
889  ST value=0;
890 
891  // almost all positions
892  for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
893  {
894  value=0;
895  for (j=i; j>=i-p_order+1; j--)
896  {
897  if (i-j<start_gap)
898  value= (value << max_val) | obs[j];
899  else if (i-j>=end_gap)
900  value= (value << max_val) | obs[j];
901  }
902  obs[i]= (ST) value;
903  }
904 
905  // the remaining `order` positions
906  for (i=p_order-2;i>=0;i--)
907  {
908  if (i>=sequence_length)
909  continue;
910 
911  value=0;
912  for (j=i; j>=i-p_order+1; j--)
913  {
914  if (i-j<start_gap)
915  {
916  value= value << max_val;
917  if (j>=0 && j<sequence_length)
918  value|=obs[j];
919  }
920  else if (i-j>=end_gap)
921  {
922  value= value << max_val;
923  if (j>=0 && j<sequence_length)
924  value|=obs[j];
925  }
926  }
927  obs[i]=value;
928  }
929 
930  // TODO we should get rid of this loop!
931  if (start>0)
932  {
933  for (i=start; i<sequence_length; i++)
934  obs[i-start]=obs[i];
935  }
936 }
937 
938 template<> void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
939 {
940 }
941 
942 template<> void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
943 {
944 }
945 
946 template<> void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
947 {
948 }
949 
950 template<> void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
951 {
952 }
953 
954 template<> void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
955 {
956 }
957 
958 template<> void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
959 {
960 }
961 
962 template void CAlphabet::translate_from_single_order<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
963 template void CAlphabet::translate_from_single_order<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
964 template void CAlphabet::translate_from_single_order<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
965 template void CAlphabet::translate_from_single_order<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
966 template void CAlphabet::translate_from_single_order<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
967 template void CAlphabet::translate_from_single_order<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
968 template void CAlphabet::translate_from_single_order<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
969 template void CAlphabet::translate_from_single_order<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
970 template void CAlphabet::translate_from_single_order<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
971 template void CAlphabet::translate_from_single_order<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
972 
973 template void CAlphabet::translate_from_single_order<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
974 template void CAlphabet::translate_from_single_order<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
975 template void CAlphabet::translate_from_single_order<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
976 template void CAlphabet::translate_from_single_order<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
977 template void CAlphabet::translate_from_single_order<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
978 template void CAlphabet::translate_from_single_order<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
979 template void CAlphabet::translate_from_single_order<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
980 template void CAlphabet::translate_from_single_order<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
981 template void CAlphabet::translate_from_single_order<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
982 template void CAlphabet::translate_from_single_order<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
983 
984 template void CAlphabet::translate_from_single_order_reversed<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
985 template void CAlphabet::translate_from_single_order_reversed<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
986 template void CAlphabet::translate_from_single_order_reversed<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
987 template void CAlphabet::translate_from_single_order_reversed<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
988 template void CAlphabet::translate_from_single_order_reversed<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
989 template void CAlphabet::translate_from_single_order_reversed<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
990 template void CAlphabet::translate_from_single_order_reversed<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
991 template void CAlphabet::translate_from_single_order_reversed<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
992 template void CAlphabet::translate_from_single_order_reversed<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
993 template void CAlphabet::translate_from_single_order_reversed<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
994 
995 template void CAlphabet::translate_from_single_order_reversed<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
996 template void CAlphabet::translate_from_single_order_reversed<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
997 template void CAlphabet::translate_from_single_order_reversed<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
998 template void CAlphabet::translate_from_single_order_reversed<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
999 template void CAlphabet::translate_from_single_order_reversed<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1000 template void CAlphabet::translate_from_single_order_reversed<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1001 template void CAlphabet::translate_from_single_order_reversed<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1002 template void CAlphabet::translate_from_single_order_reversed<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1003 template void CAlphabet::translate_from_single_order_reversed<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1004 template void CAlphabet::translate_from_single_order_reversed<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1005 template void CAlphabet::translate_from_single_order_reversed<float32_t>(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1006 template void CAlphabet::translate_from_single_order_reversed<float64_t>(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1007 template void CAlphabet::translate_from_single_order_reversed<floatmax_t>(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1008 }

SHOGUN Machine Learning Toolbox - Documentation