SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Alphabet.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2006-2009 Soeren Sonnenburg
8  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
11 #include <string.h>
12 #include <math.h>
13 #include <ctype.h>
14 
16 #include <shogun/io/SGIO.h>
17 #include <shogun/base/Parameter.h>
18 
19 using namespace shogun;
20 
21 //define numbers for the bases
22 const uint8_t CAlphabet::B_A=0;
23 const uint8_t CAlphabet::B_C=1;
24 const uint8_t CAlphabet::B_G=2;
25 const uint8_t CAlphabet::B_T=3;
26 const uint8_t CAlphabet::B_0=4;
27 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff;
28 const char* CAlphabet::alphabet_names[18]={
29  "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM",
30  "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID",
31  "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN",
32  "SNP", "RAWSNP"};
33 
34 
36  : CSGObject()
37 {
38  init();
39 }
40 
41 CAlphabet::CAlphabet(char* al, int32_t len)
42 : CSGObject()
43 {
44  init();
45 
46  EAlphabet alpha=NONE;
47 
48  if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA")))
49  alpha = DNA;
50  else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA")))
51  alpha = RAWDNA;
52  else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA")))
53  alpha = RNA;
54  else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN")))
55  alpha = PROTEIN;
56  else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY")))
57  alpha = BINARY;
58  else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM")))
59  alpha = ALPHANUM;
60  else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE")))
61  alpha = CUBE;
62  else if (len>=(int32_t) strlen("DIGIT2") && !strncmp(al, "DIGIT2", strlen("DIGIT2")))
63  alpha = DIGIT2;
64  else if (len>=(int32_t) strlen("DIGIT") && !strncmp(al, "DIGIT", strlen("DIGIT")))
65  alpha = DIGIT;
66  else if (len>=(int32_t) strlen("RAWDIGIT2") && !strncmp(al, "RAWDIGIT2", strlen("RAWDIGIT2")))
67  alpha = RAWDIGIT2;
68  else if (len>=(int32_t) strlen("RAWDIGIT") && !strncmp(al, "RAWDIGIT", strlen("RAWDIGIT")))
69  alpha = RAWDIGIT;
70  else if (len>=(int32_t) strlen("SNP") && !strncmp(al, "SNP", strlen("SNP")))
71  alpha = SNP;
72  else if (len>=(int32_t) strlen("RAWSNP") && !strncmp(al, "RAWSNP", strlen("RAWSNP")))
73  alpha = RAWSNP;
74  else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) ||
75  (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW"))))
76  alpha = RAWBYTE;
77  else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID")))
78  alpha = IUPAC_NUCLEIC_ACID;
79  else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID")))
80  alpha = IUPAC_AMINO_ACID;
81  else {
82  SG_ERROR("unknown alphabet %s\n", al)
83  }
84 
85  set_alphabet(alpha);
86 }
87 
89 : CSGObject()
90 {
91  init();
92  set_alphabet(alpha);
93 }
94 
96 : CSGObject()
97 {
98  init();
99  ASSERT(a)
101  copy_histogram(a);
102 }
103 
105 {
106 }
107 
109 {
110  bool result=true;
111  alphabet=alpha;
112 
113  switch (alphabet)
114  {
115  case DNA:
116  case RAWDNA:
117  num_symbols = 4;
118  break;
119  case RNA:
120  num_symbols = 4;
121  break;
122  case PROTEIN:
123  num_symbols = 26;
124  break;
125  case BINARY:
126  num_symbols = 2;
127  break;
128  case ALPHANUM:
129  num_symbols = 36;
130  break;
131  case CUBE:
132  num_symbols = 6;
133  break;
134  case RAWBYTE:
135  num_symbols = 256;
136  break;
137  case IUPAC_NUCLEIC_ACID:
138  num_symbols = 16;
139  break;
140  case IUPAC_AMINO_ACID:
141  num_symbols = 23;
142  break;
143  case NONE:
144  num_symbols = 0;
145  break;
146  case DIGIT2:
147  num_symbols = 3;
148  break;
149  case DIGIT:
150  num_symbols = 10;
151  break;
152  case RAWDIGIT2:
153  num_symbols = 3;
154  break;
155  case RAWDIGIT:
156  num_symbols = 10;
157  break;
158  case SNP:
159  num_symbols = 5;
160  break;
161  case RAWSNP:
162  num_symbols = 5;
163  break;
164  default:
165  num_symbols = 0;
166  result=false;
167  break;
168  }
169 
170  num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2));
171  init_map_table();
172  clear_histogram();
173 
174  SG_DEBUG("initialised alphabet %s\n", get_alphabet_name(alphabet))
175 
176  return result;
177 }
178 
180 {
181  for (int32_t i=0; i<(1<<(8*sizeof(uint8_t))); i++)
182  {
185  valid_chars[i] = false;
186  }
187 
188  switch (alphabet)
189  {
190  case RAWDIGIT:
191  for (uint8_t i=0; i<=9; i++)
192  {
193  valid_chars[i]=true;
194  maptable_to_bin[i]=i;
195  maptable_to_char[i]=i;
196  }
197  break;
198 
199  case RAWDIGIT2:
200  for (uint8_t i=0; i<=2; i++)
201  {
202  valid_chars[i]=true;
203  maptable_to_bin[i]=i;
204  maptable_to_char[i]=i;
205  }
206  break;
207 
208  case DIGIT:
209  valid_chars[(uint8_t) '0']=true;
210  valid_chars[(uint8_t) '1']=true;
211  valid_chars[(uint8_t) '2']=true;
212  valid_chars[(uint8_t) '3']=true;
213  valid_chars[(uint8_t) '4']=true;
214  valid_chars[(uint8_t) '5']=true;
215  valid_chars[(uint8_t) '6']=true;
216  valid_chars[(uint8_t) '7']=true;
217  valid_chars[(uint8_t) '8']=true;
218  valid_chars[(uint8_t) '9']=true; //Translation '0-9' -> 0-9
219 
220  maptable_to_bin[(uint8_t) '0']=0;
221  maptable_to_bin[(uint8_t) '1']=1;
222  maptable_to_bin[(uint8_t) '2']=2;
223  maptable_to_bin[(uint8_t) '3']=3;
224  maptable_to_bin[(uint8_t) '4']=4;
225  maptable_to_bin[(uint8_t) '5']=5;
226  maptable_to_bin[(uint8_t) '6']=6;
227  maptable_to_bin[(uint8_t) '7']=7;
228  maptable_to_bin[(uint8_t) '8']=8;
229  maptable_to_bin[(uint8_t) '9']=9; //Translation '0-9' -> 0-9
230 
231  maptable_to_char[(uint8_t) 0]='0';
232  maptable_to_char[(uint8_t) 1]='1';
233  maptable_to_char[(uint8_t) 2]='2';
234  maptable_to_char[(uint8_t) 3]='3';
235  maptable_to_char[(uint8_t) 4]='4';
236  maptable_to_char[(uint8_t) 5]='5';
237  maptable_to_char[(uint8_t) 6]='6';
238  maptable_to_char[(uint8_t) 7]='7';
239  maptable_to_char[(uint8_t) 8]='8';
240  maptable_to_char[(uint8_t) 9]='9'; //Translation 0-9 -> '0-9'
241  break;
242 
243  case DIGIT2:
244  valid_chars[(uint8_t) '0']=true;
245  valid_chars[(uint8_t) '1']=true;
246  valid_chars[(uint8_t) '2']=true; //Translation '0-2' -> 0-2
247 
248  maptable_to_bin[(uint8_t) '0']=0;
249  maptable_to_bin[(uint8_t) '1']=1;
250  maptable_to_bin[(uint8_t) '2']=2; //Translation '0-2' -> 0-2
251 
252  maptable_to_char[(uint8_t) 0]='0';
253  maptable_to_char[(uint8_t) 1]='1';
254  maptable_to_char[(uint8_t) 2]='2'; //Translation 0-2 -> '0-2'
255  break;
256 
257  case CUBE:
258  valid_chars[(uint8_t) '1']=true;
259  valid_chars[(uint8_t) '2']=true;
260  valid_chars[(uint8_t) '3']=true;
261  valid_chars[(uint8_t) '4']=true;
262  valid_chars[(uint8_t) '5']=true;
263  valid_chars[(uint8_t) '6']=true; //Translation '123456' -> 012345
264 
265  maptable_to_bin[(uint8_t) '1']=0;
266  maptable_to_bin[(uint8_t) '2']=1;
267  maptable_to_bin[(uint8_t) '3']=2;
268  maptable_to_bin[(uint8_t) '4']=3;
269  maptable_to_bin[(uint8_t) '5']=4;
270  maptable_to_bin[(uint8_t) '6']=5; //Translation '123456' -> 012345
271 
272  maptable_to_char[(uint8_t) 0]='1';
273  maptable_to_char[(uint8_t) 1]='2';
274  maptable_to_char[(uint8_t) 2]='3';
275  maptable_to_char[(uint8_t) 3]='4';
276  maptable_to_char[(uint8_t) 4]='5';
277  maptable_to_char[(uint8_t) 5]='6'; //Translation 012345->'123456'
278  break;
279 
280  case PROTEIN:
281  {
282  int32_t skip=0 ;
283  for (int32_t i=0; i<21; i++)
284  {
285  if (i==1) skip++ ;
286  if (i==8) skip++ ;
287  if (i==12) skip++ ;
288  if (i==17) skip++ ;
289  valid_chars['A'+i+skip]=true;
290  maptable_to_bin['A'+i+skip]=i ;
291  maptable_to_char[i]='A'+i+skip ;
292  } ; //Translation 012345->acde...xy -- the protein code
293  } ;
294  break;
295 
296  case BINARY:
297  valid_chars[(uint8_t) '0']=true;
298  valid_chars[(uint8_t) '1']=true;
299 
300  maptable_to_bin[(uint8_t) '0']=0;
301  maptable_to_bin[(uint8_t) '1']=1;
302 
303  maptable_to_char[0]=(uint8_t) '0';
304  maptable_to_char[1]=(uint8_t) '1';
305  break;
306 
307  case ALPHANUM:
308  {
309  for (int32_t i=0; i<26; i++)
310  {
311  valid_chars[(uint8_t) 'A'+i]=true;
312  maptable_to_bin[(uint8_t) 'A'+i]=i ;
313  maptable_to_char[i]='A'+i ;
314  } ;
315  for (int32_t i=0; i<10; i++)
316  {
317  valid_chars[(uint8_t) '0'+i]=true;
318  maptable_to_bin[(uint8_t) '0'+i]=26+i ;
319  maptable_to_char[26+i]='0'+i ;
320  } ; //Translation 012345->acde...xy0123456789
321  } ;
322  break;
323 
324  case RAWBYTE:
325  {
326  //identity
327  for (int32_t i=0; i<256; i++)
328  {
329  valid_chars[i]=true;
330  maptable_to_bin[i]=i;
331  maptable_to_char[i]=i;
332  }
333  }
334  break;
335 
336  case DNA:
337  valid_chars[(uint8_t) 'A']=true;
338  valid_chars[(uint8_t) 'C']=true;
339  valid_chars[(uint8_t) 'G']=true;
340  valid_chars[(uint8_t) 'T']=true;
341 
342  maptable_to_bin[(uint8_t) 'A']=B_A;
343  maptable_to_bin[(uint8_t) 'C']=B_C;
344  maptable_to_bin[(uint8_t) 'G']=B_G;
345  maptable_to_bin[(uint8_t) 'T']=B_T;
346 
347  maptable_to_char[B_A]='A';
348  maptable_to_char[B_C]='C';
349  maptable_to_char[B_G]='G';
350  maptable_to_char[B_T]='T';
351  break;
352  case RAWDNA:
353  {
354  //identity
355  for (int32_t i=0; i<4; i++)
356  {
357  valid_chars[i]=true;
358  maptable_to_bin[i]=i;
359  maptable_to_char[i]=i;
360  }
361  }
362  break;
363 
364  case SNP:
365  valid_chars[(uint8_t) 'A']=true;
366  valid_chars[(uint8_t) 'C']=true;
367  valid_chars[(uint8_t) 'G']=true;
368  valid_chars[(uint8_t) 'T']=true;
369  valid_chars[(uint8_t) '0']=true;
370 
371  maptable_to_bin[(uint8_t) 'A']=B_A;
372  maptable_to_bin[(uint8_t) 'C']=B_C;
373  maptable_to_bin[(uint8_t) 'G']=B_G;
374  maptable_to_bin[(uint8_t) 'T']=B_T;
375  maptable_to_bin[(uint8_t) '0']=B_0;
376 
377  maptable_to_char[B_A]='A';
378  maptable_to_char[B_C]='C';
379  maptable_to_char[B_G]='G';
380  maptable_to_char[B_T]='T';
381  maptable_to_char[B_0]='0';
382  break;
383  case RAWSNP:
384  {
385  //identity
386  for (int32_t i=0; i<5; i++)
387  {
388  valid_chars[i]=true;
389  maptable_to_bin[i]=i;
390  maptable_to_char[i]=i;
391  }
392  }
393  break;
394 
395  case RNA:
396  valid_chars[(uint8_t) 'A']=true;
397  valid_chars[(uint8_t) 'C']=true;
398  valid_chars[(uint8_t) 'G']=true;
399  valid_chars[(uint8_t) 'U']=true;
400 
401  maptable_to_bin[(uint8_t) 'A']=B_A;
402  maptable_to_bin[(uint8_t) 'C']=B_C;
403  maptable_to_bin[(uint8_t) 'G']=B_G;
404  maptable_to_bin[(uint8_t) 'U']=B_T;
405 
406  maptable_to_char[B_A]='A';
407  maptable_to_char[B_C]='C';
408  maptable_to_char[B_G]='G';
409  maptable_to_char[B_T]='U';
410  break;
411 
412  case IUPAC_NUCLEIC_ACID:
413  valid_chars[(uint8_t) 'A']=true; // A Adenine
414  valid_chars[(uint8_t) 'C']=true; // C Cytosine
415  valid_chars[(uint8_t) 'G']=true; // G Guanine
416  valid_chars[(uint8_t) 'T']=true; // T Thymine
417  valid_chars[(uint8_t) 'U']=true; // U Uracil
418  valid_chars[(uint8_t) 'R']=true; // R Purine (A or G)
419  valid_chars[(uint8_t) 'Y']=true; // Y Pyrimidine (C, T, or U)
420  valid_chars[(uint8_t) 'M']=true; // M C or A
421  valid_chars[(uint8_t) 'K']=true; // K T, U, or G
422  valid_chars[(uint8_t) 'W']=true; // W T, U, or A
423  valid_chars[(uint8_t) 'S']=true; // S C or G
424  valid_chars[(uint8_t) 'B']=true; // B C, T, U, or G (not A)
425  valid_chars[(uint8_t) 'D']=true; // D A, T, U, or G (not C)
426  valid_chars[(uint8_t) 'H']=true; // H A, T, U, or C (not G)
427  valid_chars[(uint8_t) 'V']=true; // V A, C, or G (not T, not U)
428  valid_chars[(uint8_t) 'N']=true; // N Any base (A, C, G, T, or U)
429 
430  maptable_to_bin[(uint8_t) 'A']=0; // A Adenine
431  maptable_to_bin[(uint8_t) 'C']=1; // C Cytosine
432  maptable_to_bin[(uint8_t) 'G']=2; // G Guanine
433  maptable_to_bin[(uint8_t) 'T']=3; // T Thymine
434  maptable_to_bin[(uint8_t) 'U']=4; // U Uracil
435  maptable_to_bin[(uint8_t) 'R']=5; // R Purine (A or G)
436  maptable_to_bin[(uint8_t) 'Y']=6; // Y Pyrimidine (C, T, or U)
437  maptable_to_bin[(uint8_t) 'M']=7; // M C or A
438  maptable_to_bin[(uint8_t) 'K']=8; // K T, U, or G
439  maptable_to_bin[(uint8_t) 'W']=9; // W T, U, or A
440  maptable_to_bin[(uint8_t) 'S']=10; // S C or G
441  maptable_to_bin[(uint8_t) 'B']=11; // B C, T, U, or G (not A)
442  maptable_to_bin[(uint8_t) 'D']=12; // D A, T, U, or G (not C)
443  maptable_to_bin[(uint8_t) 'H']=13; // H A, T, U, or C (not G)
444  maptable_to_bin[(uint8_t) 'V']=14; // V A, C, or G (not T, not U)
445  maptable_to_bin[(uint8_t) 'N']=15; // N Any base (A, C, G, T, or U)
446 
447  maptable_to_char[0]=(uint8_t) 'A'; // A Adenine
448  maptable_to_char[1]=(uint8_t) 'C'; // C Cytosine
449  maptable_to_char[2]=(uint8_t) 'G'; // G Guanine
450  maptable_to_char[3]=(uint8_t) 'T'; // T Thymine
451  maptable_to_char[4]=(uint8_t) 'U'; // U Uracil
452  maptable_to_char[5]=(uint8_t) 'R'; // R Purine (A or G)
453  maptable_to_char[6]=(uint8_t) 'Y'; // Y Pyrimidine (C, T, or U)
454  maptable_to_char[7]=(uint8_t) 'M'; // M C or A
455  maptable_to_char[8]=(uint8_t) 'K'; // K T, U, or G
456  maptable_to_char[9]=(uint8_t) 'W'; // W T, U, or A
457  maptable_to_char[10]=(uint8_t) 'S'; // S C or G
458  maptable_to_char[11]=(uint8_t) 'B'; // B C, T, U, or G (not A)
459  maptable_to_char[12]=(uint8_t) 'D'; // D A, T, U, or G (not C)
460  maptable_to_char[13]=(uint8_t) 'H'; // H A, T, U, or C (not G)
461  maptable_to_char[14]=(uint8_t) 'V'; // V A, C, or G (not T, not U)
462  maptable_to_char[15]=(uint8_t) 'N'; // N Any base (A, C, G, T, or U)
463  break;
464 
465  case IUPAC_AMINO_ACID:
466  valid_chars[(uint8_t) 'A']=true; //A Ala Alanine
467  valid_chars[(uint8_t) 'R']=true; //R Arg Arginine
468  valid_chars[(uint8_t) 'N']=true; //N Asn Asparagine
469  valid_chars[(uint8_t) 'D']=true; //D Asp Aspartic acid
470  valid_chars[(uint8_t) 'C']=true; //C Cys Cysteine
471  valid_chars[(uint8_t) 'Q']=true; //Q Gln Glutamine
472  valid_chars[(uint8_t) 'E']=true; //E Glu Glutamic acid
473  valid_chars[(uint8_t) 'G']=true; //G Gly Glycine
474  valid_chars[(uint8_t) 'H']=true; //H His Histidine
475  valid_chars[(uint8_t) 'I']=true; //I Ile Isoleucine
476  valid_chars[(uint8_t) 'L']=true; //L Leu Leucine
477  valid_chars[(uint8_t) 'K']=true; //K Lys Lysine
478  valid_chars[(uint8_t) 'M']=true; //M Met Methionine
479  valid_chars[(uint8_t) 'F']=true; //F Phe Phenylalanine
480  valid_chars[(uint8_t) 'P']=true; //P Pro Proline
481  valid_chars[(uint8_t) 'S']=true; //S Ser Serine
482  valid_chars[(uint8_t) 'T']=true; //T Thr Threonine
483  valid_chars[(uint8_t) 'W']=true; //W Trp Tryptophan
484  valid_chars[(uint8_t) 'Y']=true; //Y Tyr Tyrosine
485  valid_chars[(uint8_t) 'V']=true; //V Val Valine
486  valid_chars[(uint8_t) 'B']=true; //B Asx Aspartic acid or Asparagine
487  valid_chars[(uint8_t) 'Z']=true; //Z Glx Glutamine or Glutamic acid
488  valid_chars[(uint8_t) 'X']=true; //X Xaa Any amino acid
489 
490  maptable_to_bin[(uint8_t) 'A']=0; //A Ala Alanine
491  maptable_to_bin[(uint8_t) 'R']=1; //R Arg Arginine
492  maptable_to_bin[(uint8_t) 'N']=2; //N Asn Asparagine
493  maptable_to_bin[(uint8_t) 'D']=3; //D Asp Aspartic acid
494  maptable_to_bin[(uint8_t) 'C']=4; //C Cys Cysteine
495  maptable_to_bin[(uint8_t) 'Q']=5; //Q Gln Glutamine
496  maptable_to_bin[(uint8_t) 'E']=6; //E Glu Glutamic acid
497  maptable_to_bin[(uint8_t) 'G']=7; //G Gly Glycine
498  maptable_to_bin[(uint8_t) 'H']=8; //H His Histidine
499  maptable_to_bin[(uint8_t) 'I']=9; //I Ile Isoleucine
500  maptable_to_bin[(uint8_t) 'L']=10; //L Leu Leucine
501  maptable_to_bin[(uint8_t) 'K']=11; //K Lys Lysine
502  maptable_to_bin[(uint8_t) 'M']=12; //M Met Methionine
503  maptable_to_bin[(uint8_t) 'F']=13; //F Phe Phenylalanine
504  maptable_to_bin[(uint8_t) 'P']=14; //P Pro Proline
505  maptable_to_bin[(uint8_t) 'S']=15; //S Ser Serine
506  maptable_to_bin[(uint8_t) 'T']=16; //T Thr Threonine
507  maptable_to_bin[(uint8_t) 'W']=17; //W Trp Tryptophan
508  maptable_to_bin[(uint8_t) 'Y']=18; //Y Tyr Tyrosine
509  maptable_to_bin[(uint8_t) 'V']=19; //V Val Valine
510  maptable_to_bin[(uint8_t) 'B']=20; //B Asx Aspartic acid or Asparagine
511  maptable_to_bin[(uint8_t) 'Z']=21; //Z Glx Glutamine or Glutamic acid
512  maptable_to_bin[(uint8_t) 'X']=22; //X Xaa Any amino acid
513 
514  maptable_to_char[0]=(uint8_t) 'A'; //A Ala Alanine
515  maptable_to_char[1]=(uint8_t) 'R'; //R Arg Arginine
516  maptable_to_char[2]=(uint8_t) 'N'; //N Asn Asparagine
517  maptable_to_char[3]=(uint8_t) 'D'; //D Asp Aspartic acid
518  maptable_to_char[4]=(uint8_t) 'C'; //C Cys Cysteine
519  maptable_to_char[5]=(uint8_t) 'Q'; //Q Gln Glutamine
520  maptable_to_char[6]=(uint8_t) 'E'; //E Glu Glutamic acid
521  maptable_to_char[7]=(uint8_t) 'G'; //G Gly Glycine
522  maptable_to_char[8]=(uint8_t) 'H'; //H His Histidine
523  maptable_to_char[9]=(uint8_t) 'I'; //I Ile Isoleucine
524  maptable_to_char[10]=(uint8_t) 'L'; //L Leu Leucine
525  maptable_to_char[11]=(uint8_t) 'K'; //K Lys Lysine
526  maptable_to_char[12]=(uint8_t) 'M'; //M Met Methionine
527  maptable_to_char[13]=(uint8_t) 'F'; //F Phe Phenylalanine
528  maptable_to_char[14]=(uint8_t) 'P'; //P Pro Proline
529  maptable_to_char[15]=(uint8_t) 'S'; //S Ser Serine
530  maptable_to_char[16]=(uint8_t) 'T'; //T Thr Threonine
531  maptable_to_char[17]=(uint8_t) 'W'; //W Trp Tryptophan
532  maptable_to_char[18]=(uint8_t) 'Y'; //Y Tyr Tyrosine
533  maptable_to_char[19]=(uint8_t) 'V'; //V Val Valine
534  maptable_to_char[20]=(uint8_t) 'B'; //B Asx Aspartic acid or Asparagine
535  maptable_to_char[21]=(uint8_t) 'Z'; //Z Glx Glutamine or Glutamic acid
536  maptable_to_char[22]=(uint8_t) 'X'; //X Xaa Any amino acid
537  break;
538 
539  default:
540  break; //leave uninitialised
541  };
542 }
543 
545 {
546  memset(histogram, 0, sizeof(histogram));
547  print_histogram();
548 }
549 
551 {
552  int32_t max_sym=-1;
553  for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--)
554  {
555  if (histogram[i])
556  {
557  max_sym=i;
558  break;
559  }
560  }
561 
562  return max_sym;
563 }
564 
566 {
567  int32_t num_sym=0;
568  for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
569  {
570  if (histogram[i])
571  num_sym++;
572  }
573 
574  return num_sym;
575 }
576 
578 {
579  int32_t num_sym=get_num_symbols_in_histogram();
580  if (num_sym>0)
581  return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2));
582  else
583  return 0;
584 }
585 
586 
588 {
589  for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
590  {
591  if (histogram[i])
592  {
593  if (isprint(i))
594  SG_PRINT("hist['%c']=%lld", i, histogram[i])
595  else if (i == '\t')
596  SG_PRINT("hist['\\t']=%lld", histogram[i])
597  else if (i == '\n')
598  SG_PRINT("hist['\\n']=%lld", histogram[i])
599  else if (i == '\r')
600  SG_PRINT("hist['\\r']=%lld", histogram[i])
601  else
602  SG_PRINT("hist[%d]=%lld", i, histogram[i])
603 
604  if (!valid_chars[i])
605  SG_PRINT(" - Character not in Alphabet.\n")
606  else
607  SG_PRINT("\n");
608  }
609  }
610 }
611 
613 {
614  return SGVector<int64_t>(&histogram[0], 1 << (sizeof(uint8_t)*8), false);
615 }
616 
617 bool CAlphabet::check_alphabet(bool print_error)
618 {
619  bool result = true;
620 
621  for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
622  {
623  if (histogram[i]>0 && valid_chars[i]==0)
624  {
625  result=false;
626  break;
627  }
628  }
629 
630  if (!result && print_error)
631  {
632  print_histogram();
633  SG_ERROR("ALPHABET does not contain all symbols in histogram\n")
634  }
635 
636  return result;
637 }
638 
639 bool CAlphabet::check_alphabet_size(bool print_error)
640 {
642  {
643  if (print_error)
644  {
645  print_histogram();
646  fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ;
647  SG_ERROR("ALPHABET too small to contain all symbols in histogram\n")
648  }
649  return false;
650  }
651  else
652  return true;
653 
654 }
655 
657 {
659 
660  if (h.vlen != sizeof(histogram)/sizeof(histogram[0]))
661  {
662  SG_ERROR("Histogram has %d elements, but %d elements where expected\n",
663  h.vlen, sizeof(histogram)/sizeof(histogram[0]));
664  }
665 
666  memcpy(histogram, h.vector, sizeof(histogram));
667 }
668 
670 {
671 
672  int32_t idx;
673  switch (alphabet)
674  {
675  case DNA:
676  idx=0;
677  break;
678  case RAWDNA:
679  idx=1;
680  break;
681  case RNA:
682  idx=2;
683  break;
684  case PROTEIN:
685  idx=3;
686  break;
687  case BINARY:
688  idx=4;
689  break;
690  case ALPHANUM:
691  idx=5;
692  break;
693  case CUBE:
694  idx=6;
695  break;
696  case RAWBYTE:
697  idx=7;
698  break;
699  case IUPAC_NUCLEIC_ACID:
700  idx=8;
701  break;
702  case IUPAC_AMINO_ACID:
703  idx=9;
704  break;
705  case NONE:
706  idx=10;
707  break;
708  case DIGIT:
709  idx=11;
710  break;
711  case DIGIT2:
712  idx=12;
713  break;
714  default:
715  idx=13;
716  break;
717  }
718  return alphabet_names[idx];
719 }
720 
721 void CAlphabet::init()
722 {
723  alphabet = NONE;
724  num_symbols = 0;
725  num_bits = 0;
726 
727  memset(valid_chars, 0, sizeof (valid_chars));
728  memset(maptable_to_bin, 0, sizeof (maptable_to_bin));
729  memset(maptable_to_char, 0, sizeof (maptable_to_char));
730  memset(histogram, 0, sizeof (histogram));
731 
732 
733  m_parameters->add((machine_int_t*) &alphabet, "alphabet",
734  "Alphabet enum.");
735  m_parameters->add(&num_symbols, "num_symbols",
736  "Number of symbols.");
737  m_parameters->add(&num_bits, "num_bits",
738  "Number of bits.");
739 
740  /* We don't need to serialize the mapping tables / they can be computed
741  * after de-serializing. Lets not serialize the histogram for now. Doesn't
742  * really make sense. */
743 
744  /* m_parameters->add_histogram(&histogram, sizeof(histogram),
745  "histogram",
746  "Histogram."); */
747 }
748 
750 {
752 
753  init_map_table();
754 }
755 
756 
757 namespace shogun
758 {
759 template <class ST>
760 void CAlphabet::translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
761 {
762  int32_t i,j;
763  ST value=0;
764 
765  for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
766  {
767  value=0;
768  for (j=i; j>=i-p_order+1; j--)
769  value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
770 
771  obs[i]= (ST) value;
772  }
773 
774  for (i=p_order-2;i>=0;i--)
775  {
776  if (i>=sequence_length)
777  continue;
778 
779  value=0;
780  for (j=i; j>=i-p_order+1; j--)
781  {
782  value= (value >> max_val);
783  if (j>=0 && j<sequence_length)
784  value|=obs[j] << (max_val * (p_order-1));
785  }
786  obs[i]=value;
787  }
788 
789  // TODO we should get rid of this loop!
790  if (start>0)
791  {
792  for (i=start; i<sequence_length; i++)
793  obs[i-start]=obs[i];
794  }
795 }
796 
797 template <class ST>
798 void CAlphabet::translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
799 {
800  int32_t i,j;
801  ST value=0;
802 
803  for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
804  {
805  value=0;
806  for (j=i; j>=i-p_order+1; j--)
807  value= (value << max_val) | obs[j];
808 
809  obs[i]= (ST) value;
810  }
811 
812  for (i=p_order-2;i>=0;i--)
813  {
814  if (i>=sequence_length)
815  continue;
816 
817  value=0;
818  for (j=i; j>=i-p_order+1; j--)
819  {
820  value= (value << max_val);
821  if (j>=0 && j<sequence_length)
822  value|=obs[j];
823  }
824  obs[i]=value;
825  }
826 
827  // TODO we should get rid of this loop!
828  if (start>0)
829  {
830  for (i=start; i<sequence_length; i++)
831  obs[i-start]=obs[i];
832  }
833 }
834 
835 template <class ST>
836 void CAlphabet::translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
837 {
838  ASSERT(gap>=0)
839 
840  const int32_t start_gap=(p_order-gap)/2;
841  const int32_t end_gap=start_gap+gap;
842 
843  int32_t i,j;
844  ST value=0;
845 
846  // almost all positions
847  for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
848  {
849  value=0;
850  for (j=i; j>=i-p_order+1; j--)
851  {
852  if (i-j<start_gap)
853  {
854  value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
855  }
856  else if (i-j>=end_gap)
857  {
858  value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
859  }
860  }
861  obs[i]= (ST) value;
862  }
863 
864  // the remaining `order` positions
865  for (i=p_order-2;i>=0;i--)
866  {
867  if (i>=sequence_length)
868  continue;
869 
870  value=0;
871  for (j=i; j>=i-p_order+1; j--)
872  {
873  if (i-j<start_gap)
874  {
875  value= (value >> max_val);
876  if (j>=0 && j<sequence_length)
877  value|=obs[j] << (max_val * (p_order-1-gap));
878  }
879  else if (i-j>=end_gap)
880  {
881  value= (value >> max_val);
882  if (j>=0 && j<sequence_length)
883  value|=obs[j] << (max_val * (p_order-1-gap));
884  }
885  }
886  obs[i]=value;
887  }
888 
889  // TODO we should get rid of this loop!
890  if (start>0)
891  {
892  for (i=start; i<sequence_length; i++)
893  obs[i-start]=obs[i];
894  }
895 }
896 
897 template <class ST>
898 void CAlphabet::translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
899 {
900  ASSERT(gap>=0)
901 
902  const int32_t start_gap=(p_order-gap)/2;
903  const int32_t end_gap=start_gap+gap;
904 
905  int32_t i,j;
906  ST value=0;
907 
908  // almost all positions
909  for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
910  {
911  value=0;
912  for (j=i; j>=i-p_order+1; j--)
913  {
914  if (i-j<start_gap)
915  value= (value << max_val) | obs[j];
916  else if (i-j>=end_gap)
917  value= (value << max_val) | obs[j];
918  }
919  obs[i]= (ST) value;
920  }
921 
922  // the remaining `order` positions
923  for (i=p_order-2;i>=0;i--)
924  {
925  if (i>=sequence_length)
926  continue;
927 
928  value=0;
929  for (j=i; j>=i-p_order+1; j--)
930  {
931  if (i-j<start_gap)
932  {
933  value= value << max_val;
934  if (j>=0 && j<sequence_length)
935  value|=obs[j];
936  }
937  else if (i-j>=end_gap)
938  {
939  value= value << max_val;
940  if (j>=0 && j<sequence_length)
941  value|=obs[j];
942  }
943  }
944  obs[i]=value;
945  }
946 
947  // TODO we should get rid of this loop!
948  if (start>0)
949  {
950  for (i=start; i<sequence_length; i++)
951  obs[i-start]=obs[i];
952  }
953 }
954 
955 template<> void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
956 {
957 }
958 
959 template<> void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
960 {
961 }
962 
963 template<> void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
964 {
965 }
966 
967 template<> void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
968 {
969 }
970 
971 template<> void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
972 {
973 }
974 
975 template<> void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
976 {
977 }
978 
979 template void CAlphabet::translate_from_single_order<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
980 template void CAlphabet::translate_from_single_order<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
981 template void CAlphabet::translate_from_single_order<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
982 template void CAlphabet::translate_from_single_order<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
983 template void CAlphabet::translate_from_single_order<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
984 template void CAlphabet::translate_from_single_order<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
985 template void CAlphabet::translate_from_single_order<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
986 template void CAlphabet::translate_from_single_order<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
987 template void CAlphabet::translate_from_single_order<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
988 template void CAlphabet::translate_from_single_order<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
989 
990 template void CAlphabet::translate_from_single_order<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
991 template void CAlphabet::translate_from_single_order<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
992 template void CAlphabet::translate_from_single_order<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
993 template void CAlphabet::translate_from_single_order<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
994 template void CAlphabet::translate_from_single_order<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
995 template void CAlphabet::translate_from_single_order<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
996 template void CAlphabet::translate_from_single_order<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
997 template void CAlphabet::translate_from_single_order<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
998 template void CAlphabet::translate_from_single_order<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
999 template void CAlphabet::translate_from_single_order<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1000 
1001 template void CAlphabet::translate_from_single_order_reversed<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
1002 template void CAlphabet::translate_from_single_order_reversed<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
1003 template void CAlphabet::translate_from_single_order_reversed<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
1004 template void CAlphabet::translate_from_single_order_reversed<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
1005 template void CAlphabet::translate_from_single_order_reversed<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
1006 template void CAlphabet::translate_from_single_order_reversed<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
1007 template void CAlphabet::translate_from_single_order_reversed<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
1008 template void CAlphabet::translate_from_single_order_reversed<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
1009 template void CAlphabet::translate_from_single_order_reversed<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
1010 template void CAlphabet::translate_from_single_order_reversed<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
1011 
1012 template void CAlphabet::translate_from_single_order_reversed<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1013 template void CAlphabet::translate_from_single_order_reversed<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1014 template void CAlphabet::translate_from_single_order_reversed<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1015 template void CAlphabet::translate_from_single_order_reversed<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1016 template void CAlphabet::translate_from_single_order_reversed<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1017 template void CAlphabet::translate_from_single_order_reversed<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1018 template void CAlphabet::translate_from_single_order_reversed<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1019 template void CAlphabet::translate_from_single_order_reversed<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1020 template void CAlphabet::translate_from_single_order_reversed<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1021 template void CAlphabet::translate_from_single_order_reversed<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1022 template void CAlphabet::translate_from_single_order_reversed<float32_t>(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1023 template void CAlphabet::translate_from_single_order_reversed<float64_t>(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1024 template void CAlphabet::translate_from_single_order_reversed<floatmax_t>(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
1025 }

SHOGUN Machine Learning Toolbox - Documentation