SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Alphabet.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2006-2009 Soeren Sonnenburg
8  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
11 #ifndef _CALPHABET__H__
12 #define _CALPHABET__H__
13 
14 #include <shogun/base/SGObject.h>
15 #include <shogun/lib/common.h>
16 
17 namespace shogun
18 {
// Enumerator list for the alphabet type.
// NOTE(review): the line that introduces this enumeration (keyword and type
// name) is not present in this listing; declarations further down refer to the
// type as EAlphabet. The numeric values 3, 5, 7, 8, 9, 13 and 14 are likewise
// absent here, while alphabet_names is declared with 18 entries, so the list
// below is probably incomplete -- confirm against the original header.
21 {
 // presumably the DNA alphabet in letter form -- TODO confirm
23  DNA=0,
24 

 // presumably DNA in an already numerically encoded ("raw") form -- TODO confirm
26  RAWDNA=1,
27 

 // presumably the RNA alphabet in letter form -- TODO confirm
29  RNA=2,
30 
33 
34  // BINARY: just the two symbols 0 and 1
35  BINARY=4,
36 
39 
 // NOTE(review): the symbol set of CUBE is not documented in this listing
41  CUBE=6,
42 
45 
48 
51 
 // presumably marks "no alphabet" -- TODO confirm
53  NONE=10,
54 

 // presumably decimal digits -- TODO confirm the exact symbol range
56  DIGIT=11,
57 

 // NOTE(review): how DIGIT2 differs from DIGIT is not visible here
59  DIGIT2=12,
60 
63 
66 
 // presumably used when the alphabet is not known -- TODO confirm
68  UNKNOWN=15,
69 

 // NOTE(review): symbol set of SNP is not documented in this listing
71  SNP=16,
72 

 // presumably the numerically encoded ("raw") counterpart of SNP -- TODO confirm
74  RAWSNP=17
75 };
76 
77 
/** @brief Alphabet description with symbol lookup tables and a byte histogram.
 *
 * The class stores an alphabet type, a symbol count and a bit count, three
 * 256-entry tables indexed by byte value (validity flag, mapping "to bin",
 * mapping "to char") and a 256-entry histogram of byte occurrences.
 * Most member functions are only declared here; their definitions are not
 * part of this listing, so their exact behaviour must be checked there.
 */
88 class CAlphabet : public CSGObject
89 {
90  public:
91 
 /** default constructor */
95  CAlphabet();
96 
 /** constructor taking a character buffer and a length
  *
  * @param alpha character buffer -- presumably the alphabet's name; TODO confirm
  * @param len presumably the number of characters in alpha; TODO confirm
  */
102  CAlphabet(char* alpha, int32_t len);
103 
 /** constructor taking an alphabet type
  *
  * NOTE(review): single-argument constructor that is not marked explicit,
  * so an EAlphabet value converts implicitly to a CAlphabet.
  *
  * @param alpha alphabet type -- presumably the alphabet to set up; TODO confirm
  */
108  CAlphabet(EAlphabet alpha);
109 
 /** constructor taking a pointer to another alphabet object
  *
  * NOTE(review): single-argument constructor that is not marked explicit.
  *
  * @param alpha other alphabet -- presumably used as the source of a copy;
  *        TODO confirm, including whether the histogram is copied
  */
114  CAlphabet(CAlphabet* alpha);
 /** virtual destructor */
115  virtual ~CAlphabet();
116 
 /** set the alphabet type
  *
  * @param alpha alphabet type to set
  * @return bool -- presumably true on success; TODO confirm in the definition
  */
121  bool set_alphabet(EAlphabet alpha);
122 
 /** get the alphabet type
  *
  * NOTE(review): the declaration of the `alphabet` member read here is not
  * present in this listing.
  *
  * @return current value of the `alphabet` member
  */
127  inline EAlphabet get_alphabet() const
128  {
129  return alphabet;
130  }
131 
 /** get the number of symbols
  *
  * @return current value of num_symbols
  */
136  inline int32_t get_num_symbols() const
137  {
138  return num_symbols;
139  }
140 
 /** get the number of bits
  *
  * @return current value of num_bits -- presumably the bits needed to encode
  *         one symbol; TODO confirm
  */
146  inline int32_t get_num_bits() const
147  {
148  return num_bits;
149  }
150 
 /** look up a byte in maptable_to_bin
  *
  * The table has one entry per possible uint8_t value, so every c is a
  * valid index.
  * NOTE(review): only reads the table but is not declared const.
  *
  * @param c byte value used as table index
  * @return maptable_to_bin[c]
  */
156  inline uint8_t remap_to_bin(uint8_t c)
157  {
158  return maptable_to_bin[c];
159  }
160 
 /** look up a byte in maptable_to_char
  *
  * The table has one entry per possible uint8_t value, so every c is a
  * valid index.
  * NOTE(review): only reads the table but is not declared const.
  *
  * @param c byte value used as table index
  * @return maptable_to_char[c]
  */
166  inline uint8_t remap_to_char(uint8_t c)
167  {
168  return maptable_to_char[c];
169  }
170 
 /** clear the histogram -- presumably resets all counters to zero; TODO confirm */
172  void clear_histogram();
173 
 /** add every element of a buffer to the histogram
  *
  * Each of the first len elements is converted to uint8_t and passed to
  * add_byte_to_histogram(); for element types wider than one byte only the
  * converted value is counted.
  *
  * @param p buffer holding at least len elements
  * @param len number of elements to process; nothing is done for len <= 0
  */
179  template <class T>
180  void add_string_to_histogram(T* p, int64_t len)
181  {
182  for (int64_t i=0; i<len; i++)
183  add_byte_to_histogram((uint8_t) (p[i]));
184  }
185 
 /** increment the histogram counter of one byte value
  *
  * @param p byte value whose counter is incremented
  */
190  inline void add_byte_to_histogram(uint8_t p)
191  {
192  histogram[p]++;
193  }
194 
 /** print the histogram -- output format is defined elsewhere */
196  void print_histogram();
197 
203 
 /** check the alphabet
  *
  * NOTE(review): presumably verifies that the histogram only contains
  * symbols valid for the current alphabet -- confirm in the definition.
  *
  * @param print_error presumably controls whether a failure is reported;
  *        defaults to true
  * @return bool -- presumably true if the check passes; TODO confirm
  */
210  bool check_alphabet(bool print_error=true);
211 
 /** look up a byte in valid_chars
  *
  * NOTE(review): only reads the table but is not declared const.
  *
  * @param c byte value used as table index
  * @return valid_chars[c]
  */
218  inline bool is_valid(uint8_t c)
219  {
220  return valid_chars[c];
221  }
222 
 /** check the alphabet size
  *
  * NOTE(review): presumably compares the bits required by the histogram
  * with num_bits -- confirm in the definition.
  *
  * @param print_error presumably controls whether a failure is reported;
  *        defaults to true
  * @return bool -- presumably true if the check passes; TODO confirm
  */
228  bool check_alphabet_size(bool print_error=true);
229 
235 
 /** get the maximum value in the histogram
  *
  * @return int32_t -- presumably the largest byte value with a non-zero
  *         counter; TODO confirm
  */
240  int32_t get_max_value_in_histogram();
241 
 /** get the number of bits in the histogram
  *
  * @return int32_t -- presumably the bits needed to represent the symbols
  *         seen so far; TODO confirm
  */
248  int32_t get_num_bits_in_histogram();
249 
 /** get the name of an alphabet type
  *
  * @param alphabet alphabet type
  * @return C string -- presumably an entry of alphabet_names; TODO confirm
  */
254  static const char* get_alphabet_name(EAlphabet alphabet);
255 
256 
 /** @return the object name, the string "Alphabet" */
258  virtual const char* get_name() const { return "Alphabet"; }
259 
 /** translate a sequence from single order (declaration only)
  *
  * NOTE(review): the semantics are defined elsewhere; the parameter
  * descriptions below are inferred from the names -- confirm.
  *
  * @param obs observation buffer, presumably modified in place
  * @param sequence_length presumably the number of elements in obs
  * @param start presumably the first index to translate
  * @param p_order presumably the target order
  * @param max_val presumably the number of bits per single-order symbol
  */
268  template <class ST>
269  static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
270 
 /** "reversed" variant of translate_from_single_order (declaration only)
  *
  * NOTE(review): what is reversed is not visible here -- confirm in the
  * definition. Parameters as for translate_from_single_order.
  */
279  template <class ST>
280  static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
281 
 /** overload of translate_from_single_order with an extra gap argument
  * (declaration only)
  *
  * @param gap presumably the number of symbols skipped -- TODO confirm
  */
291  template <class ST>
292  static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
293 
 /** overload of translate_from_single_order_reversed with an extra gap
  * argument (declaration only)
  *
  * @param gap presumably the number of symbols skipped -- TODO confirm
  */
303  template <class ST>
304  static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
305 
306  private:
 /** private initialisation helper -- presumably shared by the constructors;
  * TODO confirm */
309  void init();
310 
311  protected:
 /** initialise the map tables -- presumably fills maptable_to_bin,
  * maptable_to_char and valid_chars for the current alphabet; TODO confirm */
313  void init_map_table();
314 
 /** copy the histogram from another alphabet object
  *
  * @param src alphabet whose histogram is presumably copied into this
  *        object; TODO confirm
  */
319  void copy_histogram(CAlphabet* src);
320 
321  public:
 /** static constant B_A, defined elsewhere -- presumably the binary code of 'A' */
323  static const uint8_t B_A;
 /** static constant B_C, defined elsewhere -- presumably the binary code of 'C' */
325  static const uint8_t B_C;
 /** static constant B_G, defined elsewhere -- presumably the binary code of 'G' */
327  static const uint8_t B_G;
 /** static constant B_T, defined elsewhere -- presumably the binary code of 'T' */
329  static const uint8_t B_T;
 /** static constant B_0, defined elsewhere -- NOTE(review): meaning not visible here */
331  static const uint8_t B_0;
 /** static constant MAPTABLE_UNDEF, defined elsewhere -- presumably marks
  * an undefined map table entry; TODO confirm */
333  static const uint8_t MAPTABLE_UNDEF;
 /** table of 18 alphabet name strings, defined elsewhere -- presumably
  * indexed by alphabet type; TODO confirm */
335  static const char* alphabet_names[18];
336 
337  protected:
 /** hook presumably invoked after the object has been loaded from a
  * serialized form -- TODO confirm against CSGObject
  *
  * NOTE(review): uses a dynamic exception specification, which is
  * deprecated since C++11 and removed in C++17; it must stay in sync with
  * the base class declaration and the out-of-line definition.
  */
346  virtual void load_serializable_post() throw (ShogunException);
347 
348  protected:
 /** number of symbols, returned by get_num_symbols() */
352  int32_t num_symbols;
 /** number of bits, returned by get_num_bits() */
354  int32_t num_bits;
 /** validity flag per byte value (256 entries), read by is_valid() */
356  bool valid_chars[1 << (sizeof(uint8_t)*8)];
 /** mapping table per byte value (256 entries), read by remap_to_bin() */
358  uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
 /** mapping table per byte value (256 entries), read by remap_to_char() */
360  uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
 /** occurrence counter per byte value (256 entries), incremented by
  * add_byte_to_histogram() */
362  int64_t histogram[1 << (sizeof(uint8_t)*8)];
363 };
364 }
365 #endif

SHOGUN Machine Learning Toolbox - Documentation