SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Alphabet.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2006-2009 Soeren Sonnenburg
8  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
11 #ifndef _CALPHABET__H__
12 #define _CALPHABET__H__
13 
14 #include <shogun/lib/config.h>
15 
16 #include <shogun/base/SGObject.h>
17 #include <shogun/lib/common.h>
18 #include <shogun/lib/SGVector.h>
19 
20 namespace shogun
21 {
     /*
      * Enumerator list of the alphabet-type enum.
      *
      * NOTE(review): the line that names this enum is absent from this
      * listing; it is presumably EAlphabet, the type accepted by
      * CAlphabet(EAlphabet) and returned by get_alphabet() -- confirm
      * against the original header.
      *
      * NOTE(review): no enumerator is listed here for the values 3, 5, 7-9,
      * 13 and 14. The listing's own line numbers jump at exactly those
      * spots, which suggests entries were dropped when the page was
      * extracted rather than left unassigned -- confirm before relying on
      * this list being complete.
      */
24 {
26  DNA=0,
27 
29  RAWDNA=1,
30 
32  RNA=2,
33 
36 
37  // BINARY just 0 and 1
38  BINARY=4,
39 
42 
44  CUBE=6,
45 
48 
51 
54 
56  NONE=10,
57 
59  DIGIT=11,
60 
62  DIGIT2=12,
63 
66 
69 
71  UNKNOWN=15,
72 
74  SNP=16,
75 
     // Largest value in this list. CAlphabet::alphabet_names is declared with
     // 18 entries, which would cover the values 0..17 -- presumably that array
     // is indexed by this enum; confirm in the implementation file.
77  RAWSNP=17
78 };
79 
80 
/**
 * Alphabet description class, publicly derived from CSGObject.
 *
 * State visible in this declaration: a symbol count, a bit count, and four
 * tables with one entry per possible uint8_t value -- a validity flag
 * (valid_chars), two uint8_t lookup tables (maptable_to_bin,
 * maptable_to_char) and an int64_t counter (histogram).
 *
 * Only the inline members have bodies here; every other member function is
 * declared only, so notes on those are hedged and must be checked against
 * the implementation file.
 *
 * NOTE(review): get_alphabet() returns a member named `alphabet` whose
 * declaration does not appear in this listing -- confirm it exists in the
 * original header.
 */
91 class CAlphabet : public CSGObject
92 {
93  public:
94 
     /// Default constructor (defined elsewhere).
98  CAlphabet();
99 
     /**
      * Construct from a character buffer and its length (defined elsewhere).
      * NOTE(review): presumably `alpha` holds an alphabet name of `len`
      * characters -- confirm in the implementation.
      */
105  CAlphabet(char* alpha, int32_t len);
106 
     /// Construct from an alphabet enum value (defined elsewhere).
111  CAlphabet(EAlphabet alpha);
112 
     /**
      * Construct from a pointer to another CAlphabet (defined elsewhere).
      * NOTE(review): presumably copies the settings of `alpha`; whether a
      * null pointer is tolerated cannot be told from here.
      */
117  CAlphabet(CAlphabet* alpha);
     /// Virtual destructor (defined elsewhere).
118  virtual ~CAlphabet();
119 
     /**
      * Select the alphabet type (defined elsewhere).
      * @param alpha alphabet enum value
      * @return bool; presumably true on success -- confirm
      */
124  bool set_alphabet(EAlphabet alpha);
125 
     /// @return the value of the `alphabet` member
130  inline EAlphabet get_alphabet() const
131  {
132  return alphabet;
133  }
134 
     /// @return the value of the `num_symbols` member
139  inline int32_t get_num_symbols() const
140  {
141  return num_symbols;
142  }
143 
     /// @return the value of the `num_bits` member
149  inline int32_t get_num_bits() const
150  {
151  return num_bits;
152  }
153 
     /**
      * Look up `c` in maptable_to_bin.
      * The table has 1 << (sizeof(uint8_t)*8) entries, so every uint8_t
      * value is a valid index; no further check is made here.
      * @param c index into the table
      * @return maptable_to_bin[c]
      */
159  inline uint8_t remap_to_bin(uint8_t c)
160  {
161  return maptable_to_bin[c];
162  }
163 
     /**
      * Look up `c` in maptable_to_char.
      * @param c index into the table (any uint8_t value is in range)
      * @return maptable_to_char[c]
      */
169  inline uint8_t remap_to_char(uint8_t c)
170  {
171  return maptable_to_char[c];
172  }
173 
     /// Declared only. Presumably resets the histogram counters -- confirm.
175  void clear_histogram();
176 
     /**
      * Feed `len` consecutive elements starting at `p` into the histogram.
      * Each element is converted to uint8_t with a C-style cast before it
      * is counted, so for integral T wider than 8 bits only the low 8 bits
      * select the counter.
      * @param p   first element; must point to at least `len` readable
      *            elements (not validated here)
      * @param len number of elements; the loop does nothing when len <= 0
      */
182  template <class T>
183  void add_string_to_histogram(T* p, int64_t len)
184  {
185  for (int64_t i=0; i<len; i++)
186  add_byte_to_histogram((uint8_t) (p[i]));
187  }
188 
     /**
      * Increment the histogram counter selected by `p`.
      * @param p index of the counter to increment
      */
193  inline void add_byte_to_histogram(uint8_t p)
194  {
195  histogram[p]++;
196  }
197 
     /// Declared only. Presumably prints the histogram -- confirm.
199  void print_histogram();
200 
206 
     /**
      * Declared only.
      * NOTE(review): presumably checks the histogram against the valid
      * characters of the alphabet and reports problems when `print_error`
      * is true -- confirm in the implementation.
      * @param print_error defaults to true
      */
213  bool check_alphabet(bool print_error=true);
214 
     /**
      * @param c index into valid_chars (any uint8_t value is in range)
      * @return valid_chars[c]
      */
221  inline bool is_valid(uint8_t c)
222  {
223  return valid_chars[c];
224  }
225 
     /**
      * Declared only.
      * NOTE(review): presumably a size-related check of the alphabet
      * against the histogram -- confirm in the implementation.
      * @param print_error defaults to true
      */
231  bool check_alphabet_size(bool print_error=true);
232 
238 
     /// Declared only. Returns an int32_t; exact meaning to be confirmed
     /// in the implementation.
243  int32_t get_max_value_in_histogram();
244 
     /// Declared only. Returns an int32_t; exact meaning to be confirmed
     /// in the implementation.
251  int32_t get_num_bits_in_histogram();
252 
     /**
      * Static, declared only.
      * NOTE(review): presumably maps an alphabet enum value to an entry of
      * alphabet_names -- confirm.
      */
257  static const char* get_alphabet_name(EAlphabet alphabet);
258 
259 
     /// @return the string literal "Alphabet"
261  virtual const char* get_name() const { return "Alphabet"; }
262 
     /*
      * The four static templates below are declared only; their definitions
      * are not in this listing. The two overload pairs differ only in the
      * trailing `gap` parameter.
      * NOTE(review): presumably they rewrite `obs` in place to a
      * higher-order encoding -- confirm in the implementation.
      */
271  template <class ST>
272  static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
273 
282  template <class ST>
283  static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
284 
294  template <class ST>
295  static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
296 
306  template <class ST>
307  static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
308 
309  private:
     /// Private helper, declared only. Presumably shared constructor
     /// set-up -- confirm.
312  void init();
313 
314  protected:
     /// Declared only. Presumably fills maptable_to_bin and
     /// maptable_to_char -- confirm.
316  void init_map_table();
317 
     /// Declared only. Presumably copies the histogram of `src` into this
     /// object -- confirm.
322  void copy_histogram(CAlphabet* src);
323 
324  public:
     /*
      * Static uint8_t constants; their values are defined outside this
      * listing.
      * NOTE(review): the names suggest codes for the symbols A, C, G, T
      * and 0, plus a marker for undefined map-table entries -- confirm.
      */
326  static const uint8_t B_A;
328  static const uint8_t B_C;
330  static const uint8_t B_G;
332  static const uint8_t B_T;
334  static const uint8_t B_0;
336  static const uint8_t MAPTABLE_UNDEF;
     /// Array of 18 C-string pointers, defined outside this listing.
338  static const char* alphabet_names[18];
339 
340  protected:
     /**
      * Virtual hook, declared only. Its dynamic exception specification
      * names ShogunException as the only exception type allowed to leave
      * it.
      * NOTE(review): dynamic exception specifications were deprecated in
      * C++11 and removed in C++17; this declaration does not compile under
      * -std=c++17 or later.
      */
349  virtual void load_serializable_post() throw (ShogunException);
350 
351  protected:
     /// Symbol count returned by get_num_symbols().
355  int32_t num_symbols;
     /// Bit count returned by get_num_bits().
357  int32_t num_bits;
     /// One flag per possible uint8_t value; read by is_valid().
359  bool valid_chars[1 << (sizeof(uint8_t)*8)];
     /// One entry per possible uint8_t value; read by remap_to_bin().
361  uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
     /// One entry per possible uint8_t value; read by remap_to_char().
363  uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
     /// One counter per possible uint8_t value; incremented by
     /// add_byte_to_histogram().
365  int64_t histogram[1 << (sizeof(uint8_t)*8)];
366 };
367 }
368 #endif

SHOGUN Machine Learning Toolbox - Documentation