SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
Alphabet.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2006-2009 Soeren Sonnenburg
8  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
11 #ifndef _CALPHABET__H__
12 #define _CALPHABET__H__
13 
14 #include <shogun/lib/config.h>
15 
16 #include <shogun/base/SGObject.h>
17 #include <shogun/lib/common.h>
18 #include <shogun/lib/SGVector.h>
19 
20 namespace shogun
21 {
/*
 * Enumerator list of the enumeration that the cross-reference index calls
 * EAlphabet ("Alphabet of charfeatures/observations", Alphabet.h:23).
 *
 * NOTE(review): this listing elides some header lines. The values 3, 5, 7-9,
 * 13 and 14 have no visible enumerator here; the index places PROTEIN
 * (line 35), ALPHANUM (41), RAW BYTE (47), IUPAC_NUCLEIC_ACID (50),
 * IUPAC_AMINO_ACID (53), RAWDIGIT (65) and RAWDIGIT2 (68) on the missing
 * lines -- presumably those are the missing enumerators; confirm against the
 * real header before relying on their numeric values.
 */
24 {
 // DNA - letters A,C,G,T.
26  DNA=0,
27 
 // RAWDNA - letters 0,1,2,3.
29  RAWDNA=1,
30 
 // RNA - letters A,C,G,U.
32  RNA=2,
33 
36 
37  // BINARY just 0 and 1
38  BINARY=4,
39 
42 
 // CUBE - [1-6].
44  CUBE=6,
45 
48 
51 
54 
 // NONE - type has no alphabet.
56  NONE=10,
57 
 // DIGIT - letters 0-9.
59  DIGIT=11,
60 
 // DIGIT2 - letters 0-2.
62  DIGIT2=12,
63 
66 
69 
 // UNKNOWN - unknown alphabet.
71  UNKNOWN=15,
72 
 // SNP - letters A,C,G,T,0.
74  SNP=16,
75 
 // RAWSNP - letters 0,1,2,3,4. Highest visible value (17).
77  RAWSNP=17
78 };
79 
80 
/*
 * The class Alphabet implements an alphabet and alphabet utility functions.
 *
 * It holds an alphabet type, per-byte lookup tables (validity, char -> bin,
 * bin -> char) and a per-byte occurrence histogram.
 *
 * NOTE(review): this listing elides some lines of the real header. The
 * cross-reference index mentions `EAlphabet alphabet` (Alphabet.h:353),
 * `SGVector<int64_t> get_histogram()` and `int32_t
 * get_num_symbols_in_histogram()`, none of which are visible below --
 * presumably they are declared on the missing lines; confirm against the
 * real header.
 */
91 class CAlphabet : public CSGObject
92 {
93  public:
94 
 // Default constructor.
98  CAlphabet();
99 
 // Construct from a character buffer `alpha` of `len` characters.
 // NOTE(review): presumably `alpha` is an alphabet name -- confirm in Alphabet.cpp.
105  CAlphabet(char* alpha, int32_t len);
106 
 // Construct from an alphabet type.
111  CAlphabet(EAlphabet alpha);
112 
 // Construct from another alphabet object. Takes a pointer, so this is not
 // a C++ copy constructor.
117  CAlphabet(CAlphabet* alpha);
118  virtual ~CAlphabet();
119 
 // Set the alphabet type. Returns a bool -- presumably success; confirm in
 // Alphabet.cpp.
124  bool set_alphabet(EAlphabet alpha);
125 
 // Return the stored alphabet type (member `alphabet`).
130  inline EAlphabet get_alphabet() const
131  {
132  return alphabet;
133  }
134 
 // Return the stored symbol count (member `num_symbols`).
139  inline int32_t get_num_symbols() const
140  {
141  return num_symbols;
142  }
143 
 // Return the stored bit count (member `num_bits`).
149  inline int32_t get_num_bits() const
150  {
151  return num_bits;
152  }
153 
 // Look up `c` in maptable_to_bin. Every uint8_t value is a valid index
 // because the table is sized 1 << (sizeof(uint8_t)*8).
159  inline uint8_t remap_to_bin(uint8_t c)
160  {
161  return maptable_to_bin[c];
162  }
163 
 // Look up `c` in maptable_to_char (same sizing as maptable_to_bin).
169  inline uint8_t remap_to_char(uint8_t c)
170  {
171  return maptable_to_char[c];
172  }
173 
 // clear histogram
175  void clear_histogram();
176 
 // Add the `len` elements of `p` to the histogram, one element at a time.
 // Each element is cast to uint8_t first, so for an integer T wider than
 // one byte only the low 8 bits select the histogram slot.
182  template <class T>
183  void add_string_to_histogram(T* p, int64_t len)
184  {
185  for (int64_t i=0; i<len; i++)
186  add_byte_to_histogram((uint8_t) (p[i]));
187  }
188 
 // Increment the histogram counter for byte value `p`.
193  inline void add_byte_to_histogram(uint8_t p)
194  {
195  histogram[p]++;
196  }
197 
 // print histogram
199  void print_histogram();
200 
206 
 // Check the alphabet; `print_error` defaults to true.
 // NOTE(review): what exactly is checked (presumably the histogram contents
 // against valid_chars) is not visible here -- confirm in Alphabet.cpp.
213  bool check_alphabet(bool print_error=true);
214 
 // Return the valid_chars entry for byte value `c`.
221  inline bool is_valid(uint8_t c)
222  {
223  return valid_chars[c];
224  }
225 
 // Check the alphabet size; `print_error` defaults to true.
 // NOTE(review): the criterion is not visible here -- confirm in Alphabet.cpp.
231  bool check_alphabet_size(bool print_error=true);
232 
238 
 // Histogram queries; both are defined in Alphabet.cpp.
243  int32_t get_max_value_in_histogram();
244 
251  int32_t get_num_bits_in_histogram();
252 
 // Return a C string for the given alphabet type (static; no instance needed).
257  static const char* get_alphabet_name(EAlphabet alphabet);
258 
259 
 // Object name; always the literal "Alphabet".
261  virtual const char* get_name() const { return "Alphabet"; }
262 
 // Static in-place translation helpers operating on `obs`. Each comes in
 // two overloads: without and with a trailing `gap` parameter.
 // NOTE(review): the transformation itself is defined in Alphabet.cpp and
 // is not visible here; do not infer semantics from the names alone.
271  template <class ST>
272  static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
273 
282  template <class ST>
283  static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
284 
294  template <class ST>
295  static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
296 
306  template <class ST>
307  static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
308 
309  private:
 // Private initialisation helper (defined in Alphabet.cpp).
312  void init();
313 
314  protected:
 // Initialise the mapping tables (defined in Alphabet.cpp).
316  void init_map_table();
317 
 // Copy the histogram from `src` (defined in Alphabet.cpp).
322  void copy_histogram(CAlphabet* src);
323 
324  public:
 // Public byte constants; only declared here, so their values live in
 // another translation unit.
326  static const uint8_t B_A;
328  static const uint8_t B_C;
330  static const uint8_t B_G;
332  static const uint8_t B_T;
334  static const uint8_t B_0;
336  static const uint8_t MAPTABLE_UNDEF;
 // 18 entries, matching the visible EAlphabet value range 0..17.
 // NOTE(review): presumably indexed by EAlphabet value -- confirm.
338  static const char* alphabet_names[18];
339 
340  protected:
 // Post-load hook for serialization; declared as possibly throwing
 // ShogunException.
349  virtual void load_serializable_post() throw (ShogunException);
350 
351  protected:
 // Number of symbols, as returned by get_num_symbols().
355  int32_t num_symbols;
 // Number of bits, as returned by get_num_bits().
357  int32_t num_bits;
 // The four tables below have one slot per possible uint8_t value.
 // Validity flag per byte value, read by is_valid().
359  bool valid_chars[1 << (sizeof(uint8_t)*8)];
 // Lookup table read by remap_to_bin().
361  uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
 // Lookup table read by remap_to_char().
363  uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
 // Occurrence counter per byte value, incremented by add_byte_to_histogram().
365  int64_t histogram[1 << (sizeof(uint8_t)*8)];
366 };
367 }
368 #endif
RNA - letters A,C,G,U.
Definition: Alphabet.h:32
bool valid_chars[1<< (sizeof(uint8_t)*8)]
Definition: Alphabet.h:359
RAWDIGIT - 0-9.
Definition: Alphabet.h:65
PROTEIN - letters A-Z.
Definition: Alphabet.h:35
static const uint8_t B_T
Definition: Alphabet.h:332
int32_t get_num_symbols_in_histogram()
Definition: Alphabet.cpp:565
DNA - letters A,C,G,T.
Definition: Alphabet.h:26
ALPHANUM - [0-9A-Z].
Definition: Alphabet.h:41
int32_t get_num_bits_in_histogram()
Definition: Alphabet.cpp:577
static const char * get_alphabet_name(EAlphabet alphabet)
Definition: Alphabet.cpp:669
SNP - letters A,C,G,T,0.
Definition: Alphabet.h:74
static const uint8_t B_G
Definition: Alphabet.h:330
RAWDNA - letters 0,1,2,3.
Definition: Alphabet.h:29
void copy_histogram(CAlphabet *src)
Definition: Alphabet.cpp:656
uint8_t maptable_to_bin[1<< (sizeof(uint8_t)*8)]
Definition: Alphabet.h:361
EAlphabet
Alphabet of charfeatures/observations.
Definition: Alphabet.h:23
int32_t get_max_value_in_histogram()
Definition: Alphabet.cpp:550
bool check_alphabet_size(bool print_error=true)
Definition: Alphabet.cpp:639
static const uint8_t B_0
Definition: Alphabet.h:334
Class ShogunException defines an exception which is thrown whenever an error inside of shogun occurs...
void print_histogram()
print histogram
Definition: Alphabet.cpp:587
EAlphabet get_alphabet() const
Definition: Alphabet.h:130
static const uint8_t B_C
Definition: Alphabet.h:328
static void translate_from_single_order(ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
Definition: Alphabet.cpp:760
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:91
virtual const char * get_name() const
Definition: Alphabet.h:261
void add_byte_to_histogram(uint8_t p)
Definition: Alphabet.h:193
IUPAC_AMINO_ACID.
Definition: Alphabet.h:53
static const uint8_t MAPTABLE_UNDEF
Definition: Alphabet.h:336
int32_t num_symbols
Definition: Alphabet.h:355
static const char * alphabet_names[18]
Definition: Alphabet.h:338
virtual void load_serializable_post()
Definition: Alphabet.cpp:749
uint8_t remap_to_bin(uint8_t c)
Definition: Alphabet.h:159
bool is_valid(uint8_t c)
Definition: Alphabet.h:221
void init_map_table()
Definition: Alphabet.cpp:179
virtual ~CAlphabet()
Definition: Alphabet.cpp:104
void add_string_to_histogram(T *p, int64_t len)
Definition: Alphabet.h:183
static void translate_from_single_order_reversed(ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
Definition: Alphabet.cpp:798
DIGIT2 - letters 0-2.
Definition: Alphabet.h:62
EAlphabet alphabet
Definition: Alphabet.h:353
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:115
shogun vector
RAWSNP - letters 0,1,2,3,4.
Definition: Alphabet.h:77
int32_t num_bits
Definition: Alphabet.h:357
int32_t get_num_symbols() const
Definition: Alphabet.h:139
uint8_t maptable_to_char[1<< (sizeof(uint8_t)*8)]
Definition: Alphabet.h:363
bool set_alphabet(EAlphabet alpha)
Definition: Alphabet.cpp:108
NONE - type has no alphabet.
Definition: Alphabet.h:56
IUPAC_NUCLEIC_ACID.
Definition: Alphabet.h:50
void clear_histogram()
clear histogram
Definition: Alphabet.cpp:544
bool check_alphabet(bool print_error=true)
Definition: Alphabet.cpp:617
SGVector< int64_t > get_histogram()
Definition: Alphabet.cpp:612
int32_t get_num_bits() const
Definition: Alphabet.h:149
UNKNOWN - unknown alphabet.
Definition: Alphabet.h:71
RAWDIGIT2 - 0-2.
Definition: Alphabet.h:68
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
DIGIT - letters 0-9.
Definition: Alphabet.h:59
uint8_t remap_to_char(uint8_t c)
Definition: Alphabet.h:169
CUBE - [1-6].
Definition: Alphabet.h:44
int64_t histogram[1<< (sizeof(uint8_t)*8)]
Definition: Alphabet.h:365
RAW BYTE - [0-255].
Definition: Alphabet.h:47
static const uint8_t B_A
Definition: Alphabet.h:326

SHOGUN Machine Learning Toolbox - Documentation