Alphabet.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Soeren Sonnenburg
00008  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013 
00014 #include <shogun/base/SGObject.h>
00015 #include <shogun/lib/common.h>
00016 
00017 namespace shogun
00018 {
/**
 * Identifies the symbol set a CAlphabet works with.
 *
 * The numeric values are explicit and contiguous (0..17). CAlphabet declares
 * its alphabet_names table with 18 entries, which matches the number of
 * enumerators here (presumably it is indexed by these values), so adding an
 * enumerator requires a matching change there.
 *
 * NOTE(review): the exact characters behind each enumerator are defined by
 * the implementation, which is not part of this header. The per-value notes
 * below are derived from the names and must be confirmed against it.
 */
enum EAlphabet
{
    /// DNA sequences given as characters (presumably A, C, G, T; cf. CAlphabet::B_A..B_T)
    DNA=0,

    /// DNA sequences already in raw (numeric) form - TODO confirm value range
    RAWDNA=1,

    /// RNA sequences given as characters
    RNA=2,

    /// protein (amino acid) sequences
    PROTEIN=3,

    // BINARY just 0 and 1
    BINARY=4,

    /// alphanumeric symbols
    ALPHANUM=5,

    /// "cube" (die) symbols - TODO confirm value range
    CUBE=6,

    /// arbitrary raw bytes
    RAWBYTE=7,

    /// IUPAC nucleic acid codes
    IUPAC_NUCLEIC_ACID=8,

    /// IUPAC amino acid codes
    IUPAC_AMINO_ACID=9,

    /// no alphabet
    NONE=10,

    /// digits given as characters - TODO confirm value range
    DIGIT=11,

    /// second digit alphabet; its range is not visible from this header
    DIGIT2=12,

    /// raw (numeric) counterpart of DIGIT - TODO confirm
    RAWDIGIT=13,

    /// raw (numeric) counterpart of DIGIT2 - TODO confirm
    RAWDIGIT2=14,

    /// alphabet is unknown / unspecified
    UNKNOWN=15,

    /// SNP data given as characters - TODO confirm symbol set
    SNP=16,

    /// raw (numeric) counterpart of SNP - TODO confirm
    RAWSNP=17
};
00076 
00077 
00088 class CAlphabet : public CSGObject
00089 {
00090     public:
00091 
00095         CAlphabet();
00096 
00102         CAlphabet(char* alpha, int32_t len);
00103 
00108         CAlphabet(EAlphabet alpha);
00109 
00114         CAlphabet(CAlphabet* alpha);
00115         virtual ~CAlphabet();
00116 
00121         bool set_alphabet(EAlphabet alpha);
00122 
00127         inline EAlphabet get_alphabet() const
00128         {
00129             return alphabet;
00130         }
00131 
00136         inline int32_t get_num_symbols() const
00137         {
00138             return num_symbols;
00139         }
00140 
00146         inline int32_t get_num_bits() const
00147         {
00148             return num_bits;
00149         }
00150 
00156         inline uint8_t remap_to_bin(uint8_t c)
00157         {
00158             return maptable_to_bin[c];
00159         }
00160 
00166         inline uint8_t remap_to_char(uint8_t c)
00167         {
00168             return maptable_to_char[c];
00169         }
00170 
00172         void clear_histogram();
00173 
00179         template <class T>
00180         void add_string_to_histogram(T* p, int64_t len)
00181         {
00182             for (int64_t i=0; i<len; i++)
00183                 add_byte_to_histogram((uint8_t) (p[i]));
00184         }
00185 
00190         inline void add_byte_to_histogram(uint8_t p)
00191         {
00192             histogram[p]++;
00193         }
00194 
00196         void print_histogram();
00197 
00202         SGVector<int64_t> get_histogram();
00203 
00210         bool check_alphabet(bool print_error=true);
00211 
00218         inline bool is_valid(uint8_t c)
00219         {
00220             return valid_chars[c];
00221         }
00222 
00228         bool check_alphabet_size(bool print_error=true);
00229 
00234         int32_t get_num_symbols_in_histogram();
00235 
00240         int32_t get_max_value_in_histogram();
00241 
00248         int32_t get_num_bits_in_histogram();
00249 
00254         static const char* get_alphabet_name(EAlphabet alphabet);
00255 
00256 
00258         inline virtual const char* get_name() const { return "Alphabet"; }
00259 
00268         template <class ST>
00269         static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00270 
00279         template <class ST>
00280         static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
00281 
00291         template <class ST>
00292         static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00293 
00303         template <class ST>
00304         static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
00305 
00306     private:
00309         void init();
00310 
00311     protected:
00313         void init_map_table();
00314 
00319         void copy_histogram(CAlphabet* src);
00320 
00321     public:
00323         static const uint8_t B_A;
00325         static const uint8_t B_C;
00327         static const uint8_t B_G;
00329         static const uint8_t B_T;
00331         static const uint8_t B_0;
00333         static const uint8_t MAPTABLE_UNDEF;
00335         static const char* alphabet_names[18];
00336 
00337     protected:
00346         virtual void load_serializable_post() throw (ShogunException);
00347 
00348     protected:
00350         EAlphabet alphabet;
00352         int32_t num_symbols;
00354         int32_t num_bits;
00356         bool valid_chars[1 << (sizeof(uint8_t)*8)];
00358         uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00360         uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00362         int64_t histogram[1 << (sizeof(uint8_t)*8)];
00363 };
00364 }
00365 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation