SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
PositionalPWM.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Soeren Sonnenburg
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
12 #include <shogun/base/Parameter.h>
15 
16 using namespace shogun;
17 
19  m_sigma(0), m_mean(0)
20 {
24 
25  register_params();
26 }
27 
29 {
30 }
31 
33 {
35  return true;
36 }
37 
39 {
40  return m_pwm.num_rows*m_pwm.num_cols+2;
41 }
42 
44 {
45  ASSERT(num_param>0 && num_param<=m_pwm.num_rows*m_pwm.num_cols+2)
46 
47  if (num_param<m_pwm.num_rows*m_pwm.num_cols)
48  {
49  return m_pwm[num_param];
50  }
51  else if (num_param<m_pwm.num_rows*m_pwm.num_cols+1)
52  return CMath::log(m_sigma);
53  else
54  return CMath::log(m_mean);
55 }
56 
57 float64_t CPositionalPWM::get_log_derivative(int32_t num_param, int32_t num_example)
58 {
60  return 0;
61 }
62 
64 {
68 
70 
71  float64_t lik=0;
72  int32_t len=0;
73  bool do_free=false;
74 
75  uint8_t* str = strs->get_feature_vector(num_example, len, do_free);
76 
77  if (!(m_w.num_cols==len))
78  return 0; //TODO
79 
80  for (int32_t i=0; i<len; i++)
81  lik+=m_w[4*i+str[i]];
82 
83  strs->free_feature_vector(str, num_example, do_free);
84  return lik;
85 }
86 
88 {
89  ASSERT(m_pwm.num_cols == len)
90  float64_t score = CMath::log(1/(m_sigma*CMath::sqrt(2*M_PI))) -
92 
93  for (int32_t i=0; i<m_pwm.num_cols; i++)
94  score+=m_pwm[m_pwm.num_rows*i+window[i]];
95 
96  return score;
97 }
98 
99 void CPositionalPWM::compute_w(int32_t num_pos)
100 {
102 
103  int32_t m_w_rows = CMath::pow(m_pwm.num_rows, m_pwm.num_cols);
104  int32_t m_w_cols = num_pos;
105 
106  m_w = SGMatrix<float64_t>(m_w_cols,m_w_rows);
107 
108  uint8_t* window=SG_MALLOC(uint8_t, m_pwm.num_cols);
109  SGVector<uint8_t>::fill_vector(window, m_pwm.num_cols, (uint8_t) 0);
110 
111  const int32_t last_idx=m_pwm.num_cols-1;
112  for (int32_t i=0; i<m_w_rows; i++)
113  {
114  for (int32_t j=0; j<m_w_cols; j++)
115  m_w[j*m_w_rows+i]=get_log_likelihood_window(window, m_pwm.num_cols, j);
116 
117  window[last_idx]++;
118  int32_t window_ptr=last_idx;
119  while (window[window_ptr]==m_pwm.num_rows && window_ptr>0)
120  {
121  window[window_ptr]=0;
122  window_ptr--;
123  window[window_ptr]++;
124  }
125 
126  }
127 }
128 
129 void CPositionalPWM::register_params()
130 {
131  m_parameters->add(&m_poim, "poim", "POIM Scoring Matrix");
132  m_parameters->add(&m_w, "w", "Scoring Matrix");
133  m_parameters->add(&m_pwm, "pwm", "Positional Weight Matrix.");
134  m_parameters->add(&m_sigma, "sigma", "Standard Deviation.");
135  m_parameters->add(&m_mean, "mean", "Mean.");
136 }
137 
138 void CPositionalPWM::compute_scoring(int32_t max_degree)
139 {
140  int32_t num_feat=m_w.num_cols;
141  int32_t num_sym=0;
142  int32_t order=m_pwm.num_rows;
143  int32_t num_words=m_pwm.num_cols;
144 
145  CAlphabet* alpha=new CAlphabet(DNA);
147  int32_t num_bits=alpha->get_num_bits();
148  str->compute_symbol_mask_table(num_bits);
149 
150  for (int32_t i=0; i<order; i++)
151  num_sym+=CMath::pow((int32_t) num_words,i+1);
152 
153  m_poim = SGVector<float64_t>(num_feat*num_sym);
154  memset(m_poim.vector,0, size_t(num_feat)*size_t(num_sym));
155 
156  uint32_t kmer_mask=0;
157  uint32_t words=CMath::pow((int32_t) num_words,(int32_t) order);
158  int32_t offset=0;
159 
160  for (int32_t o=0; o<max_degree; o++)
161  {
162  float64_t* contrib=&m_poim[offset];
163  offset+=CMath::pow((int32_t) num_words,(int32_t) o+1);
164 
165  kmer_mask=(kmer_mask<<(num_bits)) | str->get_masked_symbols(0xffff, 1);
166 
167  for (int32_t p=-o; p<order; p++)
168  {
169  int32_t o_sym=0, m_sym=0, il=0,ir=0, jl=0;
170  uint32_t imer_mask=kmer_mask;
171  uint32_t jmer_mask=kmer_mask;
172 
173  if (p<0)
174  {
175  il=-p;
176  m_sym=order-o-p-1;
177  o_sym=-p;
178  }
179  else if (p<order-o)
180  {
181  ir=p;
182  m_sym=order-o-1;
183  }
184  else
185  {
186  ir=p;
187  m_sym=p;
188  o_sym=p-order+o+1;
189  jl=order-ir;
190  imer_mask=(kmer_mask>>(num_bits*o_sym));
191  jmer_mask=(kmer_mask>>(num_bits*jl));
192  }
193 
194  float64_t marginalizer=
195  1.0/CMath::pow((int32_t) num_words,(int32_t) m_sym);
196 
197  for (uint32_t i=0; i<words; i++)
198  {
199  uint16_t x= ((i << (num_bits*il)) >> (num_bits*ir)) & imer_mask;
200 
201  if (p>=0 && p<order-o)
202  {
203  contrib[x]+=m_w[m_w.num_cols*ir+i]*marginalizer;
204  }
205  else
206  {
207  for (uint32_t j=0; j< (uint32_t) CMath::pow((int32_t) num_words, (int32_t) o_sym); j++)
208  {
209  uint32_t c=x | ((j & jmer_mask) << (num_bits*jl));
210  contrib[c]+=m_w[m_w.num_cols*il+i]*marginalizer;
211  }
212  }
213  }
214  }
215  }
216 }
217 
219 {
220  int32_t offs=0;
221  for (int32_t i=0; i<d-1; i++)
222  offs+=CMath::pow((int32_t) m_w.num_rows,i+1);
223  int32_t rows=CMath::pow((int32_t) m_w.num_rows,d);
224  int32_t cols=m_w.num_cols;
225  float64_t* scoring_matrix = SG_MALLOC(float64_t, rows*cols);
226  memcpy(scoring_matrix,m_poim.vector+offs,rows*cols*sizeof(float64_t));
227  return SGMatrix<float64_t>(scoring_matrix,rows,cols);
228 }
DNA - letters A,C,G,T.
Definition: Alphabet.h:26
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:223
virtual float64_t get_log_derivative(int32_t num_param, int32_t num_example)
SGMatrix< float64_t > m_pwm
static T sq(T x)
Definition: Math.h:450
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:139
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:91
Parameter * m_parameters
Definition: SGObject.h:378
index_t num_cols
Definition: SGMatrix.h:378
Base class Distribution from which all methods implementing a distribution are derived.
Definition: Distribution.h:44
index_t num_rows
Definition: SGMatrix.h:376
virtual float64_t get_log_model_parameter(int32_t num_param)
void add(bool *param, const char *name, const char *description="")
Definition: Parameter.cpp:37
virtual int32_t get_num_model_parameters()
ST get_masked_symbols(ST symbol, uint8_t mask)
#define ASSERT(x)
Definition: SGIO.h:201
double float64_t
Definition: common.h:50
SGVector< float64_t > m_poim
virtual EFeatureClass get_feature_class() const =0
#define M_PI
workaround for log2 being a define on cygwin
Definition: Math.h:59
void compute_symbol_mask_table(int64_t max_val)
int32_t get_num_bits() const
Definition: Alphabet.h:149
virtual float64_t get_log_likelihood_example(int32_t num_example)
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual SGMatrix< float64_t > get_scoring(int32_t d)
The class Features is the base class of all feature objects.
Definition: Features.h:68
void compute_w(int32_t num_pos)
static float64_t log(float64_t v)
Definition: Math.h:922
float64_t get_log_likelihood_window(uint8_t *window, int32_t len, float64_t pos)
static float32_t sqrt(float32_t x)
Definition: Math.h:459
SGMatrix< float64_t > m_w
static int32_t pow(bool x, int32_t n)
Definition: Math.h:535
void compute_scoring(int32_t max_degree)
virtual EFeatureType get_feature_type() const =0
virtual bool train(CFeatures *data=NULL)

SHOGUN Machine Learning Toolbox - Documentation