00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <shogun/distributions/PositionalPWM.h>
00011 #include <shogun/mathematics/Math.h>
00012 #include <shogun/base/Parameter.h>
00013 #include <shogun/features/Alphabet.h>
00014 #include <shogun/features/StringFeatures.h>
00015
00016 using namespace shogun;
00017
00018 CPositionalPWM::CPositionalPWM() : CDistribution(),
00019 m_pwm_rows(0), m_pwm_cols(0), m_pwm(NULL),
00020 m_sigma(0), m_mean(0),
00021 m_w_rows(0), m_w_cols(0), m_w(NULL), m_poim(NULL)
00022
00023 {
00024 register_params();
00025 }
00026
00027 CPositionalPWM::~CPositionalPWM()
00028 {
00029 SG_FREE(m_pwm);
00030 SG_FREE(m_w);
00031 }
00032
00033 bool CPositionalPWM::train(CFeatures* data)
00034 {
00035 SG_NOTIMPLEMENTED;
00036 return true;
00037 }
00038
00039 int32_t CPositionalPWM::get_num_model_parameters()
00040 {
00041 return m_pwm_rows*m_pwm_cols+2;
00042 }
00043
00044 float64_t CPositionalPWM::get_log_model_parameter(int32_t num_param)
00045 {
00046 ASSERT(num_param>0 && num_param<=m_pwm_rows*m_pwm_cols+2);
00047
00048 if (num_param<m_pwm_rows*m_pwm_cols)
00049 {
00050 ASSERT(m_pwm);
00051 return m_pwm[num_param];
00052 }
00053 else if (num_param<m_pwm_rows*m_pwm_cols+1)
00054 return CMath::log(m_sigma);
00055 else
00056 return CMath::log(m_mean);
00057 }
00058
00059 float64_t CPositionalPWM::get_log_derivative(int32_t num_param, int32_t num_example)
00060 {
00061 SG_NOTIMPLEMENTED;
00062 return 0;
00063 }
00064
00065 float64_t CPositionalPWM::get_log_likelihood_example(int32_t num_example)
00066 {
00067 ASSERT(features);
00068 ASSERT(features->get_feature_class() == C_STRING);
00069 ASSERT(features->get_feature_type()==F_BYTE);
00070
00071 CStringFeatures<uint8_t>* strs=(CStringFeatures<uint8_t>*) features;
00072
00073 float64_t lik=0;
00074 int32_t len=0;
00075 bool do_free=false;
00076
00077 uint8_t* str = strs->get_feature_vector(num_example, len, do_free);
00078
00079 if (!(m_w && m_w_cols==len))
00080 return 0;
00081
00082 for (int32_t i=0; i<len; i++)
00083 lik+=m_w[4*i+str[i]];
00084
00085 strs->free_feature_vector(str, num_example, do_free);
00086 return lik;
00087 }
00088
00089 float64_t CPositionalPWM::get_log_likelihood_window(uint8_t* window, int32_t len, float64_t pos)
00090 {
00091 ASSERT(m_pwm_cols == len);
00092 float64_t score = CMath::log(1/(m_sigma*CMath::sqrt(2*M_PI))) -
00093 CMath::sq(pos-m_mean)/(2*CMath::sq(m_sigma));
00094
00095 for (int32_t i=0; i<m_pwm_cols; i++)
00096 score+=m_pwm[m_pwm_rows*i+window[i]];
00097
00098 return score;
00099 }
00100
00101 void CPositionalPWM::compute_w(int32_t num_pos)
00102 {
00103 ASSERT(m_pwm && m_pwm_rows && m_pwm_cols);
00104
00105 m_w_rows=CMath::pow(m_pwm_rows, m_pwm_cols);
00106 m_w_cols=num_pos;
00107
00108 SG_FREE(m_w);
00109 m_w=SG_MALLOC(float64_t, m_w_cols*m_w_rows);
00110
00111 uint8_t* window=SG_MALLOC(uint8_t, m_pwm_cols);
00112 CMath::fill_vector(window, m_pwm_cols, (uint8_t) 0);
00113
00114 const int32_t last_idx=m_pwm_cols-1;
00115 for (int32_t i=0; i<m_w_rows; i++)
00116 {
00117 for (int32_t j=0; j<m_w_cols; j++)
00118 m_w[j*m_w_rows+i]=get_log_likelihood_window(window, m_pwm_cols, j);
00119
00120 window[last_idx]++;
00121 int32_t window_ptr=last_idx;
00122 while (window[window_ptr]==m_pwm_rows && window_ptr>0)
00123 {
00124 window[window_ptr]=0;
00125 window_ptr--;
00126 window[window_ptr]++;
00127 }
00128
00129 }
00130 }
00131
00132 void CPositionalPWM::register_params()
00133 {
00134 m_parameters->add_vector(&m_poim, &m_poim_len, "poim", "POIM Scoring Matrix");
00135 m_parameters->add_matrix(&m_w, &m_w_rows, &m_w_cols, "w", "Scoring Matrix");
00136 m_parameters->add_matrix(&m_pwm, &m_pwm_rows, &m_pwm_cols, "pwm", "Positional Weight Matrix.");
00137 m_parameters->add(&m_sigma, "sigma", "Standard Deviation.");
00138 m_parameters->add(&m_mean, "mean", "Mean.");
00139 }
00140
00141 void CPositionalPWM::compute_scoring(int32_t max_degree)
00142 {
00143 ASSERT(m_w);
00144
00145 int32_t num_feat=m_w_cols;
00146 int32_t num_sym=0;
00147 int32_t order=m_pwm_cols;
00148 int32_t num_words=m_pwm_cols;
00149
00150 CAlphabet* alpha=new CAlphabet(DNA);
00151 CStringFeatures<uint16_t>* str= new CStringFeatures<uint16_t>(alpha);
00152 int32_t num_bits=alpha->get_num_bits();
00153 str->compute_symbol_mask_table(num_bits);
00154
00155 for (int32_t i=0; i<order; i++)
00156 num_sym+=CMath::pow((int32_t) num_words,i+1);
00157
00158 SG_FREE(m_poim);
00159 m_poim_len=num_feat*num_sym;
00160 m_poim=SG_MALLOC(float64_t, num_feat*num_sym);
00161 memset(m_poim,0, size_t(num_feat)*size_t(num_sym));
00162
00163 uint32_t kmer_mask=0;
00164 uint32_t words=CMath::pow((int32_t) num_words,(int32_t) order);
00165 int32_t offset=0;
00166
00167 for (int32_t o=0; o<max_degree; o++)
00168 {
00169 float64_t* contrib=&m_poim[offset];
00170 offset+=CMath::pow((int32_t) num_words,(int32_t) o+1);
00171
00172 kmer_mask=(kmer_mask<<(num_bits)) | str->get_masked_symbols(0xffff, 1);
00173
00174 for (int32_t p=-o; p<order; p++)
00175 {
00176 int32_t o_sym=0, m_sym=0, il=0,ir=0, jl=0;
00177 uint32_t imer_mask=kmer_mask;
00178 uint32_t jmer_mask=kmer_mask;
00179
00180 if (p<0)
00181 {
00182 il=-p;
00183 m_sym=order-o-p-1;
00184 o_sym=-p;
00185 }
00186 else if (p<order-o)
00187 {
00188 ir=p;
00189 m_sym=order-o-1;
00190 }
00191 else
00192 {
00193 ir=p;
00194 m_sym=p;
00195 o_sym=p-order+o+1;
00196 jl=order-ir;
00197 imer_mask=(kmer_mask>>(num_bits*o_sym));
00198 jmer_mask=(kmer_mask>>(num_bits*jl));
00199 }
00200
00201 float64_t marginalizer=
00202 1.0/CMath::pow((int32_t) num_words,(int32_t) m_sym);
00203
00204 for (uint32_t i=0; i<words; i++)
00205 {
00206 uint16_t x= ((i << (num_bits*il)) >> (num_bits*ir)) & imer_mask;
00207
00208 if (p>=0 && p<order-o)
00209 {
00210 contrib[x]+=m_w[m_w_cols*ir+i]*marginalizer;
00211 }
00212 else
00213 {
00214 for (uint32_t j=0; j< (uint32_t) CMath::pow((int32_t) num_words, (int32_t) o_sym); j++)
00215 {
00216 uint32_t c=x | ((j & jmer_mask) << (num_bits*jl));
00217 contrib[c]+=m_w[m_w_cols*il+i]*marginalizer;
00218 }
00219 }
00220 }
00221 }
00222 }
00223 }
00224
00225 SGMatrix<float64_t> CPositionalPWM::get_scoring(int32_t d)
00226 {
00227 int32_t offs=0;
00228 for (int32_t i=0; i<d-1; i++)
00229 offs+=CMath::pow((int32_t) m_w_rows,i+1);
00230 int32_t rows=CMath::pow((int32_t) m_w_rows,d);
00231 int32_t cols=m_w_cols;
00232 return SGMatrix<float64_t>(&m_poim[offs],rows,cols);
00233 }