PositionalPWM.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2011 Soeren Sonnenburg
00008  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
00009  */
00010 #include <shogun/distributions/PositionalPWM.h>
00011 #include <shogun/mathematics/Math.h>
00012 #include <shogun/base/Parameter.h>
00013 #include <shogun/features/Alphabet.h>
00014 #include <shogun/features/StringFeatures.h>
00015 
00016 using namespace shogun;
00017 
00018 CPositionalPWM::CPositionalPWM() : CDistribution(),
00019     m_pwm_rows(0), m_pwm_cols(0), m_pwm(NULL),
00020     m_sigma(0), m_mean(0),
00021     m_w_rows(0), m_w_cols(0), m_w(NULL), m_poim(NULL)
00022 
00023 {
00024     register_params();
00025 }
00026 
00027 CPositionalPWM::~CPositionalPWM()
00028 {
00029     SG_FREE(m_pwm);
00030     SG_FREE(m_w);
00031 }
00032 
00033 bool CPositionalPWM::train(CFeatures* data)
00034 {
00035     SG_NOTIMPLEMENTED;
00036     return true;
00037 }
00038 
00039 int32_t CPositionalPWM::get_num_model_parameters()
00040 {
00041     return m_pwm_rows*m_pwm_cols+2;
00042 }
00043 
00044 float64_t CPositionalPWM::get_log_model_parameter(int32_t num_param)
00045 {
00046     ASSERT(num_param>0 && num_param<=m_pwm_rows*m_pwm_cols+2);
00047 
00048     if (num_param<m_pwm_rows*m_pwm_cols)
00049     {
00050         ASSERT(m_pwm);
00051         return m_pwm[num_param];
00052     }
00053     else if (num_param<m_pwm_rows*m_pwm_cols+1)
00054         return CMath::log(m_sigma);
00055     else
00056         return CMath::log(m_mean);
00057 }
00058 
00059 float64_t CPositionalPWM::get_log_derivative(int32_t num_param, int32_t num_example)
00060 {
00061     SG_NOTIMPLEMENTED;
00062     return 0;
00063 }
00064 
00065 float64_t CPositionalPWM::get_log_likelihood_example(int32_t num_example)
00066 {
00067     ASSERT(features);
00068     ASSERT(features->get_feature_class() == C_STRING);
00069     ASSERT(features->get_feature_type()==F_BYTE);
00070 
00071     CStringFeatures<uint8_t>* strs=(CStringFeatures<uint8_t>*) features;
00072 
00073     float64_t lik=0;
00074     int32_t len=0;
00075     bool do_free=false;
00076 
00077     uint8_t* str = strs->get_feature_vector(num_example, len, do_free);
00078 
00079     if (!(m_w && m_w_cols==len))
00080         return 0; //TODO
00081 
00082     for (int32_t i=0; i<len; i++)
00083         lik+=m_w[4*i+str[i]];
00084 
00085     strs->free_feature_vector(str, num_example, do_free);
00086     return lik;
00087 }
00088 
00089 float64_t CPositionalPWM::get_log_likelihood_window(uint8_t* window, int32_t len, float64_t pos)
00090 {
00091     ASSERT(m_pwm_cols == len);
00092     float64_t score = CMath::log(1/(m_sigma*CMath::sqrt(2*M_PI))) -
00093             CMath::sq(pos-m_mean)/(2*CMath::sq(m_sigma));
00094 
00095     for (int32_t i=0; i<m_pwm_cols; i++)
00096         score+=m_pwm[m_pwm_rows*i+window[i]];
00097 
00098     return score;
00099 }
00100 
00101 void CPositionalPWM::compute_w(int32_t num_pos)
00102 {
00103     ASSERT(m_pwm && m_pwm_rows && m_pwm_cols);
00104 
00105     m_w_rows=CMath::pow(m_pwm_rows, m_pwm_cols);
00106     m_w_cols=num_pos;
00107 
00108     SG_FREE(m_w);
00109     m_w=SG_MALLOC(float64_t, m_w_cols*m_w_rows);
00110 
00111     uint8_t* window=SG_MALLOC(uint8_t, m_pwm_cols);
00112     CMath::fill_vector(window, m_pwm_cols, (uint8_t) 0);
00113 
00114     const int32_t last_idx=m_pwm_cols-1;
00115     for (int32_t i=0; i<m_w_rows; i++)
00116     {
00117         for (int32_t j=0; j<m_w_cols; j++)
00118             m_w[j*m_w_rows+i]=get_log_likelihood_window(window, m_pwm_cols, j);
00119 
00120         window[last_idx]++;
00121         int32_t window_ptr=last_idx;
00122         while (window[window_ptr]==m_pwm_rows && window_ptr>0)
00123         {
00124             window[window_ptr]=0;
00125             window_ptr--;
00126             window[window_ptr]++;
00127         }
00128 
00129     }
00130 }
00131 
00132 void CPositionalPWM::register_params()
00133 {
00134     m_parameters->add_vector(&m_poim, &m_poim_len, "poim", "POIM Scoring Matrix");
00135     m_parameters->add_matrix(&m_w, &m_w_rows, &m_w_cols, "w", "Scoring Matrix");
00136     m_parameters->add_matrix(&m_pwm, &m_pwm_rows, &m_pwm_cols, "pwm", "Positional Weight Matrix.");
00137     m_parameters->add(&m_sigma, "sigma", "Standard Deviation.");
00138     m_parameters->add(&m_mean, "mean", "Mean.");
00139 }
00140 
00141 void CPositionalPWM::compute_scoring(int32_t max_degree)
00142 {
00143     ASSERT(m_w);
00144 
00145     int32_t num_feat=m_w_cols;
00146     int32_t num_sym=0;
00147     int32_t order=m_pwm_cols;
00148     int32_t num_words=m_pwm_cols;
00149 
00150     CAlphabet* alpha=new CAlphabet(DNA);
00151     CStringFeatures<uint16_t>* str= new CStringFeatures<uint16_t>(alpha);
00152     int32_t num_bits=alpha->get_num_bits();
00153     str->compute_symbol_mask_table(num_bits);
00154     
00155     for (int32_t i=0; i<order; i++)
00156         num_sym+=CMath::pow((int32_t) num_words,i+1);
00157 
00158     SG_FREE(m_poim);
00159     m_poim_len=num_feat*num_sym;
00160     m_poim=SG_MALLOC(float64_t, num_feat*num_sym);
00161     memset(m_poim,0, size_t(num_feat)*size_t(num_sym));
00162 
00163     uint32_t kmer_mask=0;
00164     uint32_t words=CMath::pow((int32_t) num_words,(int32_t) order);
00165     int32_t offset=0;
00166 
00167     for (int32_t o=0; o<max_degree; o++)
00168     {
00169         float64_t* contrib=&m_poim[offset];
00170         offset+=CMath::pow((int32_t) num_words,(int32_t) o+1);
00171 
00172         kmer_mask=(kmer_mask<<(num_bits)) | str->get_masked_symbols(0xffff, 1);
00173 
00174         for (int32_t p=-o; p<order; p++)
00175         {
00176             int32_t o_sym=0, m_sym=0, il=0,ir=0, jl=0;
00177             uint32_t imer_mask=kmer_mask;
00178             uint32_t jmer_mask=kmer_mask;
00179 
00180             if (p<0)
00181             {
00182                 il=-p;
00183                 m_sym=order-o-p-1;
00184                 o_sym=-p;
00185             }
00186             else if (p<order-o)
00187             {
00188                 ir=p;
00189                 m_sym=order-o-1;
00190             }
00191             else
00192             {
00193                 ir=p;
00194                 m_sym=p;
00195                 o_sym=p-order+o+1;
00196                 jl=order-ir;
00197                 imer_mask=(kmer_mask>>(num_bits*o_sym));
00198                 jmer_mask=(kmer_mask>>(num_bits*jl));
00199             }
00200 
00201             float64_t marginalizer=
00202                 1.0/CMath::pow((int32_t) num_words,(int32_t) m_sym);
00203             
00204             for (uint32_t i=0; i<words; i++)
00205             {
00206                 uint16_t x= ((i << (num_bits*il)) >> (num_bits*ir)) & imer_mask;
00207 
00208                 if (p>=0 && p<order-o)
00209                 {
00210                     contrib[x]+=m_w[m_w_cols*ir+i]*marginalizer;
00211                 }
00212                 else
00213                 {
00214                     for (uint32_t j=0; j< (uint32_t) CMath::pow((int32_t) num_words, (int32_t) o_sym); j++)
00215                     {
00216                         uint32_t c=x | ((j & jmer_mask) << (num_bits*jl));
00217                         contrib[c]+=m_w[m_w_cols*il+i]*marginalizer;
00218                     }
00219                 }
00220             }
00221         }
00222     }
00223 }
00224 
00225 SGMatrix<float64_t> CPositionalPWM::get_scoring(int32_t d)
00226 {
00227     int32_t offs=0;
00228     for (int32_t i=0; i<d-1; i++)
00229         offs+=CMath::pow((int32_t) m_w_rows,i+1);
00230     int32_t rows=CMath::pow((int32_t) m_w_rows,d);
00231     int32_t cols=m_w_cols;
00232     return SGMatrix<float64_t>(&m_poim[offs],rows,cols);
00233 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation