RegulatoryModulesStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Sebastian J. Schultheiss and Soeren Sonnenburg
00008  * Copyright (C) 2009 Max-Planck-Society
00009  */
00010 
00011 #include <shogun/lib/common.h>
00012 #include <shogun/kernel/string/RegulatoryModulesStringKernel.h>
00013 #include <shogun/features/Features.h>
00014 #include <shogun/io/SGIO.h>
00015 
00016 using namespace shogun;
00017 
00018 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel()
00019 : CStringKernel<char>(0), width(0.0), degree(0), shift(0), window(0),
00020     motif_positions_lhs(NULL), motif_positions_rhs(NULL),
00021   position_weights(NULL), weights(NULL)
00022 {
00023     register_params();
00024 }
00025 
00026 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel(
00027         int32_t size, float64_t w, int32_t d, int32_t s, int32_t wl)
00028 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl),
00029     motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL)
00030 {
00031     register_params();
00032 }
00033 
00034 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel(CStringFeatures<char>* lstr, CStringFeatures<char>* rstr,
00035         CDenseFeatures<uint16_t>* lpos, CDenseFeatures<uint16_t>* rpos,
00036         float64_t w, int32_t d, int32_t s, int32_t wl, int32_t size)
00037 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl),
00038     motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL)
00039 {
00040     set_motif_positions(lpos, rpos);
00041     init(lstr,rstr);
00042     register_params();
00043 }
00044 
00045 CRegulatoryModulesStringKernel::~CRegulatoryModulesStringKernel()
00046 {
00047     SG_UNREF(motif_positions_lhs);
00048     SG_UNREF(motif_positions_rhs);
00049 }
00050 
00051 bool CRegulatoryModulesStringKernel::init(CFeatures* l, CFeatures* r)
00052 {
00053     ASSERT(motif_positions_lhs);
00054     ASSERT(motif_positions_rhs);
00055 
00056     if (l->get_num_vectors() != motif_positions_lhs->get_num_vectors())
00057         SG_ERROR("Number of vectors does not agree (LHS: %d, Motif LHS: %d).\n",
00058                 l->get_num_vectors(),  motif_positions_lhs->get_num_vectors());
00059     if (r->get_num_vectors() != motif_positions_rhs->get_num_vectors())
00060         SG_ERROR("Number of vectors does not agree (RHS: %d, Motif RHS: %d).\n",
00061                 r->get_num_vectors(), motif_positions_rhs->get_num_vectors());
00062 
00063     set_wd_weights();
00064     CStringKernel<char>::init(l, r);
00065     return init_normalizer();
00066 }
00067 
00068 void CRegulatoryModulesStringKernel::set_motif_positions(
00069         CDenseFeatures<uint16_t>* positions_lhs, CDenseFeatures<uint16_t>* positions_rhs)
00070 {
00071     ASSERT(positions_lhs);
00072     ASSERT(positions_rhs);
00073     SG_UNREF(motif_positions_lhs);
00074     SG_UNREF(motif_positions_rhs);
00075     if (positions_lhs->get_num_features() != positions_rhs->get_num_features())
00076         SG_ERROR("Number of dimensions does not agree.\n");
00077 
00078     motif_positions_lhs=positions_lhs;
00079     motif_positions_rhs=positions_rhs;
00080     SG_REF(positions_lhs);
00081     SG_REF(positions_rhs);
00082 }
00083 
00084 float64_t CRegulatoryModulesStringKernel::compute(int32_t idx_a, int32_t idx_b)
00085 {
00086     ASSERT(motif_positions_lhs);
00087     ASSERT(motif_positions_rhs);
00088 
00089     bool free_avec, free_bvec;
00090     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00091     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00092 
00093     int32_t alen_pos, blen_pos;
00094     bool afree_pos, bfree_pos;
00095     uint16_t* positions_a = motif_positions_lhs->get_feature_vector(idx_a, alen_pos, afree_pos);
00096     uint16_t* positions_b = motif_positions_rhs->get_feature_vector(idx_b, blen_pos, bfree_pos);
00097     ASSERT(alen_pos==blen_pos);
00098     int32_t num_pos=alen_pos;
00099 
00100 
00101     float64_t result_rbf=0;
00102     float64_t result_wds=0;
00103 
00104     for (int32_t p=0; p<num_pos; p++)
00105     {
00106         result_rbf+=CMath::sq(positions_a[p]-positions_b[p]);
00107 
00108         for (int32_t p2=0; p2<num_pos; p2++) //p+1 and below * 2
00109             result_rbf+=CMath::sq( (positions_a[p]-positions_a[p2]) - (positions_b[p]-positions_b[p2]) );
00110 
00111         int32_t limit = window;
00112         if (window + positions_a[p] > alen)
00113             limit = alen - positions_a[p];
00114 
00115         if (window + positions_b[p] > blen)
00116             limit = CMath::min(limit, blen - positions_b[p]);
00117 
00118         result_wds+=compute_wds(&avec[positions_a[p]], &bvec[positions_b[p]],
00119                 limit);
00120     }
00121 
00122     float64_t result=exp(-result_rbf/width)+result_wds;
00123 
00124     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00125     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00126     ((CDenseFeatures<uint16_t>*) lhs)->free_feature_vector(positions_a, idx_a, afree_pos);
00127     ((CDenseFeatures<uint16_t>*) rhs)->free_feature_vector(positions_b, idx_b, bfree_pos);
00128 
00129     return result;
00130 }
00131 
00132 float64_t CRegulatoryModulesStringKernel::compute_wds(
00133     char* avec, char* bvec, int32_t len)
00134 {
00135     float64_t* max_shift_vec = SG_MALLOC(float64_t, shift);
00136     float64_t sum0=0 ;
00137     for (int32_t i=0; i<shift; i++)
00138         max_shift_vec[i]=0 ;
00139 
00140     // no shift
00141     for (int32_t i=0; i<len; i++)
00142     {
00143         if ((position_weights!=NULL) && (position_weights[i]==0.0))
00144             continue ;
00145 
00146         float64_t sumi = 0.0 ;
00147         for (int32_t j=0; (j<degree) && (i+j<len); j++)
00148         {
00149             if (avec[i+j]!=bvec[i+j])
00150                 break ;
00151             sumi += weights[j];
00152         }
00153         if (position_weights!=NULL)
00154             sum0 += position_weights[i]*sumi ;
00155         else
00156             sum0 += sumi ;
00157     } ;
00158 
00159     for (int32_t i=0; i<len; i++)
00160     {
00161         for (int32_t k=1; (k<=shift) && (i+k<len); k++)
00162         {
00163             if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00164                 continue ;
00165 
00166             float64_t sumi1 = 0.0 ;
00167             // shift in sequence a
00168             for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
00169             {
00170                 if (avec[i+j+k]!=bvec[i+j])
00171                     break ;
00172                 sumi1 += weights[j];
00173             }
00174             float64_t sumi2 = 0.0 ;
00175             // shift in sequence b
00176             for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
00177             {
00178                 if (avec[i+j]!=bvec[i+j+k])
00179                     break ;
00180                 sumi2 += weights[j];
00181             }
00182             if (position_weights!=NULL)
00183                 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00184             else
00185                 max_shift_vec[k-1] += sumi1 + sumi2 ;
00186         } ;
00187     }
00188 
00189     float64_t result = sum0 ;
00190     for (int32_t i=0; i<shift; i++)
00191         result += max_shift_vec[i]/(2*(i+1)) ;
00192 
00193     SG_FREE(max_shift_vec);
00194     return result ;
00195 }
00196 
00197 void CRegulatoryModulesStringKernel::set_wd_weights()
00198 {
00199     ASSERT(degree>0);
00200 
00201     SG_FREE(weights);
00202     weights=SG_MALLOC(float64_t, degree);
00203 
00204     int32_t i;
00205     float64_t sum=0;
00206     for (i=0; i<degree; i++)
00207     {
00208         weights[i]=degree-i;
00209         sum+=weights[i];
00210     }
00211 
00212     for (i=0; i<degree; i++)
00213         weights[i]/=sum;
00214 }
00215 
00216 void CRegulatoryModulesStringKernel::register_params()
00217 {
00218     SG_ADD(&width, "width", "the width of Gaussian kernel part", MS_AVAILABLE);
00219     SG_ADD(&degree, "degree", "the degree of weighted degree kernel part",
00220         MS_AVAILABLE);
00221     SG_ADD(&shift, "shift",
00222         "the shift of weighted degree with shifts kernel part", MS_AVAILABLE);
00223     SG_ADD(&window, "window", "the size of window around motifs", MS_AVAILABLE);
00224     m_parameters->add_vector((CSGObject***)&motif_positions_lhs, &alen, "motif_positions_lhs", "the matrix of motif positions from sequences left-hand side");
00225     m_parameters->add_vector((CSGObject***)&motif_positions_rhs, &blen, "motif_positions_rhs", "the matrix of motif positions from sequences right-hand side");
00226     m_parameters->add_vector(&position_weights, &degree, "position_weights", "scaling weights in window");
00227     m_parameters->add_vector(&weights, &degree, "weights", "weights of WD kernel");
00228 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation