00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/lib/common.h>
00012 #include <shogun/kernel/RegulatoryModulesStringKernel.h>
00013 #include <shogun/features/Features.h>
00014 #include <shogun/features/SimpleFeatures.h>
00015 #include <shogun/io/SGIO.h>
00016
00017 using namespace shogun;
00018
00019 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel()
00020 : CStringKernel<char>(0), width(0.0), degree(0), shift(0), window(0),
00021 motif_positions_lhs(NULL), motif_positions_rhs(NULL),
00022 position_weights(NULL), weights(NULL)
00023 {
00024 register_params();
00025 }
00026
00027 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel(
00028 int32_t size, float64_t w, int32_t d, int32_t s, int32_t wl)
00029 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl),
00030 motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL)
00031 {
00032 register_params();
00033 }
00034
00035 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel(CStringFeatures<char>* lstr, CStringFeatures<char>* rstr,
00036 CSimpleFeatures<uint16_t>* lpos, CSimpleFeatures<uint16_t>* rpos,
00037 float64_t w, int32_t d, int32_t s, int32_t wl, int32_t size)
00038 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl),
00039 motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL)
00040 {
00041 set_motif_positions(lpos, rpos);
00042 init(lstr,rstr);
00043 register_params();
00044 }
00045
00046 CRegulatoryModulesStringKernel::~CRegulatoryModulesStringKernel()
00047 {
00048 SG_UNREF(motif_positions_lhs);
00049 SG_UNREF(motif_positions_rhs);
00050 }
00051
00052 bool CRegulatoryModulesStringKernel::init(CFeatures* l, CFeatures* r)
00053 {
00054 ASSERT(motif_positions_lhs);
00055 ASSERT(motif_positions_rhs);
00056
00057 if (l->get_num_vectors() != motif_positions_lhs->get_num_vectors())
00058 SG_ERROR("Number of vectors does not agree (LHS: %d, Motif LHS: %d).\n",
00059 l->get_num_vectors(), motif_positions_lhs->get_num_vectors());
00060 if (r->get_num_vectors() != motif_positions_rhs->get_num_vectors())
00061 SG_ERROR("Number of vectors does not agree (RHS: %d, Motif RHS: %d).\n",
00062 r->get_num_vectors(), motif_positions_rhs->get_num_vectors());
00063
00064 set_wd_weights();
00065 CStringKernel<char>::init(l, r);
00066 return init_normalizer();
00067 }
00068
00069 void CRegulatoryModulesStringKernel::set_motif_positions(
00070 CSimpleFeatures<uint16_t>* positions_lhs, CSimpleFeatures<uint16_t>* positions_rhs)
00071 {
00072 ASSERT(positions_lhs);
00073 ASSERT(positions_rhs);
00074 SG_UNREF(motif_positions_lhs);
00075 SG_UNREF(motif_positions_rhs);
00076 if (positions_lhs->get_num_features() != positions_rhs->get_num_features())
00077 SG_ERROR("Number of dimensions does not agree.\n");
00078
00079 motif_positions_lhs=positions_lhs;
00080 motif_positions_rhs=positions_rhs;
00081 SG_REF(positions_lhs);
00082 SG_REF(positions_rhs);
00083 }
00084
00085 float64_t CRegulatoryModulesStringKernel::compute(int32_t idx_a, int32_t idx_b)
00086 {
00087 ASSERT(motif_positions_lhs);
00088 ASSERT(motif_positions_rhs);
00089
00090 bool free_avec, free_bvec;
00091 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00092 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00093
00094 int32_t alen_pos, blen_pos;
00095 bool afree_pos, bfree_pos;
00096 uint16_t* positions_a = motif_positions_lhs->get_feature_vector(idx_a, alen_pos, afree_pos);
00097 uint16_t* positions_b = motif_positions_rhs->get_feature_vector(idx_b, blen_pos, bfree_pos);
00098 ASSERT(alen_pos==blen_pos);
00099 int32_t num_pos=alen_pos;
00100
00101
00102 float64_t result_rbf=0;
00103 float64_t result_wds=0;
00104
00105 for (int32_t p=0; p<num_pos; p++)
00106 {
00107 result_rbf+=CMath::sq(positions_a[p]-positions_b[p]);
00108
00109 for (int32_t p2=0; p2<num_pos; p2++)
00110 result_rbf+=CMath::sq( (positions_a[p]-positions_a[p2]) - (positions_b[p]-positions_b[p2]) );
00111
00112 int32_t limit = window;
00113 if (window + positions_a[p] > alen)
00114 limit = alen - positions_a[p];
00115
00116 if (window + positions_b[p] > blen)
00117 limit = CMath::min(limit, blen - positions_b[p]);
00118
00119 result_wds+=compute_wds(&avec[positions_a[p]], &bvec[positions_b[p]],
00120 limit);
00121 }
00122
00123 float64_t result=exp(-result_rbf/width)+result_wds;
00124
00125 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00126 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00127 ((CSimpleFeatures<uint16_t>*) lhs)->free_feature_vector(positions_a, idx_a, afree_pos);
00128 ((CSimpleFeatures<uint16_t>*) rhs)->free_feature_vector(positions_b, idx_b, bfree_pos);
00129
00130 return result;
00131 }
00132
00133 float64_t CRegulatoryModulesStringKernel::compute_wds(
00134 char* avec, char* bvec, int32_t len)
00135 {
00136 float64_t* max_shift_vec = SG_MALLOC(float64_t, shift);
00137 float64_t sum0=0 ;
00138 for (int32_t i=0; i<shift; i++)
00139 max_shift_vec[i]=0 ;
00140
00141
00142 for (int32_t i=0; i<len; i++)
00143 {
00144 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00145 continue ;
00146
00147 float64_t sumi = 0.0 ;
00148 for (int32_t j=0; (j<degree) && (i+j<len); j++)
00149 {
00150 if (avec[i+j]!=bvec[i+j])
00151 break ;
00152 sumi += weights[j];
00153 }
00154 if (position_weights!=NULL)
00155 sum0 += position_weights[i]*sumi ;
00156 else
00157 sum0 += sumi ;
00158 } ;
00159
00160 for (int32_t i=0; i<len; i++)
00161 {
00162 for (int32_t k=1; (k<=shift) && (i+k<len); k++)
00163 {
00164 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00165 continue ;
00166
00167 float64_t sumi1 = 0.0 ;
00168
00169 for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
00170 {
00171 if (avec[i+j+k]!=bvec[i+j])
00172 break ;
00173 sumi1 += weights[j];
00174 }
00175 float64_t sumi2 = 0.0 ;
00176
00177 for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
00178 {
00179 if (avec[i+j]!=bvec[i+j+k])
00180 break ;
00181 sumi2 += weights[j];
00182 }
00183 if (position_weights!=NULL)
00184 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00185 else
00186 max_shift_vec[k-1] += sumi1 + sumi2 ;
00187 } ;
00188 }
00189
00190 float64_t result = sum0 ;
00191 for (int32_t i=0; i<shift; i++)
00192 result += max_shift_vec[i]/(2*(i+1)) ;
00193
00194 SG_FREE(max_shift_vec);
00195 return result ;
00196 }
00197
00198 void CRegulatoryModulesStringKernel::set_wd_weights()
00199 {
00200 ASSERT(degree>0);
00201
00202 SG_FREE(weights);
00203 weights=SG_MALLOC(float64_t, degree);
00204
00205 int32_t i;
00206 float64_t sum=0;
00207 for (i=0; i<degree; i++)
00208 {
00209 weights[i]=degree-i;
00210 sum+=weights[i];
00211 }
00212
00213 for (i=0; i<degree; i++)
00214 weights[i]/=sum;
00215 }
00216
00217 void CRegulatoryModulesStringKernel::register_params()
00218 {
00219 m_parameters->add(&width, "width", "the width of Gaussian kernel part");
00220 m_parameters->add(°ree, "degree", "the degree of weighted degree kernel part");
00221 m_parameters->add(&shift, "shift", "the shift of weighted degree with shifts kernel part");
00222 m_parameters->add(&window, "window", "the size of window around motifs");
00223 m_parameters->add_vector((CSGObject***)&motif_positions_lhs, &alen, "motif_positions_lhs", "the matrix of motif positions from sequences left-hand side");
00224 m_parameters->add_vector((CSGObject***)&motif_positions_rhs, &blen, "motif_positions_rhs", "the matrix of motif positions from sequences right-hand side");
00225 m_parameters->add_vector(&position_weights, °ree, "position_weights", "scaling weights in window");
00226 m_parameters->add_vector(&weights, °ree, "weights", "weights of WD kernel");
00227 }