00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/lib/common.h>
00012 #include <shogun/kernel/string/RegulatoryModulesStringKernel.h>
00013 #include <shogun/features/Features.h>
00014 #include <shogun/io/SGIO.h>
00015
00016 using namespace shogun;
00017
00018 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel()
00019 : CStringKernel<char>(0), width(0.0), degree(0), shift(0), window(0),
00020 motif_positions_lhs(NULL), motif_positions_rhs(NULL),
00021 position_weights(NULL), weights(NULL)
00022 {
00023 register_params();
00024 }
00025
00026 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel(
00027 int32_t size, float64_t w, int32_t d, int32_t s, int32_t wl)
00028 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl),
00029 motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL)
00030 {
00031 register_params();
00032 }
00033
00034 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel(CStringFeatures<char>* lstr, CStringFeatures<char>* rstr,
00035 CDenseFeatures<uint16_t>* lpos, CDenseFeatures<uint16_t>* rpos,
00036 float64_t w, int32_t d, int32_t s, int32_t wl, int32_t size)
00037 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl),
00038 motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL)
00039 {
00040 set_motif_positions(lpos, rpos);
00041 init(lstr,rstr);
00042 register_params();
00043 }
00044
00045 CRegulatoryModulesStringKernel::~CRegulatoryModulesStringKernel()
00046 {
00047 SG_UNREF(motif_positions_lhs);
00048 SG_UNREF(motif_positions_rhs);
00049 }
00050
00051 bool CRegulatoryModulesStringKernel::init(CFeatures* l, CFeatures* r)
00052 {
00053 ASSERT(motif_positions_lhs);
00054 ASSERT(motif_positions_rhs);
00055
00056 if (l->get_num_vectors() != motif_positions_lhs->get_num_vectors())
00057 SG_ERROR("Number of vectors does not agree (LHS: %d, Motif LHS: %d).\n",
00058 l->get_num_vectors(), motif_positions_lhs->get_num_vectors());
00059 if (r->get_num_vectors() != motif_positions_rhs->get_num_vectors())
00060 SG_ERROR("Number of vectors does not agree (RHS: %d, Motif RHS: %d).\n",
00061 r->get_num_vectors(), motif_positions_rhs->get_num_vectors());
00062
00063 set_wd_weights();
00064 CStringKernel<char>::init(l, r);
00065 return init_normalizer();
00066 }
00067
00068 void CRegulatoryModulesStringKernel::set_motif_positions(
00069 CDenseFeatures<uint16_t>* positions_lhs, CDenseFeatures<uint16_t>* positions_rhs)
00070 {
00071 ASSERT(positions_lhs);
00072 ASSERT(positions_rhs);
00073 SG_UNREF(motif_positions_lhs);
00074 SG_UNREF(motif_positions_rhs);
00075 if (positions_lhs->get_num_features() != positions_rhs->get_num_features())
00076 SG_ERROR("Number of dimensions does not agree.\n");
00077
00078 motif_positions_lhs=positions_lhs;
00079 motif_positions_rhs=positions_rhs;
00080 SG_REF(positions_lhs);
00081 SG_REF(positions_rhs);
00082 }
00083
00084 float64_t CRegulatoryModulesStringKernel::compute(int32_t idx_a, int32_t idx_b)
00085 {
00086 ASSERT(motif_positions_lhs);
00087 ASSERT(motif_positions_rhs);
00088
00089 bool free_avec, free_bvec;
00090 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00091 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00092
00093 int32_t alen_pos, blen_pos;
00094 bool afree_pos, bfree_pos;
00095 uint16_t* positions_a = motif_positions_lhs->get_feature_vector(idx_a, alen_pos, afree_pos);
00096 uint16_t* positions_b = motif_positions_rhs->get_feature_vector(idx_b, blen_pos, bfree_pos);
00097 ASSERT(alen_pos==blen_pos);
00098 int32_t num_pos=alen_pos;
00099
00100
00101 float64_t result_rbf=0;
00102 float64_t result_wds=0;
00103
00104 for (int32_t p=0; p<num_pos; p++)
00105 {
00106 result_rbf+=CMath::sq(positions_a[p]-positions_b[p]);
00107
00108 for (int32_t p2=0; p2<num_pos; p2++)
00109 result_rbf+=CMath::sq( (positions_a[p]-positions_a[p2]) - (positions_b[p]-positions_b[p2]) );
00110
00111 int32_t limit = window;
00112 if (window + positions_a[p] > alen)
00113 limit = alen - positions_a[p];
00114
00115 if (window + positions_b[p] > blen)
00116 limit = CMath::min(limit, blen - positions_b[p]);
00117
00118 result_wds+=compute_wds(&avec[positions_a[p]], &bvec[positions_b[p]],
00119 limit);
00120 }
00121
00122 float64_t result=exp(-result_rbf/width)+result_wds;
00123
00124 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00125 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00126 ((CDenseFeatures<uint16_t>*) lhs)->free_feature_vector(positions_a, idx_a, afree_pos);
00127 ((CDenseFeatures<uint16_t>*) rhs)->free_feature_vector(positions_b, idx_b, bfree_pos);
00128
00129 return result;
00130 }
00131
00132 float64_t CRegulatoryModulesStringKernel::compute_wds(
00133 char* avec, char* bvec, int32_t len)
00134 {
00135 float64_t* max_shift_vec = SG_MALLOC(float64_t, shift);
00136 float64_t sum0=0 ;
00137 for (int32_t i=0; i<shift; i++)
00138 max_shift_vec[i]=0 ;
00139
00140
00141 for (int32_t i=0; i<len; i++)
00142 {
00143 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00144 continue ;
00145
00146 float64_t sumi = 0.0 ;
00147 for (int32_t j=0; (j<degree) && (i+j<len); j++)
00148 {
00149 if (avec[i+j]!=bvec[i+j])
00150 break ;
00151 sumi += weights[j];
00152 }
00153 if (position_weights!=NULL)
00154 sum0 += position_weights[i]*sumi ;
00155 else
00156 sum0 += sumi ;
00157 } ;
00158
00159 for (int32_t i=0; i<len; i++)
00160 {
00161 for (int32_t k=1; (k<=shift) && (i+k<len); k++)
00162 {
00163 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00164 continue ;
00165
00166 float64_t sumi1 = 0.0 ;
00167
00168 for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
00169 {
00170 if (avec[i+j+k]!=bvec[i+j])
00171 break ;
00172 sumi1 += weights[j];
00173 }
00174 float64_t sumi2 = 0.0 ;
00175
00176 for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
00177 {
00178 if (avec[i+j]!=bvec[i+j+k])
00179 break ;
00180 sumi2 += weights[j];
00181 }
00182 if (position_weights!=NULL)
00183 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00184 else
00185 max_shift_vec[k-1] += sumi1 + sumi2 ;
00186 } ;
00187 }
00188
00189 float64_t result = sum0 ;
00190 for (int32_t i=0; i<shift; i++)
00191 result += max_shift_vec[i]/(2*(i+1)) ;
00192
00193 SG_FREE(max_shift_vec);
00194 return result ;
00195 }
00196
00197 void CRegulatoryModulesStringKernel::set_wd_weights()
00198 {
00199 ASSERT(degree>0);
00200
00201 SG_FREE(weights);
00202 weights=SG_MALLOC(float64_t, degree);
00203
00204 int32_t i;
00205 float64_t sum=0;
00206 for (i=0; i<degree; i++)
00207 {
00208 weights[i]=degree-i;
00209 sum+=weights[i];
00210 }
00211
00212 for (i=0; i<degree; i++)
00213 weights[i]/=sum;
00214 }
00215
00216 void CRegulatoryModulesStringKernel::register_params()
00217 {
00218 SG_ADD(&width, "width", "the width of Gaussian kernel part", MS_AVAILABLE);
00219 SG_ADD(°ree, "degree", "the degree of weighted degree kernel part",
00220 MS_AVAILABLE);
00221 SG_ADD(&shift, "shift",
00222 "the shift of weighted degree with shifts kernel part", MS_AVAILABLE);
00223 SG_ADD(&window, "window", "the size of window around motifs", MS_AVAILABLE);
00224 m_parameters->add_vector((CSGObject***)&motif_positions_lhs, &alen, "motif_positions_lhs", "the matrix of motif positions from sequences left-hand side");
00225 m_parameters->add_vector((CSGObject***)&motif_positions_rhs, &blen, "motif_positions_rhs", "the matrix of motif positions from sequences right-hand side");
00226 m_parameters->add_vector(&position_weights, °ree, "position_weights", "scaling weights in window");
00227 m_parameters->add_vector(&weights, °ree, "weights", "weights of WD kernel");
00228 }