SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
RegulatoryModulesStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2009 Sebastian J. Schultheiss and Soeren Sonnenburg
8  * Copyright (C) 2009 Max-Planck-Society
9  */
10 
11 #include <shogun/lib/common.h>
14 #include <shogun/io/SGIO.h>
15 
16 using namespace shogun;
17 
19 : CStringKernel<char>(0), width(0.0), degree(0), shift(0), window(0),
20  motif_positions_lhs(NULL), motif_positions_rhs(NULL),
21  position_weights(NULL), weights(NULL)
22 {
24 }
25 
27  int32_t size, float64_t w, int32_t d, int32_t s, int32_t wl)
28 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl),
29  motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL)
30 {
32 }
33 
36  float64_t w, int32_t d, int32_t s, int32_t wl, int32_t size)
37 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl),
38  motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL)
39 {
40  set_motif_positions(lpos, rpos);
41  init(lstr,rstr);
43 }
44 
46 {
49 }
50 
51 bool CRegulatoryModulesStringKernel::init(CFeatures* l, CFeatures* r)
52 {
55 
57  SG_ERROR("Number of vectors does not agree (LHS: %d, Motif LHS: %d).\n",
60  SG_ERROR("Number of vectors does not agree (RHS: %d, Motif RHS: %d).\n",
62 
65  return init_normalizer();
66 }
67 
69  CDenseFeatures<uint16_t>* positions_lhs, CDenseFeatures<uint16_t>* positions_rhs)
70 {
71  ASSERT(positions_lhs);
72  ASSERT(positions_rhs);
75  if (positions_lhs->get_num_features() != positions_rhs->get_num_features())
76  SG_ERROR("Number of dimensions does not agree.\n");
77 
78  motif_positions_lhs=positions_lhs;
79  motif_positions_rhs=positions_rhs;
80  SG_REF(positions_lhs);
81  SG_REF(positions_rhs);
82 }
83 
85 {
88 
89  bool free_avec, free_bvec;
90  char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
91  char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
92 
93  int32_t alen_pos, blen_pos;
94  bool afree_pos, bfree_pos;
95  uint16_t* positions_a = motif_positions_lhs->get_feature_vector(idx_a, alen_pos, afree_pos);
96  uint16_t* positions_b = motif_positions_rhs->get_feature_vector(idx_b, blen_pos, bfree_pos);
97  ASSERT(alen_pos==blen_pos);
98  int32_t num_pos=alen_pos;
99 
100 
101  float64_t result_rbf=0;
102  float64_t result_wds=0;
103 
104  for (int32_t p=0; p<num_pos; p++)
105  {
106  result_rbf+=CMath::sq(positions_a[p]-positions_b[p]);
107 
108  for (int32_t p2=0; p2<num_pos; p2++) //p+1 and below * 2
109  result_rbf+=CMath::sq( (positions_a[p]-positions_a[p2]) - (positions_b[p]-positions_b[p2]) );
110 
111  int32_t limit = window;
112  if (window + positions_a[p] > alen)
113  limit = alen - positions_a[p];
114 
115  if (window + positions_b[p] > blen)
116  limit = CMath::min(limit, blen - positions_b[p]);
117 
118  result_wds+=compute_wds(&avec[positions_a[p]], &bvec[positions_b[p]],
119  limit);
120  }
121 
122  float64_t result=exp(-result_rbf/width)+result_wds;
123 
124  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
125  ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
126  ((CDenseFeatures<uint16_t>*) lhs)->free_feature_vector(positions_a, idx_a, afree_pos);
127  ((CDenseFeatures<uint16_t>*) rhs)->free_feature_vector(positions_b, idx_b, bfree_pos);
128 
129  return result;
130 }
131 
133  char* avec, char* bvec, int32_t len)
134 {
135  float64_t* max_shift_vec = SG_MALLOC(float64_t, shift);
136  float64_t sum0=0 ;
137  for (int32_t i=0; i<shift; i++)
138  max_shift_vec[i]=0 ;
139 
140  // no shift
141  for (int32_t i=0; i<len; i++)
142  {
143  if ((position_weights!=NULL) && (position_weights[i]==0.0))
144  continue ;
145 
146  float64_t sumi = 0.0 ;
147  for (int32_t j=0; (j<degree) && (i+j<len); j++)
148  {
149  if (avec[i+j]!=bvec[i+j])
150  break ;
151  sumi += weights[j];
152  }
153  if (position_weights!=NULL)
154  sum0 += position_weights[i]*sumi ;
155  else
156  sum0 += sumi ;
157  } ;
158 
159  for (int32_t i=0; i<len; i++)
160  {
161  for (int32_t k=1; (k<=shift) && (i+k<len); k++)
162  {
163  if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
164  continue ;
165 
166  float64_t sumi1 = 0.0 ;
167  // shift in sequence a
168  for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
169  {
170  if (avec[i+j+k]!=bvec[i+j])
171  break ;
172  sumi1 += weights[j];
173  }
174  float64_t sumi2 = 0.0 ;
175  // shift in sequence b
176  for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
177  {
178  if (avec[i+j]!=bvec[i+j+k])
179  break ;
180  sumi2 += weights[j];
181  }
182  if (position_weights!=NULL)
183  max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
184  else
185  max_shift_vec[k-1] += sumi1 + sumi2 ;
186  } ;
187  }
188 
189  float64_t result = sum0 ;
190  for (int32_t i=0; i<shift; i++)
191  result += max_shift_vec[i]/(2*(i+1)) ;
192 
193  SG_FREE(max_shift_vec);
194  return result ;
195 }
196 
198 {
199  ASSERT(degree>0);
200 
201  SG_FREE(weights);
203 
204  int32_t i;
205  float64_t sum=0;
206  for (i=0; i<degree; i++)
207  {
208  weights[i]=degree-i;
209  sum+=weights[i];
210  }
211 
212  for (i=0; i<degree; i++)
213  weights[i]/=sum;
214 }
215 
217 {
218  SG_ADD(&width, "width", "the width of Gaussian kernel part", MS_AVAILABLE);
219  SG_ADD(&degree, "degree", "the degree of weighted degree kernel part",
220  MS_AVAILABLE);
221  SG_ADD(&shift, "shift",
222  "the shift of weighted degree with shifts kernel part", MS_AVAILABLE);
223  SG_ADD(&window, "window", "the size of window around motifs", MS_AVAILABLE);
224  m_parameters->add_vector((CSGObject***)&motif_positions_lhs, &alen, "motif_positions_lhs", "the matrix of motif positions from sequences left-hand side");
225  m_parameters->add_vector((CSGObject***)&motif_positions_rhs, &blen, "motif_positions_rhs", "the matrix of motif positions from sequences right-hand side");
226  m_parameters->add_vector(&position_weights, &degree, "position_weights", "scaling weights in window");
227  m_parameters->add_vector(&weights, &degree, "weights", "weights of WD kernel");
228 }

SHOGUN Machine Learning Toolbox - Documentation