SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
OligoStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2008 Christian Igel, Tobias Glasmachers
8  * Copyright (C) 2008 Christian Igel, Tobias Glasmachers
9  *
10  * Shogun adjustments (W) 2008-2009,2013 Soeren Sonnenburg
11  * Copyright (C) 2008-2009 Fraunhofer Institute FIRST and Max-Planck-Society
12  * Copyright (C) 2013 Soeren Sonnenburg
13  *
14  */
18 
19 #include <map>
20 #include <vector>
21 #include <algorithm>
22 
23 using namespace shogun;
24 
26  : CStringKernel<char>()
27 {
28  init();
29 }
30 
31 COligoStringKernel::COligoStringKernel(int32_t cache_sz, int32_t kmer_len, float64_t w)
32 : CStringKernel<char>(cache_sz)
33 {
34  init();
35 
36  k=kmer_len;
37  width=w;
38 }
39 
42  int32_t kmer_len, float64_t w)
43 : CStringKernel<char>()
44 {
45  init();
46 
47  k=kmer_len;
48  width=w;
49 
50  init(l, r);
51 }
52 
54 {
55  cleanup();
56 }
57 
59 {
62 }
63 
64 bool COligoStringKernel::init(CFeatures* l, CFeatures* r)
65 {
66  cleanup();
67 
69  int32_t max_len=CMath::max(
70  ((CStringFeatures<char>*) l)->get_max_vector_length(),
71  ((CStringFeatures<char>*) r)->get_max_vector_length()
72  );
73 
74  REQUIRE(k>0, "k must be >0\n")
75  REQUIRE(width>0, "width must be >0\n")
76 
77  getExpFunctionCache(max_len);
78  return init_normalizer();
79 }
80 
82  const std::string& sequence, uint32_t k_mer_length,
83  const std::string& allowed_characters,
84  std::vector< std::pair<int32_t, float64_t> >& values)
85 {
86  float64_t oligo_value = 0.;
87  float64_t factor = 1.;
88  std::map<std::string::value_type, uint32_t> residue_values;
89  uint32_t counter = 0;
90  uint32_t number_of_residues = allowed_characters.size();
91  uint32_t sequence_length = sequence.size();
92  bool sequence_ok = true;
93 
94  // checking if sequence contains illegal characters
95  for (uint32_t i = 0; i < sequence.size(); ++i)
96  {
97  if (allowed_characters.find(sequence.at(i)) == std::string::npos)
98  sequence_ok = false;
99  }
100 
101  if (sequence_ok && k_mer_length <= sequence_length)
102  {
103  values.resize(sequence_length - k_mer_length + 1,
104  std::pair<int32_t, float64_t>());
105  for (uint32_t i = 0; i < number_of_residues; ++i)
106  {
107  residue_values.insert(std::make_pair(allowed_characters[i], counter));
108  ++counter;
109  }
110  for (int32_t k = k_mer_length - 1; k >= 0; k--)
111  {
112  oligo_value += factor * residue_values[sequence[k]];
113  factor *= number_of_residues;
114  }
115  factor /= number_of_residues;
116  counter = 0;
117  values[counter].first = 1;
118  values[counter].second = oligo_value;
119  ++counter;
120 
121  for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
122  {
123  oligo_value -= factor * residue_values[sequence[j - 1]];
124  oligo_value = oligo_value * number_of_residues +
125  residue_values[sequence[j + k_mer_length - 1]];
126 
127  values[counter].first = j + 1;
128  values[counter].second = oligo_value ;
129  ++counter;
130  }
131  stable_sort(values.begin(), values.end(), cmpOligos_);
132  }
133  else
134  {
135  values.clear();
136  }
137 }
138 
140  const std::vector<std::string>& sequences, uint32_t k_mer_length,
141  const std::string& allowed_characters,
142  std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences)
143 {
144  std::vector< std::pair<int32_t, float64_t> > temp_vector;
145  encoded_sequences.resize(sequences.size(),
146  std::vector< std::pair<int32_t, float64_t> >());
147 
148  for (uint32_t i = 0; i < sequences.size(); ++i)
149  {
150  encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
151  encoded_sequences[i] = temp_vector;
152  }
153 }
154 
155 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length)
156 {
157  gauss_table=SGVector<float64_t>(sequence_length);
158 
159  gauss_table[0] = 1;
160  for (uint32_t i = 1; i < sequence_length; i++)
161  gauss_table[i] = exp(-CMath::sq((float64_t) i) / width);
162 }
163 
165  const std::vector< std::pair<int32_t, float64_t> >& x,
166  const std::vector< std::pair<int32_t, float64_t> >& y,
167  int32_t max_distance)
168 {
169  float64_t result = 0;
170  int32_t i1 = 0;
171  int32_t i2 = 0;
172  int32_t c1 = 0;
173  uint32_t x_size = x.size();
174  uint32_t y_size = y.size();
175 
176  while ((uint32_t) i1 + 1 < x_size && (uint32_t) i2 + 1 < y_size)
177  {
178  if (x[i1].second == y[i2].second)
179  {
180  if (max_distance < 0
181  || (abs(x[i1].first - y[i2].first)) <= max_distance)
182  {
183  result += gauss_table[abs((x[i1].first - y[i2].first))];
184  if (x[i1].second == x[i1 + 1].second)
185  {
186  i1++;
187  c1++;
188  }
189  else if (y[i2].second == y[i2 + 1].second)
190  {
191  i2++;
192  i1 -= c1;
193  c1 = 0;
194  }
195  else
196  {
197  i1++;
198  i2++;
199  }
200  }
201  else
202  {
203  if (x[i1].first < y[i2].first)
204  {
205  if (x[i1].second == x[i1 + 1].second)
206  {
207  i1++;
208  }
209  else if (y[i2].second == y[i2 + 1].second)
210  {
211  while (y[i2].second == y[i2].second)
212  i2++;
213  ++i1;
214  c1 = 0;
215  }
216  else
217  {
218  i1++;
219  i2++;
220  c1 = 0;
221  }
222  }
223  else
224  {
225  i2++;
226  i1 -= c1;
227  c1 = 0;
228  }
229  }
230  }
231  else
232  {
233  if (x[i1].second < y[i2].second)
234  i1++;
235  else
236  i2++;
237  c1 = 0;
238  }
239  }
240  return result;
241 }
242 
244  const std::vector< std::pair<int32_t, float64_t> >& x,
245  const std::vector< std::pair<int32_t, float64_t> >& y)
246 {
247  float64_t result = 0;
248  int32_t i1 = 0;
249  int32_t i2 = 0;
250  int32_t c1 = 0;
251  uint32_t x_size = x.size();
252  uint32_t y_size = y.size();
253 
254  while ((uint32_t) i1 < x_size && (uint32_t) i2 < y_size)
255  {
256  if (x[i1].second == y[i2].second)
257  {
258  result += exp(-CMath::sq(x[i1].first - y[i2].first) / width);
259 
260  if (((uint32_t) i1+1) < x_size && x[i1].second == x[i1 + 1].second)
261  {
262  i1++;
263  c1++;
264  }
265  else if (((uint32_t) i2+1) <y_size && y[i2].second == y[i2 + 1].second)
266  {
267  i2++;
268  i1 -= c1;
269  c1 = 0;
270  }
271  else
272  {
273  i1++;
274  i2++;
275  }
276  }
277  else
278  {
279  if (x[i1].second < y[i2].second)
280  i1++;
281  else
282  i2++;
283 
284  c1 = 0;
285  }
286  }
287  return result;
288 }
289 
290 float64_t COligoStringKernel::compute(int32_t idx_a, int32_t idx_b)
291 {
292  int32_t alen, blen;
293  bool free_a, free_b;
294  char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_a);
295  char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_b);
296  std::vector< std::pair<int32_t, float64_t> > aenc;
297  std::vector< std::pair<int32_t, float64_t> > benc;
298  encodeOligo(std::string(avec, alen), k, "ACGT", aenc);
299  encodeOligo(std::string(bvec, alen), k, "ACGT", benc);
300  //float64_t result=kernelOligo(aenc, benc);
301  float64_t result=kernelOligoFast(aenc, benc);
302  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_a);
303  ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_b);
304  return result;
305 }
306 
307 void COligoStringKernel::init()
308 {
309  k=0;
310  width=0.0;
311 
313 
314  SG_ADD(&k, "k", "K-mer length.", MS_AVAILABLE);
315  SG_ADD(&width, "width", "Width of Gaussian.", MS_AVAILABLE);
316  SG_ADD(&gauss_table, "gauss_table", "Gauss Cache Table.", MS_NOT_AVAILABLE);
317 }
virtual void cleanup()
Definition: Kernel.cpp:173
virtual bool init(CFeatures *l, CFeatures *r)
float64_t kernelOligo(const std::vector< std::pair< int32_t, float64_t > > &x, const std::vector< std::pair< int32_t, float64_t > > &y)
returns the value of the oligo kernel for sequences 'x' and 'y'
virtual float64_t compute(int32_t x, int32_t y)
virtual bool set_normalizer(CKernelNormalizer *normalizer)
Definition: Kernel.cpp:150
static T sq(T x)
Definition: Math.h:450
SGVector< float64_t > gauss_table
#define REQUIRE(x,...)
Definition: SGIO.h:206
static void encodeOligo(const std::string &sequence, uint32_t k_mer_length, const std::string &allowed_characters, std::vector< std::pair< int32_t, float64_t > > &values)
encodes the signals of the sequence
double float64_t
Definition: common.h:50
static T max(T a, T b)
Definition: Math.h:168
static void getSequences(const std::vector< std::string > &sequences, uint32_t k_mer_length, const std::string &allowed_characters, std::vector< std::vector< std::pair< int32_t, float64_t > > > &encoded_sequences)
encodes all sequences with the encodeOligo function and stores them in 'encoded_sequences' ...
virtual bool init_normalizer()
Definition: Kernel.cpp:168
CFeatures * rhs
feature vectors to occur on right hand side
Definition: Kernel.h:1062
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
CFeatures * lhs
feature vectors to occur on left hand side
Definition: Kernel.h:1060
The class Features is the base class of all feature objects.
Definition: Features.h:68
#define SG_ADD(...)
Definition: SGObject.h:84
friend class CSqrtDiagKernelNormalizer
Definition: Kernel.h:162
Template class StringKernel, is the base class of all String Kernels.
Definition: StringKernel.h:26
float64_t kernelOligoFast(const std::vector< std::pair< int32_t, float64_t > > &x, const std::vector< std::pair< int32_t, float64_t > > &y, int32_t max_distance=-1)
returns the value of the oligo kernel for sequences 'x' and 'y'

SHOGUN Machine Learning Toolbox - Documentation