SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
OligoStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2008 Christian Igel, Tobias Glasmachers
8  * Copyright (C) 2008 Christian Igel, Tobias Glasmachers
9  *
10  * Shogun adjustments (W) 2008-2009 Soeren Sonnenburg
11  * Copyright (C) 2008-2009 Fraunhofer Institute FIRST and Max-Planck-Society
12  *
13  */
17 
18 #include <map>
19 #include <vector>
20 #include <algorithm>
21 
22 using namespace shogun;
23 
25  : CStringKernel<char>()
26 {
27  SG_UNSTABLE("OligoStringKernel");
28  init();
29 }
30 
31 COligoStringKernel::COligoStringKernel(int32_t cache_sz, int32_t kmer_len, float64_t w)
32 : CStringKernel<char>(cache_sz)
33 {
34  SG_UNSTABLE("OligoStringKernel");
35  init();
36 
37  k=kmer_len;
38  width=w;
39 }
40 
42 {
43  cleanup();
44 }
45 
47 {
49  gauss_table=NULL;
51 
53 }
54 
55 bool COligoStringKernel::init(CFeatures* l, CFeatures* r)
56 {
57  cleanup();
58 
60  int32_t max_len=CMath::max(
61  ((CStringFeatures<char>*) l)->get_max_vector_length(),
62  ((CStringFeatures<char>*) r)->get_max_vector_length()
63  );
64 
65  getExpFunctionCache(max_len);
66  return init_normalizer();
67 }
68 
70  const std::string& sequence, uint32_t k_mer_length,
71  const std::string& allowed_characters,
72  std::vector< std::pair<int32_t, float64_t> >& values)
73 {
74  float64_t oligo_value = 0.;
75  float64_t factor = 1.;
76  std::map<std::string::value_type, uint32_t> residue_values;
77  uint32_t counter = 0;
78  uint32_t number_of_residues = allowed_characters.size();
79  uint32_t sequence_length = sequence.size();
80  bool sequence_ok = true;
81 
82  // checking if sequence contains illegal characters
83  for (uint32_t i = 0; i < sequence.size(); ++i)
84  {
85  if (allowed_characters.find(sequence.at(i)) == std::string::npos)
86  sequence_ok = false;
87  }
88 
89  if (sequence_ok && k_mer_length <= sequence_length)
90  {
91  values.resize(sequence_length - k_mer_length + 1,
92  std::pair<int32_t, float64_t>());
93  for (uint32_t i = 0; i < number_of_residues; ++i)
94  {
95  residue_values.insert(std::make_pair(allowed_characters[i], counter));
96  ++counter;
97  }
98  for (int32_t k = k_mer_length - 1; k >= 0; k--)
99  {
100  oligo_value += factor * residue_values[sequence[k]];
101  factor *= number_of_residues;
102  }
103  factor /= number_of_residues;
104  counter = 0;
105  values[counter].first = 1;
106  values[counter].second = oligo_value;
107  ++counter;
108 
109  for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
110  {
111  oligo_value -= factor * residue_values[sequence[j - 1]];
112  oligo_value = oligo_value * number_of_residues +
113  residue_values[sequence[j + k_mer_length - 1]];
114 
115  values[counter].first = j + 1;
116  values[counter].second = oligo_value ;
117  ++counter;
118  }
119  stable_sort(values.begin(), values.end(), cmpOligos_);
120  }
121  else
122  {
123  values.clear();
124  }
125 }
126 
128  const std::vector<std::string>& sequences, uint32_t k_mer_length,
129  const std::string& allowed_characters,
130  std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences)
131 {
132  std::vector< std::pair<int32_t, float64_t> > temp_vector;
133  encoded_sequences.resize(sequences.size(),
134  std::vector< std::pair<int32_t, float64_t> >());
135 
136  for (uint32_t i = 0; i < sequences.size(); ++i)
137  {
138  encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
139  encoded_sequences[i] = temp_vector;
140  }
141 }
142 
143 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length)
144 {
146  gauss_table=SG_MALLOC(float64_t, sequence_length);
147 
148  gauss_table[0] = 1;
149  for (uint32_t i = 1; i < sequence_length; i++)
150  gauss_table[i] = exp((-1.0 / (CMath::sq(width))) * CMath::sq((float64_t) i));
151 
152  gauss_table_len=sequence_length;
153 }
154 
156  const std::vector< std::pair<int32_t, float64_t> >& x,
157  const std::vector< std::pair<int32_t, float64_t> >& y,
158  int32_t max_distance)
159 {
160  float64_t result = 0;
161  int32_t i1 = 0;
162  int32_t i2 = 0;
163  int32_t c1 = 0;
164  uint32_t x_size = x.size();
165  uint32_t y_size = y.size();
166 
167  while ((uint32_t) i1 + 1 < x_size && (uint32_t) i2 + 1 < y_size)
168  {
169  if (x[i1].second == y[i2].second)
170  {
171  if (max_distance < 0
172  || (abs(x[i1].first - y[i2].first)) <= max_distance)
173  {
174  result += gauss_table[abs((x[i1].first - y[i2].first))];
175  if (x[i1].second == x[i1 + 1].second)
176  {
177  i1++;
178  c1++;
179  }
180  else if (y[i2].second == y[i2 + 1].second)
181  {
182  i2++;
183  i1 -= c1;
184  c1 = 0;
185  }
186  else
187  {
188  i1++;
189  i2++;
190  }
191  }
192  else
193  {
194  if (x[i1].first < y[i2].first)
195  {
196  if (x[i1].second == x[i1 + 1].second)
197  {
198  i1++;
199  }
200  else if (y[i2].second == y[i2 + 1].second)
201  {
202  while (y[i2].second == y[i2+1].second)
203  i2++;
204  ++i1;
205  c1 = 0;
206  }
207  else
208  {
209  i1++;
210  i2++;
211  c1 = 0;
212  }
213  }
214  else
215  {
216  i2++;
217  i1 -= c1;
218  c1 = 0;
219  }
220  }
221  }
222  else
223  {
224  if (x[i1].second < y[i2].second)
225  i1++;
226  else
227  i2++;
228  c1 = 0;
229  }
230  }
231  return result;
232 }
233 
234 
235 float64_t COligoStringKernel::compute(int32_t idx_a, int32_t idx_b)
236 {
237  int32_t alen, blen;
238  bool free_a, free_b;
239  char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_a);
240  char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_b);
241  std::vector< std::pair<int32_t, float64_t> > aenc;
242  std::vector< std::pair<int32_t, float64_t> > benc;
243  encodeOligo(std::string(avec, alen), k, "ACGT", aenc);
244  encodeOligo(std::string(bvec, alen), k, "ACGT", benc);
245  float64_t result=kernelOligoFast(aenc, benc);
246  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_a);
247  ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_b);
248  return result;
249 }
250 
251 void COligoStringKernel::init()
252 {
253  k=0;
254  width=0.0;
255  gauss_table=NULL;
256  gauss_table_len=0;
257 
259 
260  SG_ADD(&k, "k", "K-mer length.", MS_AVAILABLE);
261  SG_ADD(&width, "width", "Width of Gaussian.", MS_AVAILABLE);
263  "Gauss Cache Table.");
264 }

SHOGUN Machine Learning Toolbox - Documentation