SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
SpectrumMismatchRBFKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #include <vector>
13 
14 #include <shogun/lib/common.h>
15 #include <shogun/io/SGIO.h>
16 #include <shogun/lib/Signal.h>
17 #include <shogun/lib/Trie.h>
18 #include <shogun/base/Parallel.h>
19 
23 
24 
25 #include <vector>
26 #include <string>
27 
28 #include <assert.h>
29 
30 #ifndef WIN32
31 #include <pthread.h>
32 #endif
33 
34 using namespace shogun;
35 
37  :CStringKernel<char>(0)
38 {
39  init();
41 }
42 
44  float64_t* AA_matrix_, int32_t nr, int32_t nc,
45  int32_t degree_, int32_t max_mismatch_, float64_t width_) : CStringKernel<char>(size),
46  alphabet(NULL), degree(degree_), max_mismatch(max_mismatch_), width(width_)
47 {
48  lhs=NULL;
49  rhs=NULL;
50 
51  target_letter_0=-1 ;
52 
53  AA_matrix=NULL;
54  set_AA_matrix(AA_matrix_, nr, nc);
56 }
57 
59  CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t nr, int32_t nc, int32_t degree_, int32_t max_mismatch_, float64_t width_)
60 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), max_mismatch(max_mismatch_), width(width_)
61 {
62  target_letter_0=-1 ;
63 
64  AA_matrix=NULL;
65  set_AA_matrix(AA_matrix_, nr, nc);
66  init(l, r);
68 }
69 
71 {
72  cleanup();
74 }
75 
76 
78 {
79 
81 }
82 
83 bool CSpectrumMismatchRBFKernel::init(CFeatures* l, CFeatures* r)
84 {
85  int32_t lhs_changed=(lhs!=l);
86  int32_t rhs_changed=(rhs!=r);
87 
89 
90  SG_DEBUG("lhs_changed: %i\n", lhs_changed);
91  SG_DEBUG("rhs_changed: %i\n", rhs_changed);
92 
95 
97  alphabet=sf_l->get_alphabet();
98  CAlphabet* ralphabet=sf_r->get_alphabet();
99 
100  if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
101  properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
102 
103  ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
104  SG_UNREF(ralphabet);
105 
106  compute_all() ;
107 
108  return init_normalizer();
109 }
110 
112 {
113 
115  alphabet=NULL;
116 
118 }
119 
120 float64_t CSpectrumMismatchRBFKernel::AA_helper(std::string &path, const char* joint_seq, unsigned int index)
121 {
122  float64_t diff=0.0 ;
123 
124  for (unsigned int i=0; i<path.size(); i++)
125  {
126  if (path[i]!=joint_seq[index+i])
127  {
128  diff += AA_matrix[ (path[i]-1)*128 + path[i] - 1] ;
129  diff -= 2*AA_matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
130  diff += AA_matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
131  }
132  }
133 
134  return exp( - diff/width) ;
135 }
136 
137 /*
138 float64_t CSpectrumMismatchRBFKernel::compute_helper(const char* joint_seq,
139  std::vector<unsigned int> joint_index, std::vector<unsigned int> joint_mismatch,
140  std::string path, unsigned int d,
141  const int & alen)
142 {
143  const char* AA = "ACDEFGHIKLMNPQRSTVWY" ;
144  const unsigned int num_AA = strlen(AA) ;
145 
146  assert(path.size()==d) ;
147  assert(joint_mismatch.size()==joint_index.size()) ;
148 
149  float64_t res = 0.0 ;
150 
151  for (unsigned int i=0; i<num_AA; i++)
152  {
153  std::vector<unsigned int> joint_mismatch_ ;
154  std::vector<unsigned int> joint_index_ ;
155 
156  for (unsigned int j=0; j<joint_index.size(); j++)
157  {
158  if (joint_seq[joint_index[j]+d] != AA[i])
159  {
160  if (joint_mismatch[j]+1 <= (unsigned int) max_mismatch)
161  {
162  joint_mismatch_.push_back(joint_mismatch[j]+1) ;
163  joint_index_.push_back(joint_index[j]) ;
164  }
165  }
166  else
167  {
168  joint_mismatch_.push_back(joint_mismatch[j]) ;
169  joint_index_.push_back(joint_index[j]) ;
170  }
171  }
172  if (joint_mismatch_.size()>0)
173  {
174  std::string path_ = path + AA[i] ;
175 
176  if (d+1 < (unsigned int) degree)
177  {
178  res += compute_helper(joint_seq, joint_index_, joint_mismatch_, path_, d+1, alen) ;
179  }
180  else
181  {
182  int anum=0, bnum=0;
183  for (unsigned int j=0; j<joint_index_.size(); j++)
184  if (joint_index_[j] < (unsigned int)alen)
185  {
186  if (1)
187  {
188  anum++ ;
189  if (joint_mismatch_[j]==0)
190  anum+=3 ;
191  }
192  else
193  {
194  if (joint_mismatch_[j]!=0)
195  anum += AA_helper(path_, joint_seq, joint_index_[j]) ;
196  else
197  anum++ ;
198  }
199  }
200  else
201  {
202  if (1)
203  {
204  bnum++ ;
205  if (joint_mismatch_[j]==0)
206  bnum+=3 ;
207  }
208  else
209  {
210  if (joint_mismatch_[j]!=0)
211  bnum += AA_helper(path_, joint_seq, joint_index_[j]) ;
212  else
213  bnum++ ;
214  }
215  }
216 
217  //fprintf(stdout, "%s: %i x %i\n", path_.c_str(), anum, bnum) ;
218 
219  res+= anum*bnum ;
220  }
221  }
222  }
223  return res ;
224 }
225 */
226 
228  std::vector<struct joint_list_struct> &joint_list,
229  std::string path, unsigned int d)
230 {
231  const char* AA = "ACDEFGHIKLMNPQRSTVWY" ;
232  const unsigned int num_AA = strlen(AA) ;
233 
234  assert(path.size()==d) ;
235 
236  for (unsigned int i=0; i<num_AA; i++)
237  {
238  std::vector<struct joint_list_struct> joint_list_ ;
239 
240  if (d==0)
241  fprintf(stderr, "i=%i: ", i) ;
242  if (d==0 && target_letter_0!=-1 && (int)i != target_letter_0 )
243  continue ;
244 
245  if (d==1)
246  {
247  fprintf(stdout, "*") ;
248  fflush(stdout) ;
249  }
250  if (d==2)
251  {
252  fprintf(stdout, "+") ;
253  fflush(stdout) ;
254  }
255 
256  for (unsigned int j=0; j<joint_list.size(); j++)
257  {
258  if (joint_seq[joint_list[j].index+d] != AA[i])
259  {
260  if (joint_list[j].mismatch+1 <= (unsigned int) max_mismatch)
261  {
262  struct joint_list_struct list_item ;
263  list_item = joint_list[j] ;
264  list_item.mismatch = joint_list[j].mismatch+1 ;
265  joint_list_.push_back(list_item) ;
266  }
267  }
268  else
269  joint_list_.push_back(joint_list[j]) ;
270  }
271 
272  if (joint_list_.size()>0)
273  {
274  std::string path_ = path + AA[i] ;
275 
276  if (d+1 < (unsigned int) degree)
277  {
278  compute_helper_all(joint_seq, joint_list_, path_, d+1) ;
279  }
280  else
281  {
284  feats.set_const(0) ;
285 
286  for (unsigned int j=0; j<joint_list_.size(); j++)
287  {
288  if (width==0.0)
289  {
290  feats[joint_list_[j].ex_index]++ ;
291  //if (joint_mismatch_[j]==0)
292  // feats[joint_ex_index_[j]]+=3 ;
293  }
294  else
295  {
296  if (joint_list_[j].mismatch!=0)
297  feats[joint_list_[j].ex_index] += AA_helper(path_, joint_seq, joint_list_[j].index) ;
298  else
299  feats[joint_list_[j].ex_index] ++ ;
300  }
301  }
302 
303  std::vector<int> idx ;
304  for (int r=0; r<feats.get_array_size(); r++)
305  if (feats[r]!=0.0)
306  idx.push_back(r) ;
307 
308  for (unsigned int r=0; r<idx.size(); r++)
309  for (unsigned int s=r; s<idx.size(); s++)
310  if (s==r)
311  kernel_matrix.set_element(feats[idx[r]]*feats[idx[s]] + kernel_matrix.get_element(idx[r],idx[s]), idx[r], idx[s]) ;
312  else
313  {
314  kernel_matrix.set_element(feats[idx[r]]*feats[idx[s]] + kernel_matrix.get_element(idx[r],idx[s]), idx[r], idx[s]) ;
315  kernel_matrix.set_element(feats[idx[r]]*feats[idx[s]] + kernel_matrix.get_element(idx[s],idx[r]), idx[s], idx[r]) ;
316  }
317  }
318  }
319  if (d==0)
320  fprintf(stdout, "\n") ;
321  }
322 }
323 
325 {
326  std::string joint_seq ;
327  std::vector<struct joint_list_struct> joint_list ;
328 
329  assert(lhs->get_num_vectors()==rhs->get_num_vectors()) ;
332  for (int i=0; i<lhs->get_num_vectors(); i++)
333  for (int j=0; j<lhs->get_num_vectors(); j++)
334  kernel_matrix.set_element(0, i, j) ;
335 
336  for (int i=0; i<lhs->get_num_vectors(); i++)
337  {
338  int32_t alen ;
339  bool free_avec ;
340  char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(i, alen, free_avec);
341 
342  for (int apos=0; apos+degree-1<alen; apos++)
343  {
344  struct joint_list_struct list_item ;
345  list_item.ex_index = i ;
346  list_item.index = apos+joint_seq.size() ;
347  list_item.mismatch = 0 ;
348 
349  joint_list.push_back(list_item) ;
350  }
351  joint_seq += std::string(avec, alen) ;
352 
353  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, i, free_avec);
354  }
355 
356  compute_helper_all(joint_seq.c_str(), joint_list, "", 0) ;
357 }
358 
359 
360 float64_t CSpectrumMismatchRBFKernel::compute(int32_t idx_a, int32_t idx_b)
361 {
362  return kernel_matrix.element(idx_a, idx_b) ;
363 }
364 /*
365 bool CSpectrumMismatchRBFKernel::set_weights(
366  float64_t* ws, int32_t d, int32_t len)
367 {
368  if (d==128 && len==128)
369  {
370  SG_DEBUG("Setting AA_matrix\n") ;
371  memcpy(AA_matrix, ws, 128*128*sizeof(float64_t)) ;
372  return true ;
373  }
374 
375  if (d==1 && len==1)
376  {
377  sigma=ws[0] ;
378  SG_DEBUG("Setting sigma to %e\n", sigma) ;
379  return true ;
380  }
381 
382  if (d==2 && len==2)
383  {
384  target_letter_0=ws[0] ;
385  SG_DEBUG("Setting target letter to %c\n", target_letter_0) ;
386  return true ;
387  }
388 
389  if (d!=degree || len<1)
390  SG_ERROR("Dimension mismatch (should be de(seq_length | 1) x degree)\n");
391 
392  length=len;
393 
394  if (length==0)
395  length=1;
396 
397  int32_t num_weights=degree*(length+max_mismatch);
398  SG_FREE(weights);
399  weights=SG_MALLOC(float64_t, num_weights);
400 
401  if (weights)
402  {
403  for (int32_t i=0; i<num_weights; i++) {
404  if (ws[i]) // len(ws) might be != num_weights?
405  weights[i]=ws[i];
406  }
407  return true;
408  }
409  else
410  return false;
411 }
412 */
413 
414 bool CSpectrumMismatchRBFKernel::set_AA_matrix(float64_t* AA_matrix_, int32_t nr, int32_t nc)
415 {
416  if (AA_matrix_)
417  {
418  if (nr!=128 || nc!=128)
419  SG_ERROR("AA_matrix should be of shape 128x128\n");
421  AA_matrix=SG_MALLOC(float64_t, nc*nr);
422  memcpy(AA_matrix, AA_matrix_, nc*nr*sizeof(float64_t)) ;
423  SG_DEBUG("Setting AA_matrix\n") ;
424  memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
425  return true ;
426  }
427 
428  return false;
429 }
430 
432 {
433  max_mismatch=max;
434 
435  if (lhs!=NULL && rhs!=NULL)
436  return init(lhs, rhs);
437  else
438  return true;
439 }
440 
442 {
443  SG_ADD(&degree, "degree", "degree of the kernel", MS_AVAILABLE);
444  SG_ADD(&AA_matrix_length, "AA_matrix_length",
445  "the length of AA matrix", MS_NOT_AVAILABLE);
447  "128*128 scalar product matrix");
448  SG_ADD(&width,"width", "width of Gaussian", MS_AVAILABLE);
449  SG_ADD(&target_letter_0, "target_letter_0", "target letter 0",
451  SG_ADD(&initialized, "initialized",
452  "the mark of initialization status", MS_NOT_AVAILABLE);
454  &kernel_matrix_length, "kernel_matrix",
455  "the kernel matrix with its length defined by the number of "
456  "vectors of the string features");
457 }
458 
460 {
461  SG_ADD((CSGObject**)&alphabet, "alphabet", "the alphabet used by kernel",
463 }
464 
465 void CSpectrumMismatchRBFKernel::init()
466 {
467  alphabet = NULL;
468  degree = 0;
469  max_mismatch = 0;
470  AA_matrix = NULL;
471  width = 0.0;
472 
473  initialized = false;
474  target_letter_0 = 0;
475 }
476 

SHOGUN Machine Learning Toolbox - Documentation