SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
SpectrumRBFKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #include <vector>
13 
14 #include <shogun/lib/common.h>
15 #include <shogun/io/SGIO.h>
16 #include <shogun/lib/Signal.h>
17 #include <shogun/lib/Trie.h>
18 #include <shogun/base/Parallel.h>
19 
23 #include <math.h>
24 
25 #include <vector>
26 #include <string>
27 #include <fstream>
28 #include <cmath>
29 
30 #include <assert.h>
31 
32 #ifdef HAVE_PTHREAD
33 #include <pthread.h>
34 #endif
35 
36 
37 using namespace shogun;
38 
40  : CStringKernel<char>(0)
41 {
42  init();
44 }
45 
46 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_)
47  : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
48 {
49  lhs=NULL;
50  rhs=NULL;
51 
52  target_letter_0=-1 ;
53 
54  AA_matrix=SG_MALLOC(float64_t, 128*128);
55 
56 
57  memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
58 
60  SGStringList<char> string_list;
61  string_list.strings = sequences;
62  string_list.num_strings = nof_sequences;
64 
65  //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
69 }
70 
72  CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_)
73 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
74 {
75  target_letter_0=-1 ;
76 
77  AA_matrix=SG_MALLOC(float64_t, 128*128);
78  memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
79 
80  init(l, r);
82 }
83 
85 {
86  cleanup();
88 }
89 
90 
92 {
93 
95 }
96 
98 {
99 
100  int32_t aa_to_index[128];//profile
101  aa_to_index[(uint8_t) 'A'] = 0;
102  aa_to_index[(uint8_t) 'R'] = 1;
103  aa_to_index[(uint8_t) 'N'] = 2;
104  aa_to_index[(uint8_t) 'D'] = 3;
105  aa_to_index[(uint8_t) 'C'] = 4;
106  aa_to_index[(uint8_t) 'Q'] = 5;
107  aa_to_index[(uint8_t) 'E'] = 6;
108  aa_to_index[(uint8_t) 'G'] = 7;
109  aa_to_index[(uint8_t) 'H'] = 8;
110  aa_to_index[(uint8_t) 'I'] = 9;
111  aa_to_index[(uint8_t) 'L'] = 10;
112  aa_to_index[(uint8_t) 'K'] = 11;
113  aa_to_index[(uint8_t) 'M'] = 12;
114  aa_to_index[(uint8_t) 'F'] = 13;
115  aa_to_index[(uint8_t) 'P'] = 14;
116  aa_to_index[(uint8_t) 'S'] = 15;
117  aa_to_index[(uint8_t) 'T'] = 16;
118  aa_to_index[(uint8_t) 'W'] = 17;
119  aa_to_index[(uint8_t) 'Y'] = 18;
120  aa_to_index[(uint8_t) 'V'] = 19;
121  SG_DEBUG("initializing background\n");
122  double background[20]; // profile
123  background[0]=0.0799912015849807; //A
124  background[1]=0.0484482507611578;//R
125  background[2]=0.044293531582512;//N
126  background[3]=0.0578891399707563;//D
127  background[4]=0.0171846021407367;//C
128  background[5]=0.0380578923048682;//Q
129  background[6]=0.0638169929675978;//E
130  background[7]=0.0760659374742852;//G
131  background[8]=0.0223465499452473;//H
132  background[9]=0.0550905793661343;//I
133  background[10]=0.0866897071203864;//L
134  background[11]=0.060458245507428;//K
135  background[12]=0.0215379186368154;//M
136  background[13]=0.0396348024787477;//F
137  background[14]=0.0465746314476874;//P
138  background[15]=0.0630028230885602;//S
139  background[16]=0.0580394726014824;//T
140  background[17]=0.0144991866213453;//W
141  background[18]=0.03635438623143;//Y
142  background[19]=0.0700241481678408;//V
143 
144 
145  std::vector<std::string> seqs;
146  //int32_t nof_sequences = 7329;
147 
148  double C = 0.8;
149  const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles";
150  std::ifstream fin(filename);
151 
152  SG_DEBUG("Reading profiles from %s\n", filename);
153  std::string line;
154  while (!fin.eof())
155  {
156  std::getline(fin, line);
157 
158  if (line[0] == '>') // new sequence
159  {
160  int idx = line.find_first_of(' ');
161  sequence_labels.push_back(line.substr(1,idx-1));
162  std::getline(fin, line);
163  std::string orig_sequence = line;
164  std::string sequence="";
165 
166  int len_line = line.length();
167 
168  // skip 3 lines
169 
170  std::getline(fin, line);
171  std::getline(fin, line);
172  std::getline(fin, line);
173 
174  profiles.push_back(std::vector<double>());
175 
176  std::vector<double>& curr_profile = profiles.back();
177  for (int i=0; i < len_line; ++i)
178  {
179  std::getline(fin, line);
180  int a = line.find_first_not_of(' '); // index position
181  int b = line.find_first_of(' ', a); // index position
182  a = line.find_first_not_of(' ', b); // aa position
183  b = line.find_first_of(' ', a); // aa position
184  std::string aa=line.substr(a,b-a);
185  if (0) //(aa =="B" || aa == "X" || aa == "Z")
186  {
187  int pos = seqs.size()+1;
188  SG_DEBUG("Skipping aa in sequence %d\n", pos);
189  continue;
190  }
191  else
192  {
193  sequence += aa;
194 
195  a = line.find_first_not_of(' ', b); // beginning of block to ignore
196  b = line.find_first_of(' ', a); // aa position
197 
198  for (int j=0; j < 19; ++j)
199  {
200  a = line.find_first_not_of(' ', b);
201  b = line.find_first_of(' ', a);
202  }
203 
204  int all_zeros = 1;
205  // interesting block
206  for (int j=0; j < 20; ++j)
207  {
208  a = line.find_first_not_of(' ', b);
209  b = line.find_first_of(' ', a);
210  double p = atof(line.substr(a, b-a).c_str());
211  if (p > 0)
212  {
213  all_zeros = 0;
214  }
215  double value = -1* std::log(C*(p/100)+(1-C)*background[j]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
216  curr_profile.push_back(value);
217  //SG_DEBUG("seq %d aa %d value %f p %f bg %f\n", i, j, value,p, background[j]);
218  }
219 
220  if (all_zeros)
221  {
222  SG_DEBUG(">>>>>>>>>>>>>>> all zeros");
223  if (aa !="B" && aa != "X" && aa != "Z")
224  {
225  //profile[i][temp_profile_index]=-log(C+(1-C)*background[re_candidate[temp_profile_index]]);
226  int32_t aa_index = aa_to_index[(int)aa.c_str()[0]];
227  double value = -1* std::log(C+(1-C)*background[aa_index]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
228  SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index]);
229  curr_profile[(i*20) + aa_index] = value;
230  SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value);
231 
232  /*
233  for (int z=0; z <20; ++z)
234  {
235  SG_DEBUG(" %d \t %f\t", z, curr_profile[z]);
236  }
237  SG_DEBUG("\n");
238  */
239  }
240  }
241  }
242  }
243 
244  if (curr_profile.size() != 20 * sequence.length())
245  {
246  SG_ERROR("Something's wrong with the profile.\n");
247  break;
248  }
249 
250  seqs.push_back(sequence);
251 
252 
253  /*
254  // 6 irrelevant lines
255  for (int i=0; i < 6; ++i)
256  {
257  std::getline(fin, line);
258  }
259  //
260  */
261  }
262  }
263 
264  fin.close();
265 
266  nof_sequences = seqs.size();
268 
269  int max_len = 0;
270  for (int i=0; i < nof_sequences; ++i)
271  {
272  int len = seqs[i].length();
273  sequences[i].string = SG_MALLOC(char, len+1);
274  sequences[i].slen = len;
275  strcpy(sequences[i].string, seqs[i].c_str());
276 
277  if (len > max_len) max_len = len;
278  }
279 
280  max_sequence_length = max_len;
281  //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
282 
283 }
284 
285 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r)
286 {
287  // >> profile
288 /*
289  read_profiles_and_sequences();
290  l = string_features;
291  r = string_features;
292  */
293  // << profile
294 
295  int32_t lhs_changed=(lhs!=l);
296  int32_t rhs_changed=(rhs!=r);
297 
299 
300  SG_DEBUG("lhs_changed: %i\n", lhs_changed);
301  SG_DEBUG("rhs_changed: %i\n", rhs_changed);
302 
305 
307  alphabet=sf_l->get_alphabet();
308  CAlphabet* ralphabet=sf_r->get_alphabet();
309 
310  if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
311  properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
312 
313  ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
314  SG_UNREF(ralphabet);
315 
316 
317  return init_normalizer();
318 }
319 
321 {
322 
324  alphabet=NULL;
325 
327 }
328 
329 inline bool isaa(char c)
330 {
331  if (c<65 || c>89 || c=='B' || c=='J' || c=='O' || c=='U' || c=='X' || c=='Z')
332  return false ;
333  return true ;
334 }
335 
336 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index)
337 {
338  //const char* AA = "ARNDCQEGHILKMFPSTWYV";
339  float64_t diff=0.0 ;
340 
341  for (int i=0; i<seq_degree; i++)
342  {
343  if (!isaa(path[i])||!isaa(joint_seq[index+i]))
344  diff+=1.4 ;
345  else
346  {
347  diff += AA_matrix[ (path[i]-1)*128 + path[i] - 1] ;
348  diff -= 2*AA_matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
349  diff += AA_matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
350  if (CMath::is_nan(diff))
351  fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ;
352  }
353  }
354 
355  return exp( - diff/width) ;
356 }
357 
358 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b)
359 {
360  int32_t alen, blen;
361  bool afree, bfree;
362 
363  char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree);
364  char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree);
365 
366  float64_t result=0;
367  for (int32_t i=0; i<alen; i++)
368  {
369  for (int32_t j=0; j<blen; j++)
370  {
371  if ((i+degree<=alen) && (j+degree<=blen))
372  result += AA_helper(&(avec[i]), degree, bvec, j) ;
373  }
374  }
375 
376  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree);
377  ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
378  return result;
379 }
380 
382  float64_t* AA_matrix_)
383 {
384 
385  if (AA_matrix_)
386  {
387  SG_DEBUG("Setting AA_matrix\n") ;
388  memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
389  return true ;
390  }
391 
392  return false;
393 }
394 
396 {
397  SG_ADD(&degree, "degree", "degree of the kernel", MS_AVAILABLE);
398  SG_ADD(&AA_matrix_length, "AA_matrix_length", "the length of AA matrix",
400  m_parameters->add_vector(&AA_matrix, &AA_matrix_length, "AA_matrix", "128*128 scalar product matrix");
401  SG_ADD(&width, "width", "width of Gaussian", MS_AVAILABLE);
402  SG_ADD(&nof_sequences, "nof_sequences", "length of the sequence",
404  m_parameters->add_vector(&sequences, &nof_sequences, "the sequences as a part of profile");
406  "max_sequence_length", "max length of the sequence", MS_NOT_AVAILABLE);
407 }
408 
410 {
411  SG_ADD((CSGObject**)&alphabet, "alphabet", "the alphabet used by kernel",
413 }
414 
415 void CSpectrumRBFKernel::init()
416 {
417  alphabet = NULL;
418  degree = 0;
419  AA_matrix = NULL;
420  AA_matrix_length = 128*128;
421  width = 0.0;
422  sequences = NULL;
423  string_features = NULL;
424  nof_sequences = 0;
426 
427  initialized = false;
428 
429  max_mismatch = 0;
430  target_letter_0 = 0;
431 }

SHOGUN Machine Learning Toolbox - Documentation