SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
SpectrumRBFKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #include <vector>
13 
14 #include <shogun/lib/common.h>
15 #include <shogun/io/SGIO.h>
16 #include <shogun/lib/Signal.h>
17 #include <shogun/lib/Trie.h>
18 #include <shogun/base/Parallel.h>
19 
23 #include <math.h>
24 
25 #include <vector>
26 #include <string>
27 #include <fstream>
28 #include <cmath>
29 
30 #include <assert.h>
31 
32 #ifdef HAVE_PTHREAD
33 #include <pthread.h>
34 #endif
35 
36 
37 using namespace shogun;
38 
40  : CStringKernel<char>(0)
41 {
42  init();
44 }
45 
46 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_)
47  : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
48 {
49  init();
51 
52  target_letter_0=-1 ;
53 
55 
56  memcpy(AA_matrix.matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
57 
59  SGStringList<char> string_list;
60  string_list.strings = sequences;
61  string_list.num_strings = nof_sequences;
63 
64  //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
68 }
69 
71  CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_)
72 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
73 {
74  target_letter_0=-1 ;
75 
77  memcpy(AA_matrix.matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
78 
79  init(l, r);
81 }
82 
84 {
85  cleanup();
87  SG_FREE(sequences);
88 }
89 
91 {
92 
93  int32_t aa_to_index[128];//profile
94  aa_to_index[(uint8_t) 'A'] = 0;
95  aa_to_index[(uint8_t) 'R'] = 1;
96  aa_to_index[(uint8_t) 'N'] = 2;
97  aa_to_index[(uint8_t) 'D'] = 3;
98  aa_to_index[(uint8_t) 'C'] = 4;
99  aa_to_index[(uint8_t) 'Q'] = 5;
100  aa_to_index[(uint8_t) 'E'] = 6;
101  aa_to_index[(uint8_t) 'G'] = 7;
102  aa_to_index[(uint8_t) 'H'] = 8;
103  aa_to_index[(uint8_t) 'I'] = 9;
104  aa_to_index[(uint8_t) 'L'] = 10;
105  aa_to_index[(uint8_t) 'K'] = 11;
106  aa_to_index[(uint8_t) 'M'] = 12;
107  aa_to_index[(uint8_t) 'F'] = 13;
108  aa_to_index[(uint8_t) 'P'] = 14;
109  aa_to_index[(uint8_t) 'S'] = 15;
110  aa_to_index[(uint8_t) 'T'] = 16;
111  aa_to_index[(uint8_t) 'W'] = 17;
112  aa_to_index[(uint8_t) 'Y'] = 18;
113  aa_to_index[(uint8_t) 'V'] = 19;
114  SG_DEBUG("initializing background\n")
115  double background[20]; // profile
116  background[0]=0.0799912015849807; //A
117  background[1]=0.0484482507611578;//R
118  background[2]=0.044293531582512;//N
119  background[3]=0.0578891399707563;//D
120  background[4]=0.0171846021407367;//C
121  background[5]=0.0380578923048682;//Q
122  background[6]=0.0638169929675978;//E
123  background[7]=0.0760659374742852;//G
124  background[8]=0.0223465499452473;//H
125  background[9]=0.0550905793661343;//I
126  background[10]=0.0866897071203864;//L
127  background[11]=0.060458245507428;//K
128  background[12]=0.0215379186368154;//M
129  background[13]=0.0396348024787477;//F
130  background[14]=0.0465746314476874;//P
131  background[15]=0.0630028230885602;//S
132  background[16]=0.0580394726014824;//T
133  background[17]=0.0144991866213453;//W
134  background[18]=0.03635438623143;//Y
135  background[19]=0.0700241481678408;//V
136 
137 
138  std::vector<std::string> seqs;
139  //int32_t nof_sequences = 7329;
140 
141  double C = 0.8;
142  const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles";
143  std::ifstream fin(filename);
144 
145  SG_DEBUG("Reading profiles from %s\n", filename)
146  std::string line;
147  while (!fin.eof())
148  {
149  std::getline(fin, line);
150 
151  if (line[0] == '>') // new sequence
152  {
153  int idx = line.find_first_of(' ');
154  sequence_labels.push_back(line.substr(1,idx-1));
155  std::getline(fin, line);
156  std::string orig_sequence = line;
157  std::string sequence="";
158 
159  int len_line = line.length();
160 
161  // skip 3 lines
162 
163  std::getline(fin, line);
164  std::getline(fin, line);
165  std::getline(fin, line);
166 
167  profiles.push_back(std::vector<double>());
168 
169  std::vector<double>& curr_profile = profiles.back();
170  for (int i=0; i < len_line; ++i)
171  {
172  std::getline(fin, line);
173  int a = line.find_first_not_of(' '); // index position
174  int b = line.find_first_of(' ', a); // index position
175  a = line.find_first_not_of(' ', b); // aa position
176  b = line.find_first_of(' ', a); // aa position
177  std::string aa=line.substr(a,b-a);
178  if (0) //(aa =="B" || aa == "X" || aa == "Z")
179  {
180  int pos = seqs.size()+1;
181  SG_DEBUG("Skipping aa in sequence %d\n", pos)
182  continue;
183  }
184  else
185  {
186  sequence += aa;
187 
188  a = line.find_first_not_of(' ', b); // beginning of block to ignore
189  b = line.find_first_of(' ', a); // aa position
190 
191  for (int j=0; j < 19; ++j)
192  {
193  a = line.find_first_not_of(' ', b);
194  b = line.find_first_of(' ', a);
195  }
196 
197  int all_zeros = 1;
198  // interesting block
199  for (int j=0; j < 20; ++j)
200  {
201  a = line.find_first_not_of(' ', b);
202  b = line.find_first_of(' ', a);
203  double p = atof(line.substr(a, b-a).c_str());
204  if (p > 0)
205  {
206  all_zeros = 0;
207  }
208  double value = -1* std::log(C*(p/100)+(1-C)*background[j]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
209  curr_profile.push_back(value);
210  //SG_DEBUG("seq %d aa %d value %f p %f bg %f\n", i, j, value,p, background[j])
211  }
212 
213  if (all_zeros)
214  {
215  SG_DEBUG(">>>>>>>>>>>>>>> all zeros")
216  if (aa !="B" && aa != "X" && aa != "Z")
217  {
218  //profile[i][temp_profile_index]=-log(C+(1-C)*background[re_candidate[temp_profile_index]]);
219  int32_t aa_index = aa_to_index[(int)aa.c_str()[0]];
220  double value = -1* std::log(C+(1-C)*background[aa_index]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
221  SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index])
222  curr_profile[(i*20) + aa_index] = value;
223  SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value)
224 
225  /*
226  for (int z=0; z <20; ++z)
227  {
228  SG_DEBUG(" %d \t %f\t", z, curr_profile[z])
229  }
230  SG_DEBUG("\n")
231  */
232  }
233  }
234  }
235  }
236 
237  if (curr_profile.size() != 20 * sequence.length())
238  {
239  SG_ERROR("Something's wrong with the profile.\n")
240  break;
241  }
242 
243  seqs.push_back(sequence);
244 
245 
246  /*
247  // 6 irrelevant lines
248  for (int i=0; i < 6; ++i)
249  {
250  std::getline(fin, line);
251  }
252  //
253  */
254  }
255  }
256 
257  fin.close();
258 
259  nof_sequences = seqs.size();
260  sequences = SG_MALLOC(SGString<char>, nof_sequences);
261 
262  int max_len = 0;
263  for (int i=0; i < nof_sequences; ++i)
264  {
265  int len = seqs[i].length();
266  sequences[i].string = SG_MALLOC(char, len+1);
267  sequences[i].slen = len;
268  strcpy(sequences[i].string, seqs[i].c_str());
269 
270  if (len > max_len) max_len = len;
271  }
272 
273  max_sequence_length = max_len;
274  //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
275 
276 }
277 
278 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r)
279 {
280  // >> profile
281 /*
282  read_profiles_and_sequences();
283  l = string_features;
284  r = string_features;
285  */
286  // << profile
287 
288  int32_t lhs_changed=(lhs!=l);
289  int32_t rhs_changed=(rhs!=r);
290 
292 
293  SG_DEBUG("lhs_changed: %i\n", lhs_changed)
294  SG_DEBUG("rhs_changed: %i\n", rhs_changed)
295 
298 
300  alphabet=sf_l->get_alphabet();
301  CAlphabet* ralphabet=sf_r->get_alphabet();
302 
303  if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
304  properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
305 
306  ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet())
307  SG_UNREF(ralphabet);
308 
309 
310  return init_normalizer();
311 }
312 
314 {
315 
317  alphabet=NULL;
318 
320 }
321 
322 inline bool isaa(char c)
323 {
324  if (c<65 || c>89 || c=='B' || c=='J' || c=='O' || c=='U' || c=='X' || c=='Z')
325  return false ;
326  return true ;
327 }
328 
329 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index)
330 {
331  //const char* AA = "ARNDCQEGHILKMFPSTWYV";
332  float64_t diff=0.0 ;
333 
334  for (int i=0; i<seq_degree; i++)
335  {
336  if (!isaa(path[i])||!isaa(joint_seq[index+i]))
337  diff+=1.4 ;
338  else
339  {
340  diff += AA_matrix.matrix[ (path[i]-1)*128 + path[i] - 1] ;
341  diff -= 2*AA_matrix.matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
342  diff += AA_matrix.matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
343  if (CMath::is_nan(diff))
344  fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ;
345  }
346  }
347 
348  return exp( - diff/width) ;
349 }
350 
351 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b)
352 {
353  int32_t alen, blen;
354  bool afree, bfree;
355 
356  char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree);
357  char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree);
358 
359  float64_t result=0;
360  for (int32_t i=0; i<alen; i++)
361  {
362  for (int32_t j=0; j<blen; j++)
363  {
364  if ((i+degree<=alen) && (j+degree<=blen))
365  result += AA_helper(&(avec[i]), degree, bvec, j) ;
366  }
367  }
368 
369  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree);
370  ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
371  return result;
372 }
373 
375  float64_t* AA_matrix_)
376 {
377 
378  if (AA_matrix_)
379  {
380  SG_DEBUG("Setting AA_matrix\n")
381  memcpy(AA_matrix.matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
382  return true ;
383  }
384 
385  return false;
386 }
387 
389 {
390  SG_ADD(&degree, "degree", "degree of the kernel", MS_AVAILABLE);
391  SG_ADD(&AA_matrix, "AA_matrix", "128*128 scalar product matrix", MS_NOT_AVAILABLE);
392  SG_ADD(&width, "width", "width of Gaussian", MS_AVAILABLE);
393  SG_ADD(&nof_sequences, "nof_sequences", "length of the sequence",
395  m_parameters->add_vector(&sequences, &nof_sequences, "sequences", "the sequences as a part of profile");
397  "max_sequence_length", "max length of the sequence", MS_NOT_AVAILABLE);
398 }
399 
401 {
402  SG_ADD((CSGObject**)&alphabet, "alphabet", "the alphabet used by kernel",
404 }
405 
406 void CSpectrumRBFKernel::init()
407 {
408  alphabet = NULL;
409  degree = 0;
410  width = 0.0;
411  sequences = NULL;
412  string_features = NULL;
413  nof_sequences = 0;
415 
416  initialized = false;
417 
418  max_mismatch = 0;
419  target_letter_0 = 0;
420 }

SHOGUN Machine Learning Toolbox - Documentation