SHOGUN  4.1.0
 全部  命名空间 文件 函数 变量 类型定义 枚举 枚举值 友元 宏定义  
SpectrumRBFKernel.cpp
浏览该文件的文档.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #include <vector>
13 
14 #include <shogun/lib/common.h>
15 #include <shogun/io/SGIO.h>
16 #include <shogun/lib/Signal.h>
17 #include <shogun/lib/Trie.h>
18 #include <shogun/base/Parallel.h>
19 
25 
26 #include <vector>
27 #include <string>
28 #include <fstream>
29 #include <cmath>
30 
31 #include <assert.h>
32 
33 #ifdef HAVE_PTHREAD
34 #include <pthread.h>
35 #endif
36 
37 
38 using namespace shogun;
39 
41  : CStringKernel<char>(0)
42 {
43  init();
45 }
46 
47 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_)
48  : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
49 {
50  init();
52 
53  target_letter_0=-1 ;
54 
56 
57  memcpy(AA_matrix.matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
58 
60  SGStringList<char> string_list;
61  string_list.strings = sequences;
62  string_list.num_strings = nof_sequences;
64 
65  //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
69 }
70 
72  CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_)
73 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
74 {
75  target_letter_0=-1 ;
76 
78  memcpy(AA_matrix.matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
79 
80  init(l, r);
82 }
83 
85 {
86  cleanup();
88  SG_FREE(sequences);
89 }
90 
92 {
93 
94  int32_t aa_to_index[128];//profile
95  aa_to_index[(uint8_t) 'A'] = 0;
96  aa_to_index[(uint8_t) 'R'] = 1;
97  aa_to_index[(uint8_t) 'N'] = 2;
98  aa_to_index[(uint8_t) 'D'] = 3;
99  aa_to_index[(uint8_t) 'C'] = 4;
100  aa_to_index[(uint8_t) 'Q'] = 5;
101  aa_to_index[(uint8_t) 'E'] = 6;
102  aa_to_index[(uint8_t) 'G'] = 7;
103  aa_to_index[(uint8_t) 'H'] = 8;
104  aa_to_index[(uint8_t) 'I'] = 9;
105  aa_to_index[(uint8_t) 'L'] = 10;
106  aa_to_index[(uint8_t) 'K'] = 11;
107  aa_to_index[(uint8_t) 'M'] = 12;
108  aa_to_index[(uint8_t) 'F'] = 13;
109  aa_to_index[(uint8_t) 'P'] = 14;
110  aa_to_index[(uint8_t) 'S'] = 15;
111  aa_to_index[(uint8_t) 'T'] = 16;
112  aa_to_index[(uint8_t) 'W'] = 17;
113  aa_to_index[(uint8_t) 'Y'] = 18;
114  aa_to_index[(uint8_t) 'V'] = 19;
115  SG_DEBUG("initializing background\n")
116  double background[20]; // profile
117  background[0]=0.0799912015849807; //A
118  background[1]=0.0484482507611578;//R
119  background[2]=0.044293531582512;//N
120  background[3]=0.0578891399707563;//D
121  background[4]=0.0171846021407367;//C
122  background[5]=0.0380578923048682;//Q
123  background[6]=0.0638169929675978;//E
124  background[7]=0.0760659374742852;//G
125  background[8]=0.0223465499452473;//H
126  background[9]=0.0550905793661343;//I
127  background[10]=0.0866897071203864;//L
128  background[11]=0.060458245507428;//K
129  background[12]=0.0215379186368154;//M
130  background[13]=0.0396348024787477;//F
131  background[14]=0.0465746314476874;//P
132  background[15]=0.0630028230885602;//S
133  background[16]=0.0580394726014824;//T
134  background[17]=0.0144991866213453;//W
135  background[18]=0.03635438623143;//Y
136  background[19]=0.0700241481678408;//V
137 
138 
139  std::vector<std::string> seqs;
140  //int32_t nof_sequences = 7329;
141 
142  double C = 0.8;
143  const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles";
144  std::ifstream fin(filename);
145 
146  SG_DEBUG("Reading profiles from %s\n", filename)
147  std::string line;
148  while (!fin.eof())
149  {
150  std::getline(fin, line);
151 
152  if (line[0] == '>') // new sequence
153  {
154  int idx = line.find_first_of(' ');
155  sequence_labels.push_back(line.substr(1,idx-1));
156  std::getline(fin, line);
157  std::string orig_sequence = line;
158  std::string sequence="";
159 
160  int len_line = line.length();
161 
162  // skip 3 lines
163 
164  std::getline(fin, line);
165  std::getline(fin, line);
166  std::getline(fin, line);
167 
168  profiles.push_back(std::vector<double>());
169 
170  std::vector<double>& curr_profile = profiles.back();
171  for (int i=0; i < len_line; ++i)
172  {
173  std::getline(fin, line);
174  int a = line.find_first_not_of(' '); // index position
175  int b = line.find_first_of(' ', a); // index position
176  a = line.find_first_not_of(' ', b); // aa position
177  b = line.find_first_of(' ', a); // aa position
178  std::string aa=line.substr(a,b-a);
179  if (0) //(aa =="B" || aa == "X" || aa == "Z")
180  {
181  int pos = seqs.size()+1;
182  SG_DEBUG("Skipping aa in sequence %d\n", pos)
183  continue;
184  }
185  else
186  {
187  sequence += aa;
188 
189  a = line.find_first_not_of(' ', b); // beginning of block to ignore
190  b = line.find_first_of(' ', a); // aa position
191 
192  for (int j=0; j < 19; ++j)
193  {
194  a = line.find_first_not_of(' ', b);
195  b = line.find_first_of(' ', a);
196  }
197 
198  int all_zeros = 1;
199  // interesting block
200  for (int j=0; j < 20; ++j)
201  {
202  a = line.find_first_not_of(' ', b);
203  b = line.find_first_of(' ', a);
204  double p = atof(line.substr(a, b-a).c_str());
205  if (p > 0)
206  {
207  all_zeros = 0;
208  }
209  double value = -1* std::log(C*(p/100)+(1-C)*background[j]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
210  curr_profile.push_back(value);
211  //SG_DEBUG("seq %d aa %d value %f p %f bg %f\n", i, j, value,p, background[j])
212  }
213 
214  if (all_zeros)
215  {
216  SG_DEBUG(">>>>>>>>>>>>>>> all zeros")
217  if (aa !="B" && aa != "X" && aa != "Z")
218  {
219  //profile[i][temp_profile_index]=-log(C+(1-C)*background[re_candidate[temp_profile_index]]);
220  int32_t aa_index = aa_to_index[(int)aa.c_str()[0]];
221  double value = -1* std::log(C+(1-C)*background[aa_index]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
222  SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index])
223  curr_profile[(i*20) + aa_index] = value;
224  SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value)
225 
226  /*
227  for (int z=0; z <20; ++z)
228  {
229  SG_DEBUG(" %d \t %f\t", z, curr_profile[z])
230  }
231  SG_DEBUG("\n")
232  */
233  }
234  }
235  }
236  }
237 
238  if (curr_profile.size() != 20 * sequence.length())
239  {
240  SG_ERROR("Something's wrong with the profile.\n")
241  break;
242  }
243 
244  seqs.push_back(sequence);
245 
246 
247  /*
248  // 6 irrelevant lines
249  for (int i=0; i < 6; ++i)
250  {
251  std::getline(fin, line);
252  }
253  //
254  */
255  }
256  }
257 
258  fin.close();
259 
260  nof_sequences = seqs.size();
261  sequences = SG_MALLOC(SGString<char>, nof_sequences);
262 
263  int max_len = 0;
264  for (int i=0; i < nof_sequences; ++i)
265  {
266  int len = seqs[i].length();
267  sequences[i].string = SG_MALLOC(char, len+1);
268  sequences[i].slen = len;
269  strcpy(sequences[i].string, seqs[i].c_str());
270 
271  if (len > max_len) max_len = len;
272  }
273 
274  max_sequence_length = max_len;
275  //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
276 
277 }
278 
279 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r)
280 {
281  // >> profile
282 /*
283  read_profiles_and_sequences();
284  l = string_features;
285  r = string_features;
286  */
287  // << profile
288 
289  int32_t lhs_changed=(lhs!=l);
290  int32_t rhs_changed=(rhs!=r);
291 
293 
294  SG_DEBUG("lhs_changed: %i\n", lhs_changed)
295  SG_DEBUG("rhs_changed: %i\n", rhs_changed)
296 
299 
301  alphabet=sf_l->get_alphabet();
302  CAlphabet* ralphabet=sf_r->get_alphabet();
303 
304  if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
305  properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
306 
307  ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet())
308  SG_UNREF(ralphabet);
309 
310 
311  return init_normalizer();
312 }
313 
315 {
316 
318  alphabet=NULL;
319 
321 }
322 
/** Tell whether a character is one of the 20 upper-case amino-acid letters
 * accepted by this kernel (A..Y without B, J, O, U and X).
 *
 * @param c character to classify
 * @return true if c is an accepted amino-acid letter, false otherwise
 */
inline bool isaa(char c)
{
    // everything outside 'A'..'Y' (ASCII 65..89) is rejected, which
    // already covers 'Z' and all lower-case letters
    if (c < 'A' || c > 'Y')
        return false;

    switch (c)
    {
        // letters inside the range that are not accepted
        case 'B':
        case 'J':
        case 'O':
        case 'U':
        case 'X':
            return false;
        default:
            return true;
    }
}
329 
330 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index)
331 {
332  //const char* AA = "ARNDCQEGHILKMFPSTWYV";
333  float64_t diff=0.0 ;
334 
335  for (int i=0; i<seq_degree; i++)
336  {
337  if (!isaa(path[i])||!isaa(joint_seq[index+i]))
338  diff+=1.4 ;
339  else
340  {
341  diff += AA_matrix.matrix[ (path[i]-1)*128 + path[i] - 1] ;
342  diff -= 2*AA_matrix.matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
343  diff += AA_matrix.matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
344  if (CMath::is_nan(diff))
345  fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ;
346  }
347  }
348 
349  return exp( - diff/width) ;
350 }
351 
352 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b)
353 {
354  int32_t alen, blen;
355  bool afree, bfree;
356 
357  char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree);
358  char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree);
359 
360  float64_t result=0;
361  for (int32_t i=0; i<alen; i++)
362  {
363  for (int32_t j=0; j<blen; j++)
364  {
365  if ((i+degree<=alen) && (j+degree<=blen))
366  result += AA_helper(&(avec[i]), degree, bvec, j) ;
367  }
368  }
369 
370  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree);
371  ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
372  return result;
373 }
374 
376  float64_t* AA_matrix_)
377 {
378 
379  if (AA_matrix_)
380  {
381  SG_DEBUG("Setting AA_matrix\n")
382  memcpy(AA_matrix.matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
383  return true ;
384  }
385 
386  return false;
387 }
388 
390 {
391  SG_ADD(&degree, "degree", "degree of the kernel", MS_AVAILABLE);
392  SG_ADD(&AA_matrix, "AA_matrix", "128*128 scalar product matrix", MS_NOT_AVAILABLE);
393  SG_ADD(&width, "width", "width of Gaussian", MS_AVAILABLE);
394  SG_ADD(&nof_sequences, "nof_sequences", "length of the sequence",
396  m_parameters->add_vector(&sequences, &nof_sequences, "sequences", "the sequences as a part of profile");
398  "max_sequence_length", "max length of the sequence", MS_NOT_AVAILABLE);
399 }
400 
402 {
403  SG_ADD((CSGObject**)&alphabet, "alphabet", "the alphabet used by kernel",
405 }
406 
407 void CSpectrumRBFKernel::init()
408 {
409  alphabet = NULL;
410  degree = 0;
411  width = 0.0;
412  sequences = NULL;
413  string_features = NULL;
414  nof_sequences = 0;
416 
417  initialized = false;
418 
419  max_mismatch = 0;
420  target_letter_0 = 0;
421 }
RNA - letters A,C,G,U.
Definition: Alphabet.h:32
virtual void cleanup()
Definition: Kernel.cpp:173
template class SGStringList
Definition: SGObject.h:40
DNA - letters A,C,G,T.
Definition: Alphabet.h:26
virtual bool init(CFeatures *l, CFeatures *r)
EAlphabet get_alphabet() const
Definition: Alphabet.h:130
#define SG_ERROR(...)
Definition: SGIO.h:129
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:91
Parameter * m_parameters
Definition: SGObject.h:378
float64_t AA_helper(const char *path, const int degree, const char *joint_seq, unsigned int index)
CStringFeatures< char > * string_features
uint64_t properties
Definition: Kernel.h:1082
IUPAC_AMINO_ACID.
Definition: Alphabet.h:53
bool set_AA_matrix(float64_t *AA_matrix_)
#define SG_REF(x)
Definition: SGObject.h:51
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:112
SGString< char > * sequences
std::vector< std::vector< float64_t > > profiles
double float64_t
Definition: common.h:50
virtual bool init_normalizer()
Definition: Kernel.cpp:168
bool isaa(char c)
CFeatures * rhs
feature vectors to occur on right hand side
Definition: Kernel.h:1061
#define SG_UNREF(x)
Definition: SGObject.h:52
void add_vector(bool **param, index_t *length, const char *name, const char *description="")
Definition: Parameter.cpp:334
#define SG_DEBUG(...)
Definition: SGIO.h:107
All classes and functions are contained in the shogun namespace
Definition: class_list.h:18
CFeatures * lhs
feature vectors to occur on left hand side
Definition: Kernel.h:1059
static int is_nan(double f)
checks whether a float is nan
Definition: Math.cpp:250
The class Features is the base class of all feature objects.
Definition: Features.h:68
SGString< T > * strings
Definition: SGStringList.h:88
index_t slen
Definition: SGString.h:79
#define SG_ADD(...)
Definition: SGObject.h:81
float64_t compute(int32_t idx_a, int32_t idx_b)
Template class StringKernel, is the base class of all String Kernels.
Definition: StringKernel.h:26
SGMatrix< float64_t > AA_matrix
std::vector< std::string > sequence_labels

SHOGUN 机器学习工具包 - 项目文档