SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
LocalAlignmentStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * Compute the local alignment kernel
3  *
4  * Largely based on LAkernel.c (version 0.3)
5  *
6  * Copyright 2003 Jean-Philippe Vert
7  * Copyright 2005 Jean-Philippe Vert, Hiroto Saigo
8  *
9  * Shogun specific adjustments Written (W) 2007-2008,2010 Soeren Sonnenburg
10  *
11  * Reference:
12  * H. Saigo, J.-P. Vert, T. Akutsu and N. Ueda, "Protein homology
13  * detection using string alignment kernels", Bioinformatics,
14  * vol.20, p.1682-1689, 2004.
15  *
16  * This program is free software; you can redistribute it and/or modify
17  * it under the terms of the GNU General Public License as published by
18  * the Free Software Foundation; either version 3 of the License, or
19  * (at your option) any later version.
20  */
21 
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <math.h>
25 #include <ctype.h>
26 #include <string.h>
28 
29 using namespace shogun;
30 
31 /****************/
32 /* The alphabet */
33 /****************/
34 
35 const int32_t NAA=20; /* Number of amino-acids */
36 const int32_t NLET=26; /* Number of letters in the alphabet */
37 const char* CLocalAlignmentStringKernel::aaList="ARNDCQEGHILKMFPSTWYV"; /* The list of amino acids */
38 
39 /*****************/
40 /* SW parameters */
41 /*****************/
42 
43 /* mutation matrix */
44 const int32_t CLocalAlignmentStringKernel::blosum[] = {
45  6,
46  -2, 8,
47  -2, -1, 9,
48  -3, -2, 2, 9,
49  -1, -5, -4, -5, 13,
50  -1, 1, 0, 0, -4, 8,
51  -1, 0, 0, 2, -5, 3, 7,
52  0, -3, -1, -2, -4, -3, -3, 8,
53  -2, 0, 1, -2, -4, 1, 0, -3, 11,
54  -2, -5, -5, -5, -2, -4, -5, -6, -5, 6,
55  -2, -3, -5, -5, -2, -3, -4, -5, -4, 2, 6,
56  -1, 3, 0, -1, -5, 2, 1, -2, -1, -4, -4, 7,
57  -1, -2, -3, -5, -2, -1, -3, -4, -2, 2, 3, -2, 8,
58  -3, -4, -5, -5, -4, -5, -5, -5, -2, 0, 1, -5, 0, 9,
59  -1, -3, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -4, -5, 11,
60  2, -1, 1, 0, -1, 0, 0, 0, -1, -4, -4, 0, -2, -4, -1, 6,
61  0, -2, 0, -2, -1, -1, -1, -2, -3, -1, -2, -1, -1, -3, -2, 2, 7,
62  -4, -4, -6, -6, -3, -3, -4, -4, -4, -4, -2, -4, -2, 1, -6, -4, -4, 16,
63  -3, -3, -3, -5, -4, -2, -3, -5, 3, -2, -2, -3, -1, 4, -4, -3, -2, 3, 10,
64  0, -4, -4, -5, -1, -3, -4, -5, -5, 4, 1, -3, 1, -1, -4, -2, 0, -4, -2, 6};
65 
66 /* Index corresponding to the (i,j) entry (i,j=0..19) in the blosum matrix */
67 static int32_t BINDEX(int32_t i, int32_t j)
68 {
69  return (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2));
70 }
71 
72 /*********************
73  * Kernel parameters *
74  *********************/
75 
76 const float64_t SCALING=0.1; /* Factor to scale all SW parameters */
77 
78 /* If you want to compute the sum over all local alignments (to get a valid kernel), uncomment the following line : */
79 /* If x=log(a) and y=log(b), compute log(a+b) : */
80 /*
81 #define LOGP(x,y) (((x)>(y))?(x)+log1p(exp((y)-(x))):(y)+log1p(exp((x)-(y))))
82 */
83 
84 #define LOGP(x,y) LogSum(x,y)
85 
86 /* OR if you want to compute the score of the best local alignment (to get the SW score by Viterbi), uncomment the following line : */
87 /*
88 #define LOGP(x,y) (((x)>(y))?(x):(y))
89 */
90 
91 /* Usefule constants */
92 const float64_t LOG0=-10000; /* log(0) */
93 const float64_t INTSCALE=1000.0; /* critical for speed and precise computation*/
94 
96 
98 : CStringKernel<char>(size)
99 {
100  SG_UNSTABLE("LocalAlignmentStringKernel");
101  init();
102  init_static_variables();
103 }
104 
107  float64_t opening, float64_t extension)
108 : CStringKernel<char>()
109 {
110  SG_UNSTABLE("LocalAlignmentStringKernel");
111  init();
112  m_opening=opening;
113  m_extension=extension;
114  init_static_variables();
115  init(l, r);
116 }
117 
119 {
120  cleanup();
121 }
122 
123 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
124 {
126  initialized=true;
127  return init_normalizer();
128 }
129 
131 {
133  scaled_blosum=NULL;
134 
135  SG_FREE(isAA);
136  isAA=NULL;
137  SG_FREE(aaIndex);
138  aaIndex=NULL;
139 
141 }
142 
143 /* LogSum - default log funciotion. fast, but not exact */
144 /* LogSum2 - precise, but slow. Note that these two functions need different figure types */
145 
146 void CLocalAlignmentStringKernel::init_logsum(){
147  int32_t i;
148  for (i=0; i<LOGSUM_TBL; i++)
149  logsum_lookup[i]=int32_t(INTSCALE*
150  (log(1.+exp((float32_t)-i/INTSCALE))));
151 }
152 
153 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2)
154 {
155  int32_t diff;
156  static int32_t firsttime=1;
157 
158  if (firsttime)
159  {
160  init_logsum();
161  firsttime =0;
162  }
163 
164  diff=p1-p2;
165  if (diff>=LOGSUM_TBL)
166  return p1;
167  else if (diff<=-LOGSUM_TBL)
168  return p2;
169  else if (diff>0)
170  return p1+logsum_lookup[diff];
171  else
172  return p2+logsum_lookup[-diff];
173 }
174 
175 
176 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2)
177 {
178  if (p1>p2)
179  return (p1-p2>50.) ? p1 : p1+log(1.+exp(p2-p1));
180  else
181  return (p2-p1>50.) ? p2 : p2+log(1.+exp(p1-p2));
182 }
183 
184 
185 void CLocalAlignmentStringKernel::init_static_variables()
186  /* Initialize all static variables. This function should be called once before computing the first pair HMM score */
187 {
188  register int32_t i;
189 
190  /* Initialization of the array which gives the position of each amino-acid in the set of amino-acid */
191  if ((aaIndex=(int32_t *)calloc(NLET,sizeof(int32_t)))==NULL)
192  SG_ERROR("run out o memory");
193  for (i=0;i<NAA;i++)
194  aaIndex[aaList[i]-'A']=i;
195 
196  /* Initialization of the array which indicates whether a char is an amino-acid */
197  if ((isAA=(int32_t *)calloc(256,sizeof(int32_t)))==NULL)
198  SG_ERROR("run out of memory");
199  for (i=0;i<NAA;i++)
200  isAA[(int32_t)aaList[i]]=1;
201 
202  /* Scale the blossum matrix */
203  for (i=0 ; i<NAA*(NAA+1)/2; i++)
204  scaled_blosum[i]=(int32_t)floor(blosum[i]*SCALING*INTSCALE);
205 
206 
207  /* Scale of gap penalties */
208  m_opening=(int32_t)floor(m_opening*SCALING*INTSCALE);
209  m_extension=(int32_t)floor(m_extension*SCALING*INTSCALE);
210 }
211 
212 
213 
214 /* Implementation of the
215  * convolution kernel which generalizes the Smith-Waterman algorithm
216  */
217 float64_t CLocalAlignmentStringKernel::LAkernelcompute(
218  int32_t* aaX, int32_t* aaY, /* the two amino-acid sequences (as sequences of indexes in [0..NAA-1] indicating the position of the amino-acid in the variable 'aaList') */
219  int32_t nX, int32_t nY /* the lengths of both sequences */)
220 {
221  register int32_t
222  i,j, /* loop indexes */
223  cur, old, /* to indicate the array to use (0 or 1) */
224  curpos, frompos; /* position in an array */
225 
226  int32_t
227  *logX, /* arrays to store the log-values of each state */
228  *logY,
229  *logM,
230  *logX2,
231  *logY2;
232 
233  int32_t aux, aux2;/* , aux3 , aux4 , aux5;*/
234  int32_t cl;/* length of a column for the dynamic programming */
235 
236  /*
237  printf("now computing pairHMM between %d and %d:\n",nX,nY);
238  for (i=0;i<nX;printf("%d ",aaX[i++]));
239  printf("\n and \n");
240  for (i=0;i<nY;printf("%d ",aaY[i++]));
241  printf("\n");
242  */
243 
244  /* Initialization of the arrays */
245  /* Each array stores two successive columns of the (nX+1)x(nY+1) table used in dynamic programming */
246  cl=nY+1; /* each column stores the positions in the aaY sequence, plus a position at zero */
247 
248  logM=SG_CALLOC(int32_t, 2*cl);
249  logX=SG_CALLOC(int32_t, 2*cl);
250  logY=SG_CALLOC(int32_t, 2*cl);
251  logX2=SG_CALLOC(int32_t, 2*cl);
252  logY2=SG_CALLOC(int32_t, 2*cl);
253 
254  /************************************************/
255  /* First iteration : initialization of column 0 */
256  /************************************************/
257  /* The log=proabilities of each state are initialized for the first column (x=0,y=0..nY) */
258 
259  for (j=0; j<cl; j++) {
260  logM[j]=LOG0;
261  logX[j]=LOG0;
262  logY[j]=LOG0;
263  logX2[j]=LOG0;
264  logY2[j]=LOG0;
265  }
266 
267  /* Update column order */
268  cur=1; /* Indexes [0..cl-1] are used to process the next column */
269  old=0; /* Indexes [cl..2*cl-1] were used for column 0 */
270 
271 
272  /************************************************/
273  /* Next iterations : processing columns 1 .. nX */
274  /************************************************/
275 
276  /* Main loop to vary the position in aaX : i=1..nX */
277  for (i=1; i<=nX; i++) {
278 
279  /* Special update for positions (i=1..nX,j=0) */
280  curpos=cur*cl; /* index of the state (i,0) */
281  logM[curpos]=LOG0;
282  logX[curpos]=LOG0;
283  logY[curpos]=LOG0;
284  logX2[curpos]=LOG0;
285  logY2[curpos]=LOG0;
286 
287  /* Secondary loop to vary the position in aaY : j=1..nY */
288  for (j=1; j<=nY; j++) {
289 
290  curpos=cur*cl+j; /* index of the state (i,j) */
291 
292  /* Update for states which emit X only */
293  /***************************************/
294 
295  frompos=old*cl+j; /* index of the state (i-1,j) */
296 
297  /* State RX */
298  logX[curpos]=LOGP(-m_opening+logM[frompos], -m_extension+logX[frompos]);
299  /* printf("%.5f\n",logX[curpos]);*/
300  /* printf("%.5f\n",logX_B[curpos]);*/
301  /* State RX2 */
302  logX2[curpos]=LOGP(logM[frompos], logX2[frompos]);
303 
304  /* Update for states which emit Y only */
305  /***************************************/
306 
307  frompos=cur*cl+j-1; /* index of the state (i,j-1) */
308 
309  /* State RY */
310  aux=LOGP(-m_opening+logM[frompos], -m_extension+logY[frompos]);
311  logY[curpos] = LOGP(aux, -m_opening+logX[frompos]);
312 
313  /* State RY2 */
314  aux=LOGP(logM[frompos], logY2[frompos]);
315  logY2[curpos]=LOGP(aux, logX2[frompos]);
316 
317  /* Update for states which emit X and Y */
318  /****************************************/
319 
320  frompos=old*cl+j-1; /* index of the state (i-1,j-1) */
321 
322  aux=LOGP(logX[frompos], logY[frompos]);
323  aux2=LOGP(0, logM[frompos]);
324  logM[curpos]=LOGP(aux, aux2)+scaled_blosum[BINDEX(aaX[i-1], aaY[j-1])];
325 
326  /*
327  printf("i=%d , j=%d\nM=%.5f\nX=%.5f\nY=%.5f\nX2=%.5f\nY2=%.5f\n",i,j,logM[curpos],logX[curpos],logY[curpos],logX2[curpos],logY2[curpos]);
328  */
329 
330  } /* end of j=1:nY loop */
331 
332 
333  /* Update the culumn order */
334  cur=1-cur;
335  old=1-old;
336 
337  } /* end of j=1:nX loop */
338 
339 
340  /* Termination */
341  /***************/
342 
343  curpos=old*cl+nY; /* index of the state (nX,nY) */
344  aux=LOGP(logX2[curpos], logY2[curpos]);
345  aux2=LOGP(0, logM[curpos]);
346  /* kernel_value = LOGP( aux , aux2 );*/
347 
348  /* Memory release */
349  SG_FREE(logM);
350  SG_FREE(logX);
351  SG_FREE(logY);
352  SG_FREE(logX2);
353  SG_FREE(logY2);
354 
355  /* Return the logarithm of the kernel */
356  return (float32_t)LOGP(aux,aux2)/INTSCALE;
357 }
358 
359 /********************/
360 /* Public functions */
361 /********************/
362 
363 
364 /* Return the log-probability of two sequences x and y under a pair HMM model */
365 /* x and y are strings of aminoacid letters, e.g., "AABRS" */
366 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y)
367 {
368  int32_t *aax, *aay; /* to convert x and y into sequences of amino-acid indexes */
369  int32_t lx=0, ly=0; /* lengths of x and y */
370  int32_t i, j;
371 
372  bool free_x, free_y;
373  char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x);
374  char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y);
375  ASSERT(x && y);
376 
377  if ( (lx<1) || (ly<1) )
378  SG_ERROR("empty chain");
379 
380  /* Create aax and aay */
381 
382  if ((aax=(int32_t *)calloc(lx,sizeof(int32_t)))==NULL)
383  SG_ERROR("run out of memory");
384  if ((aay=(int32_t *)calloc(ly,sizeof(int32_t)))==NULL)
385  SG_ERROR("run out of memory");
386 
387  /* Extract the characters corresponding to aminoacids and keep their indexes */
388 
389  j=0;
390  for (i=0; i<lx; i++)
391  if (isAA[toupper(x[i])])
392  aax[j++]=aaIndex[toupper(x[i])-'A'];
393  lx=j;
394  j=0;
395  for (i=0; i<ly; i++)
396  if (isAA[toupper(y[i])])
397  aay[j++]=aaIndex[toupper(y[i])-'A'];
398  ly=j;
399 
400 
401  /* Compute the pair HMM score */
402  float64_t result=LAkernelcompute(aax, aay, lx, ly);
403 
404  /* Release memory */
405  SG_FREE(aax);
406  SG_FREE(aay);
407 
408  ((CStringFeatures<char>*)lhs)->free_feature_vector(x, idx_x, free_x);
409  ((CStringFeatures<char>*)rhs)->free_feature_vector(y, idx_y, free_y);
410 
411  return result;
412 }
413 
414 void CLocalAlignmentStringKernel::init()
415 {
416  initialized=false;
417  isAA=NULL;
418  aaIndex=NULL;
419 
420  m_opening=10;
421  m_extension=2;
422 
423  scaled_blosum=SG_CALLOC(int32_t, sizeof(blosum));
424  init_logsum();
425 
426  SG_ADD(&initialized, "initialized", "If kernel is initalized.",
428  SG_ADD(&m_opening, "opening", "Opening gap opening penalty.", MS_AVAILABLE);
429  SG_ADD(&m_extension, "extension", "Extension gap extension penalty.",
430  MS_AVAILABLE);
431 }

SHOGUN Machine Learning Toolbox - Documentation