SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
LocalAlignmentStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * Compute the local alignment kernel
3  *
4  * Largely based on LAkernel.c (version 0.3)
5  *
6  * Copyright 2003 Jean-Philippe Vert
7  * Copyright 2005 Jean-Philippe Vert, Hiroto Saigo
8  *
9  * Shogun specific adjustments Written (W) 2007-2008,2010 Soeren Sonnenburg
10  *
11  * Reference:
12  * H. Saigo, J.-P. Vert, T. Akutsu and N. Ueda, "Protein homology
13  * detection using string alignment kernels", Bioinformatics,
14  * vol.20, p.1682-1689, 2004.
15  *
16  * This program is free software; you can redistribute it and/or modify
17  * it under the terms of the GNU General Public License as published by
18  * the Free Software Foundation; either version 3 of the License, or
19  * (at your option) any later version.
20  */
21 
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <ctype.h>
25 #include <string.h>
29 
30 using namespace shogun;
31 
32 /****************/
33 /* The alphabet */
34 /****************/
35 
36 const int32_t NAA=20; /* Number of amino-acids */
37 const int32_t NLET=26; /* Number of letters in the alphabet */
38 const char* CLocalAlignmentStringKernel::aaList="ARNDCQEGHILKMFPSTWYV"; /* The list of amino acids */
39 
40 /*****************/
41 /* SW parameters */
42 /*****************/
43 
44 /* mutation matrix */
45 const int32_t CLocalAlignmentStringKernel::blosum[] = {
46  6,
47  -2, 8,
48  -2, -1, 9,
49  -3, -2, 2, 9,
50  -1, -5, -4, -5, 13,
51  -1, 1, 0, 0, -4, 8,
52  -1, 0, 0, 2, -5, 3, 7,
53  0, -3, -1, -2, -4, -3, -3, 8,
54  -2, 0, 1, -2, -4, 1, 0, -3, 11,
55  -2, -5, -5, -5, -2, -4, -5, -6, -5, 6,
56  -2, -3, -5, -5, -2, -3, -4, -5, -4, 2, 6,
57  -1, 3, 0, -1, -5, 2, 1, -2, -1, -4, -4, 7,
58  -1, -2, -3, -5, -2, -1, -3, -4, -2, 2, 3, -2, 8,
59  -3, -4, -5, -5, -4, -5, -5, -5, -2, 0, 1, -5, 0, 9,
60  -1, -3, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -4, -5, 11,
61  2, -1, 1, 0, -1, 0, 0, 0, -1, -4, -4, 0, -2, -4, -1, 6,
62  0, -2, 0, -2, -1, -1, -1, -2, -3, -1, -2, -1, -1, -3, -2, 2, 7,
63  -4, -4, -6, -6, -3, -3, -4, -4, -4, -4, -2, -4, -2, 1, -6, -4, -4, 16,
64  -3, -3, -3, -5, -4, -2, -3, -5, 3, -2, -2, -3, -1, 4, -4, -3, -2, 3, 10,
65  0, -4, -4, -5, -1, -3, -4, -5, -5, 4, 1, -3, 1, -1, -4, -2, 0, -4, -2, 6};
66 
67 /* Index corresponding to the (i,j) entry (i,j=0..19) in the blosum matrix */
68 static int32_t BINDEX(int32_t i, int32_t j)
69 {
70  return (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2));
71 }
72 
73 /*********************
74  * Kernel parameters *
75  *********************/
76 
77 const float64_t SCALING=0.1; /* Factor to scale all SW parameters */
78 
79 /* If you want to compute the sum over all local alignments (to get a valid kernel), uncomment the following line : */
80 /* If x=log(a) and y=log(b), compute log(a+b) : */
81 /*
82 #define LOGP(x,y) (((x)>(y))?(x)+log1p(exp((y)-(x))):(y)+log1p(exp((x)-(y))))
83 */
84 
85 #define LOGP(x,y) LogSum(x,y)
86 
87 /* OR if you want to compute the score of the best local alignment (to get the SW score by Viterbi), uncomment the following line : */
88 /*
89 #define LOGP(x,y) (((x)>(y))?(x):(y))
90 */
91 
92 /* Usefule constants */
93 const float64_t LOG0=-10000; /* log(0) */
94 const float64_t INTSCALE=1000.0; /* critical for speed and precise computation*/
95 
97 
99 : CStringKernel<char>(size)
100 {
101  init();
102  init_static_variables();
103 }
104 
107  float64_t opening, float64_t extension)
108 : CStringKernel<char>()
109 {
110  init();
111  m_opening=opening;
112  m_extension=extension;
113  init_static_variables();
114  init(l, r);
115 }
116 
118 {
119  cleanup();
120 }
121 
122 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
123 {
125  initialized=true;
126  return init_normalizer();
127 }
128 
130 {
131  SG_FREE(scaled_blosum);
132  scaled_blosum=NULL;
133 
134  SG_FREE(isAA);
135  isAA=NULL;
136  SG_FREE(aaIndex);
137  aaIndex=NULL;
138 
140 }
141 
142 /* LogSum - default log funciotion. fast, but not exact */
143 /* LogSum2 - precise, but slow. Note that these two functions need different figure types */
144 
145 void CLocalAlignmentStringKernel::init_logsum(){
146  int32_t i;
147  for (i=0; i<LOGSUM_TBL; i++)
148  logsum_lookup[i]=int32_t(INTSCALE*
149  (log(1.+exp((float32_t)-i/INTSCALE))));
150 }
151 
152 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2)
153 {
154  int32_t diff;
155  static int32_t firsttime=1;
156 
157  if (firsttime)
158  {
159  init_logsum();
160  firsttime =0;
161  }
162 
163  diff=p1-p2;
164  if (diff>=LOGSUM_TBL)
165  return p1;
166  else if (diff<=-LOGSUM_TBL)
167  return p2;
168  else if (diff>0)
169  return p1+logsum_lookup[diff];
170  else
171  return p2+logsum_lookup[-diff];
172 }
173 
174 
175 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2)
176 {
177  if (p1>p2)
178  return (p1-p2>50.) ? p1 : p1+log(1.+exp(p2-p1));
179  else
180  return (p2-p1>50.) ? p2 : p2+log(1.+exp(p1-p2));
181 }
182 
183 
184 void CLocalAlignmentStringKernel::init_static_variables()
185  /* Initialize all static variables. This function should be called once before computing the first pair HMM score */
186 {
187  register int32_t i;
188 
189  /* Initialization of the array which gives the position of each amino-acid in the set of amino-acid */
190  aaIndex = SG_CALLOC(int32_t, NLET);
191  for (i=0;i<NAA;i++)
192  aaIndex[aaList[i]-'A']=i;
193 
194  /* Initialization of the array which indicates whether a char is an amino-acid */
195  isAA = SG_CALLOC(int32_t, 256);
196  for (i=0;i<NAA;i++)
197  isAA[(int32_t)aaList[i]]=1;
198 
199  /* Scale the blossum matrix */
200  for (i=0 ; i<NAA*(NAA+1)/2; i++)
201  scaled_blosum[i]=(int32_t)floor(blosum[i]*SCALING*INTSCALE);
202 
203 
204  /* Scale of gap penalties */
205  m_opening=(int32_t)floor(m_opening*SCALING*INTSCALE);
206  m_extension=(int32_t)floor(m_extension*SCALING*INTSCALE);
207 }
208 
209 
210 
211 /* Implementation of the
212  * convolution kernel which generalizes the Smith-Waterman algorithm
213  */
214 float64_t CLocalAlignmentStringKernel::LAkernelcompute(
215  int32_t* aaX, int32_t* aaY, /* the two amino-acid sequences (as sequences of indexes in [0..NAA-1] indicating the position of the amino-acid in the variable 'aaList') */
216  int32_t nX, int32_t nY /* the lengths of both sequences */)
217 {
218  register int32_t
219  i,j, /* loop indexes */
220  cur, old, /* to indicate the array to use (0 or 1) */
221  curpos, frompos; /* position in an array */
222 
223  int32_t
224  *logX, /* arrays to store the log-values of each state */
225  *logY,
226  *logM,
227  *logX2,
228  *logY2;
229 
230  int32_t aux, aux2;/* , aux3 , aux4 , aux5;*/
231  int32_t cl;/* length of a column for the dynamic programming */
232 
233  /*
234  printf("now computing pairHMM between %d and %d:\n",nX,nY);
235  for (i=0;i<nX;printf("%d ",aaX[i++]));
236  printf("\n and \n");
237  for (i=0;i<nY;printf("%d ",aaY[i++]));
238  printf("\n");
239  */
240 
241  /* Initialization of the arrays */
242  /* Each array stores two successive columns of the (nX+1)x(nY+1) table used in dynamic programming */
243  cl=nY+1; /* each column stores the positions in the aaY sequence, plus a position at zero */
244 
245  logM=SG_CALLOC(int32_t, 2*cl);
246  logX=SG_CALLOC(int32_t, 2*cl);
247  logY=SG_CALLOC(int32_t, 2*cl);
248  logX2=SG_CALLOC(int32_t, 2*cl);
249  logY2=SG_CALLOC(int32_t, 2*cl);
250 
251  /************************************************/
252  /* First iteration : initialization of column 0 */
253  /************************************************/
254  /* The log=proabilities of each state are initialized for the first column (x=0,y=0..nY) */
255 
256  for (j=0; j<cl; j++) {
257  logM[j]=LOG0;
258  logX[j]=LOG0;
259  logY[j]=LOG0;
260  logX2[j]=LOG0;
261  logY2[j]=LOG0;
262  }
263 
264  /* Update column order */
265  cur=1; /* Indexes [0..cl-1] are used to process the next column */
266  old=0; /* Indexes [cl..2*cl-1] were used for column 0 */
267 
268 
269  /************************************************/
270  /* Next iterations : processing columns 1 .. nX */
271  /************************************************/
272 
273  /* Main loop to vary the position in aaX : i=1..nX */
274  for (i=1; i<=nX; i++) {
275 
276  /* Special update for positions (i=1..nX,j=0) */
277  curpos=cur*cl; /* index of the state (i,0) */
278  logM[curpos]=LOG0;
279  logX[curpos]=LOG0;
280  logY[curpos]=LOG0;
281  logX2[curpos]=LOG0;
282  logY2[curpos]=LOG0;
283 
284  /* Secondary loop to vary the position in aaY : j=1..nY */
285  for (j=1; j<=nY; j++) {
286 
287  curpos=cur*cl+j; /* index of the state (i,j) */
288 
289  /* Update for states which emit X only */
290  /***************************************/
291 
292  frompos=old*cl+j; /* index of the state (i-1,j) */
293 
294  /* State RX */
295  logX[curpos]=LOGP(-m_opening+logM[frompos], -m_extension+logX[frompos]);
296  /* printf("%.5f\n",logX[curpos]);*/
297  /* printf("%.5f\n",logX_B[curpos]);*/
298  /* State RX2 */
299  logX2[curpos]=LOGP(logM[frompos], logX2[frompos]);
300 
301  /* Update for states which emit Y only */
302  /***************************************/
303 
304  frompos=cur*cl+j-1; /* index of the state (i,j-1) */
305 
306  /* State RY */
307  aux=LOGP(-m_opening+logM[frompos], -m_extension+logY[frompos]);
308  logY[curpos] = LOGP(aux, -m_opening+logX[frompos]);
309 
310  /* State RY2 */
311  aux=LOGP(logM[frompos], logY2[frompos]);
312  logY2[curpos]=LOGP(aux, logX2[frompos]);
313 
314  /* Update for states which emit X and Y */
315  /****************************************/
316 
317  frompos=old*cl+j-1; /* index of the state (i-1,j-1) */
318 
319  aux=LOGP(logX[frompos], logY[frompos]);
320  aux2=LOGP(0, logM[frompos]);
321  logM[curpos]=LOGP(aux, aux2)+scaled_blosum[BINDEX(aaX[i-1], aaY[j-1])];
322 
323  /*
324  printf("i=%d , j=%d\nM=%.5f\nX=%.5f\nY=%.5f\nX2=%.5f\nY2=%.5f\n",i,j,logM[curpos],logX[curpos],logY[curpos],logX2[curpos],logY2[curpos]);
325  */
326 
327  } /* end of j=1:nY loop */
328 
329 
330  /* Update the culumn order */
331  cur=1-cur;
332  old=1-old;
333 
334  } /* end of j=1:nX loop */
335 
336 
337  /* Termination */
338  /***************/
339 
340  curpos=old*cl+nY; /* index of the state (nX,nY) */
341  aux=LOGP(logX2[curpos], logY2[curpos]);
342  aux2=LOGP(0, logM[curpos]);
343  /* kernel_value = LOGP( aux , aux2 );*/
344 
345  /* Memory release */
346  SG_FREE(logM);
347  SG_FREE(logX);
348  SG_FREE(logY);
349  SG_FREE(logX2);
350  SG_FREE(logY2);
351 
352  /* Return the logarithm of the kernel */
353  return (float32_t)LOGP(aux,aux2)/INTSCALE;
354 }
355 
356 /********************/
357 /* Public functions */
358 /********************/
359 
360 
361 /* Return the log-probability of two sequences x and y under a pair HMM model */
362 /* x and y are strings of aminoacid letters, e.g., "AABRS" */
363 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y)
364 {
365  int32_t *aax, *aay; /* to convert x and y into sequences of amino-acid indexes */
366  int32_t lx=0, ly=0; /* lengths of x and y */
367  int32_t i, j;
368 
369  bool free_x, free_y;
370  char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x);
371  char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y);
372  ASSERT(x && y)
373 
374  if ( (lx<1) || (ly<1) )
375  SG_ERROR("empty chain")
376 
377  /* Create aax and aay */
378 
379  aax = SG_CALLOC(int32_t, lx);
380  aay = SG_CALLOC(int32_t, ly);
381 
382  /* Extract the characters corresponding to aminoacids and keep their indexes */
383 
384  j=0;
385  for (i=0; i<lx; i++)
386  if (isAA[toupper(x[i])])
387  aax[j++]=aaIndex[toupper(x[i])-'A'];
388  lx=j;
389  j=0;
390  for (i=0; i<ly; i++)
391  if (isAA[toupper(y[i])])
392  aay[j++]=aaIndex[toupper(y[i])-'A'];
393  ly=j;
394 
395 
396  /* Compute the pair HMM score */
397  float64_t result=LAkernelcompute(aax, aay, lx, ly);
398 
399  /* Release memory */
400  SG_FREE(aax);
401  SG_FREE(aay);
402 
403  ((CStringFeatures<char>*)lhs)->free_feature_vector(x, idx_x, free_x);
404  ((CStringFeatures<char>*)rhs)->free_feature_vector(y, idx_y, free_y);
405 
406  return result;
407 }
408 
409 void CLocalAlignmentStringKernel::init()
410 {
412 
413  initialized=false;
414  isAA=NULL;
415  aaIndex=NULL;
416 
417  m_opening=10;
418  m_extension=2;
419 
420  scaled_blosum=SG_CALLOC(int32_t, sizeof(blosum));
421  init_logsum();
422 
423  SG_ADD(&initialized, "initialized", "If kernel is initalized.",
425  SG_ADD(&m_opening, "opening", "Opening gap opening penalty.", MS_AVAILABLE);
426  SG_ADD(&m_extension, "extension", "Extension gap extension penalty.",
427  MS_AVAILABLE);
428 }
virtual void cleanup()
Definition: Kernel.cpp:173
static int32_t BINDEX(int32_t i, int32_t j)
const float64_t INTSCALE
virtual bool set_normalizer(CKernelNormalizer *normalizer)
Definition: Kernel.cpp:150
#define SG_ERROR(...)
Definition: SGIO.h:129
const int32_t NAA
const int32_t NLET
#define ASSERT(x)
Definition: SGIO.h:201
const int32_t LOGSUM_TBL
double float64_t
Definition: common.h:50
virtual bool init_normalizer()
Definition: Kernel.cpp:168
float float32_t
Definition: common.h:49
CFeatures * rhs
feature vectors to occur on right hand side
Definition: Kernel.h:1062
virtual float64_t compute(int32_t idx_a, int32_t idx_b)
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual bool init(CFeatures *l, CFeatures *r)
const float64_t SCALING
const float64_t LOG0
CFeatures * lhs
feature vectors to occur on left hand side
Definition: Kernel.h:1060
#define LOGP(x, y)
The class Features is the base class of all feature objects.
Definition: Features.h:68
#define SG_ADD(...)
Definition: SGObject.h:84
friend class CSqrtDiagKernelNormalizer
Definition: Kernel.h:162
Template class StringKernel, is the base class of all String Kernels.
Definition: StringKernel.h:26

SHOGUN Machine Learning Toolbox - Documentation