LocalAlignmentStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * Compute the local alignment kernel
00003  *
00004  * Largely based on LAkernel.c (version 0.3)
00005  *
00006  * Copyright 2003 Jean-Philippe Vert
00007  * Copyright 2005 Jean-Philippe Vert, Hiroto Saigo
00008  *
00009  * Shogun specific adjustments Written (W) 2007-2008,2010 Soeren Sonnenburg
00010  * 
00011  * Reference:
00012  * H. Saigo, J.-P. Vert, T. Akutsu and N. Ueda, "Protein homology
00013  * detection using string alignment kernels", Bioinformatics,
00014  * vol.20, p.1682-1689, 2004.
00015  * 
00016  * This program is free software; you can redistribute it and/or modify
00017  * it under the terms of the GNU General Public License as published by
00018  * the Free Software Foundation; either version 3 of the License, or
00019  * (at your option) any later version.
00020  */
00021 
00022 #include <stdlib.h>
00023 #include <stdio.h>
00024 #include <math.h>
00025 #include <ctype.h>
00026 #include <string.h>
00027 #include <shogun/kernel/LocalAlignmentStringKernel.h>
00028 
00029 using namespace shogun;
00030 
00031 /****************/
00032 /* The alphabet */
00033 /****************/
00034 
00035 #define NAA 20                                  /* Number of amino-acids */
00036 #define NLET 26                                 /* Number of letters in the alphabet */
00037 const char* CLocalAlignmentStringKernel::aaList= "ARNDCQEGHILKMFPSTWYV";    /* The list of amino acids */
00038 
00039 /*****************/
00040 /* SW parameters */
00041 /*****************/
00042 
00043 /* mutation matrix */
00044 const int32_t CLocalAlignmentStringKernel::blosum[] = {
00045   6,
00046  -2,   8,
00047  -2,  -1,   9,
00048   -3,  -2,   2,   9,
00049   -1,  -5,  -4,  -5,  13,
00050   -1,   1,   0,   0,  -4,   8,
00051   -1,   0,   0,   2,  -5,   3,   7,
00052   0,  -3,  -1,  -2,  -4,  -3,  -3,   8,
00053   -2,   0,   1,  -2,  -4,   1,   0,  -3,  11,
00054   -2,  -5,  -5,  -5,  -2,  -4,  -5,  -6,  -5,   6,
00055   -2,  -3,  -5,  -5,  -2,  -3,  -4,  -5,  -4,   2,   6,
00056   -1,   3,   0,  -1,  -5,   2,   1,  -2,  -1,  -4,  -4,   7,
00057   -1,  -2,  -3,  -5,  -2,  -1,  -3,  -4,  -2,   2,   3,  -2,   8,
00058   -3,  -4,  -5,  -5,  -4,  -5,  -5,  -5,  -2,   0,   1,  -5,   0,   9,
00059   -1,  -3,  -3,  -2,  -4,  -2,  -2,  -3,  -3,  -4,  -4,  -2,  -4,  -5,  11,
00060   2,  -1,   1,   0,  -1,   0,   0,   0,  -1,  -4,  -4,   0,  -2,  -4,  -1,   6,
00061   0,  -2,   0,  -2,  -1,  -1,  -1,  -2,  -3,  -1,  -2,  -1,  -1,  -3,  -2,   2,   7,
00062   -4,  -4,  -6,  -6,  -3,  -3,  -4,  -4,  -4,  -4,  -2,  -4,  -2,   1,  -6,  -4,  -4,  16,
00063   -3,  -3,  -3,  -5,  -4,  -2,  -3,  -5,   3,  -2,  -2,  -3,  -1,   4,  -4,  -3,  -2,   3,  10,
00064   0,  -4,  -4,  -5,  -1,  -3,  -4,  -5,  -5,   4,   1,  -3,   1,  -1,  -4,  -2,   0,  -4,  -2,   6};
00065 
00066 /* Index corresponding to the (i,j) entry (i,j=0..19) in the blosum matrix */
00067 #define BINDEX(i,j) (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2))
00068 
00069 /*********************
00070  * Kernel parameters *
00071  *********************/
00072 
00073 #define SCALING 0.1           /* Factor to scale all SW parameters */
00074 
00075 /* If you want to compute the sum over all local alignments (to get a valid kernel), uncomment the following line : */
00076 /* If x=log(a) and y=log(b), compute log(a+b) : */
00077 /*
00078 #define LOGP(x,y) (((x)>(y))?(x)+log1p(exp((y)-(x))):(y)+log1p(exp((x)-(y))))
00079 */
00080 
00081 #define LOGP(x,y) LogSum(x,y)
00082 
00083 /* OR if you want to compute the score of the best local alignment (to get the SW score by Viterbi), uncomment the following line : */
00084 /*
00085 #define LOGP(x,y) (((x)>(y))?(x):(y))
00086 */
00087 
00088 /* Usefule constants */
00089 #define LOG0 -10000          /* log(0) */
00090 #define INTSCALE 1000.0      /* critical for speed and precise computation*/
00091 
00092 int32_t CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL];
00093 
00094 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(int32_t size)
00095 : CStringKernel<char>(size)
00096 {
00097     init();
00098     init_static_variables();
00099 }
00100 
00101 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(
00102     CStringFeatures<char>* l, CStringFeatures<char>* r,
00103     float64_t opening, float64_t extension)
00104 : CStringKernel<char>()
00105 {
00106     init();
00107     m_opening=opening;
00108     m_extension=extension;
00109     init_static_variables();
00110     init(l, r);
00111 }
00112 
00113 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel()
00114 {
00115     cleanup();
00116 }
00117 
00118 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
00119 {
00120     CStringKernel<char>::init(l, r);
00121     initialized = true;
00122     return init_normalizer();
00123 }
00124 
00125 void CLocalAlignmentStringKernel::cleanup()
00126 {
00127     SG_FREE(scaled_blosum);
00128     scaled_blosum=NULL;
00129 
00130     SG_FREE(isAA);
00131     isAA=NULL;
00132     SG_FREE(aaIndex);
00133     aaIndex=NULL;
00134 
00135     CKernel::cleanup();
00136 }
00137 
00138 /* LogSum - default log funciotion. fast, but not exact */
00139 /* LogSum2 - precise, but slow. Note that these two functions need different figure types  */
00140 
00141 void CLocalAlignmentStringKernel::init_logsum(void){
00142   int32_t i;
00143   for (i = 0; i < LOGSUM_TBL; i++) 
00144     logsum_lookup[i] = (int32_t) (INTSCALE*
00145                    (log(1.+exp( (float32_t) -i/INTSCALE))));
00146 }
00147 
00148 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2)
00149 {
00150     int32_t diff;
00151     static int32_t firsttime=1;
00152 
00153     if (firsttime)
00154     {
00155         init_logsum();
00156         firsttime =0;
00157     }
00158 
00159     diff=p1-p2;
00160     if (diff>=LOGSUM_TBL) return p1;
00161     else if (diff<=-LOGSUM_TBL) return p2;
00162     else if (diff>0) return p1+logsum_lookup[diff];
00163     else return p2+logsum_lookup[-diff];
00164 }
00165 
00166 
00167 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2)
00168 {
00169   if (p1 > p2)
00170     return (p1-p2 > 50.) ? p1 : p1 + log(1. + exp(p2-p1));
00171   else
00172     return (p2-p1 > 50.) ? p2 : p2 + log(1. + exp(p1-p2));
00173 }
00174 
00175 
00176 void CLocalAlignmentStringKernel::init_static_variables()
00177      /* Initialize all static variables. This function should be called once before computing the first pair HMM score */
00178 {
00179   register int32_t i;
00180 
00181   /* Initialization of the array which gives the position of each amino-acid in the set of amino-acid */
00182   if ((aaIndex=(int32_t *)calloc(NLET,sizeof(int32_t))) == NULL)
00183     SG_ERROR("run out o memory");
00184   for (i=0;i<NAA;i++) 
00185     aaIndex[aaList[i]-'A']=i;
00186   
00187   /* Initialization of the array which indicates whether a char is an amino-acid */
00188   if ((isAA=(int32_t *)calloc(256,sizeof(int32_t))) == NULL)
00189     SG_ERROR("run out of memory");
00190   for (i=0;i<NAA;i++) 
00191     isAA[(int32_t)aaList[i]]=1;
00192 
00193   /* Scale the blossum matrix */
00194   for (i=0 ; i<NAA*(NAA+1)/2; i++)
00195       scaled_blosum[i] = (int32_t) floor(blosum[i]*SCALING*INTSCALE);
00196 
00197 
00198   /* Scale of gap penalties */
00199   m_opening = (int32_t) floor(m_opening * SCALING*INTSCALE);
00200   m_extension = (int32_t) floor(m_extension * SCALING*INTSCALE);
00201 }
00202 
00203 
00204 
00205 /* Implementation of the
00206  * convolution kernel which generalizes the Smith-Waterman algorithm
00207  */
00208 float64_t CLocalAlignmentStringKernel::LAkernelcompute(
00209     int32_t* aaX, int32_t* aaY, /* the two amino-acid sequences (as sequences of indexes in [0..NAA-1] indicating the position of the amino-acid in the variable 'aaList') */
00210     int32_t nX, int32_t nY /* the lengths of both sequences */)
00211 {
00212    register int32_t
00213     i,j,                /* loop indexes */
00214     cur, old,           /* to indicate the array to use (0 or 1) */
00215     curpos, frompos;    /* position in an array */
00216 
00217    int32_t
00218     *logX,           /* arrays to store the log-values of each state */
00219     *logY,
00220     *logM,
00221     *logX2,
00222     *logY2,
00223 
00224     aux , aux2;/* , aux3 , aux4 , aux5;*/
00225   int32_t
00226     cl;                /* length of a column for the dynamic programming */
00227 
00228   /*
00229   printf("now computing pairHMM between %d and %d:\n",nX,nY);
00230   for (i=0;i<nX;printf("%d ",aaX[i++]));
00231   printf("\n and \n");
00232   for (i=0;i<nY;printf("%d ",aaY[i++]));
00233   printf("\n");
00234   */
00235 
00236   /* Initialization of the arrays */
00237   /* Each array stores two successive columns of the (nX+1)x(nY+1) table used in dynamic programming */
00238   cl = nY+1;           /* each column stores the positions in the aaY sequence, plus a position at zero */
00239 
00240   logM=SG_MALLOC(int32_t, 2*cl);
00241   logX=SG_MALLOC(int32_t, 2*cl);
00242   logY=SG_MALLOC(int32_t, 2*cl);
00243   logX2=SG_MALLOC(int32_t, 2*cl);
00244   logY2=SG_MALLOC(int32_t, 2*cl);
00245 
00246   /************************************************/
00247   /* First iteration : initialization of column 0 */
00248   /************************************************/
00249   /* The log=proabilities of each state are initialized for the first column (x=0,y=0..nY) */
00250 
00251   for (j=0;j<cl;j++) {
00252     logM[j]=LOG0;
00253     logX[j]=LOG0;
00254     logY[j]=LOG0;
00255     logX2[j]=LOG0;
00256     logY2[j]=LOG0;
00257 
00258   }
00259 
00260   /* Update column order */
00261   cur = 1;      /* Indexes [0..cl-1] are used to process the next column */
00262   old = 0;      /* Indexes [cl..2*cl-1] were used for column 0 */
00263 
00264 
00265   /************************************************/
00266   /* Next iterations : processing columns 1 .. nX */
00267   /************************************************/
00268 
00269   /* Main loop to vary the position in aaX : i=1..nX */
00270   for (i=1;i<=nX;i++) {
00271 
00272     /* Special update for positions (i=1..nX,j=0) */
00273     curpos = cur*cl;                  /* index of the state (i,0) */
00274     logM[curpos] = LOG0; 
00275     logX[curpos] = LOG0; 
00276     logY[curpos] = LOG0; 
00277     logX2[curpos] = LOG0; 
00278     logY2[curpos] = LOG0; 
00279 
00280     /* Secondary loop to vary the position in aaY : j=1..nY */
00281     for (j=1;j<=nY;j++) {
00282 
00283       curpos = cur*cl + j;            /* index of the state (i,j) */
00284 
00285       /* Update for states which emit X only */
00286       /***************************************/
00287 
00288       frompos = old*cl + j;            /* index of the state (i-1,j) */
00289       
00290       /* State RX */
00291       logX[curpos] = LOGP( - m_opening + logM[frompos] , - m_extension + logX[frompos] );
00292       /*      printf("%.5f\n",logX[curpos]);*/
00293       /*      printf("%.5f\n",logX_B[curpos]);*/
00294       /* State RX2 */
00295       logX2[curpos] = LOGP( logM[frompos] , logX2[frompos] );
00296 
00297       /* Update for states which emit Y only */
00298       /***************************************/
00299 
00300       frompos = cur*cl + j-1;          /* index of the state (i,j-1) */
00301 
00302       /* State RY */
00303       aux = LOGP( - m_opening + logM[frompos] , - m_extension + logY[frompos] );
00304       logY[curpos] = LOGP( aux , - m_opening + logX[frompos] );
00305 
00306       /* State RY2 */
00307       aux = LOGP( logM[frompos] , logY2[frompos] );
00308       logY2[curpos] = LOGP( aux , logX2[frompos] );
00309 
00310       /* Update for states which emit X and Y */
00311       /****************************************/
00312 
00313       frompos = old*cl + j-1;          /* index of the state (i-1,j-1) */
00314 
00315       aux = LOGP( logX[frompos] , logY[frompos] );
00316       aux2 = LOGP( 0 , logM[frompos] );
00317       logM[curpos] = LOGP( aux , aux2 ) + scaled_blosum[ BINDEX( aaX[i-1] , aaY[j-1] ) ];
00318       
00319       /*      printf("i=%d , j=%d\nM=%.5f\nX=%.5f\nY=%.5f\nX2=%.5f\nY2=%.5f\n",i,j,logM[curpos],logX[curpos],logY[curpos],logX2[curpos],logY2[curpos]);
00320        */
00321 
00322     }  /* end of j=1:nY loop */
00323 
00324 
00325     /* Update the culumn order */
00326     cur = 1-cur;
00327     old = 1-old;
00328 
00329   }  /* end of j=1:nX loop */
00330 
00331 
00332   /* Termination */
00333   /***************/
00334 
00335   curpos = old*cl + nY;                /* index of the state (nX,nY) */
00336   aux = LOGP( logX2[curpos] , logY2[curpos] );
00337   aux2 = LOGP( 0 , logM[curpos] );
00338   /*  kernel_value = LOGP( aux , aux2 );*/
00339 
00340   /* Memory release */
00341     SG_FREE(logM);
00342     SG_FREE(logX);
00343     SG_FREE(logY);
00344     SG_FREE(logX2);
00345     SG_FREE(logY2);
00346 
00347   /* Return the logarithm of the kernel */
00348   return (float32_t)LOGP(aux,aux2)/INTSCALE;
00349 }
00350 
00351 /********************/
00352 /* Public functions */
00353 /********************/
00354 
00355 
00356 /* Return the log-probability of two sequences x and y under a pair HMM model */
00357 /* x and y are strings of aminoacid letters, e.g., "AABRS" */
00358 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y)
00359 {
00360   int32_t *aax,*aay;  /* to convert x and y into sequences of amino-acid indexes */
00361   int32_t lx=0,ly=0;       /* lengths of x and y */
00362   int32_t i,j;
00363 
00364   bool free_x, free_y;
00365   char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x);
00366   char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y);
00367   ASSERT(x && y);
00368 
00369   if ((lx<1) || (ly<1))
00370     SG_ERROR("empty chain");
00371 
00372   /* Create aax and aay */
00373 
00374   if ((aax=(int32_t *)calloc(lx,sizeof(int32_t))) == NULL)
00375     SG_ERROR("run out of memory");
00376   if ((aay=(int32_t *)calloc(ly,sizeof(int32_t))) == NULL)
00377     SG_ERROR("run out of memory");
00378 
00379   /* Extract the characters corresponding to aminoacids and keep their indexes */
00380 
00381   j=0;
00382   for (i=0 ; i<lx ; i++) 
00383     if (isAA[toupper(x[i])])
00384       aax[j++] = aaIndex[toupper(x[i])-'A'];
00385   lx = j;
00386   j=0;
00387   for (i=0 ; i<ly ; i++)
00388     if (isAA[toupper(y[i])])
00389       aay[j++] = aaIndex[toupper(y[i])-'A'];
00390   ly = j;
00391 
00392 
00393   /* Compute the pair HMM score */
00394   float64_t result=LAkernelcompute(aax,aay,lx,ly);
00395 
00396   /* Release memory */
00397   SG_FREE(aax);
00398   SG_FREE(aay);
00399 
00400   ((CStringFeatures<char>*) lhs)->free_feature_vector(x, idx_x, free_x);
00401   ((CStringFeatures<char>*) rhs)->free_feature_vector(y, idx_y, free_y);
00402 
00403   return result;
00404 }
00405 
00406 void CLocalAlignmentStringKernel::init()
00407 {
00408     initialized = false;
00409     isAA = NULL;
00410     aaIndex = NULL;
00411 
00412     m_opening = 10;
00413     m_extension = 2;
00414 
00415     scaled_blosum=SG_MALLOC(int32_t, sizeof(blosum));
00416     init_logsum();
00417 
00418     m_parameters->add(&initialized, "initialized", "if kernel is initalized");
00419     m_parameters->add(&m_opening, "opening", "opening gap opening penalty");
00420     m_parameters->add(&m_extension, "extension", "extension gap extension penalty");
00421 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation