SHOGUN: LocalAlignmentStringKernel.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * Compute the local alignment kernel
00003  *
00004  * Largely based on LAkernel.c (version 0.3)
00005  *
00006  * Copyright 2003 Jean-Philippe Vert
00007  * Copyright 2005 Jean-Philippe Vert, Hiroto Saigo
00008  *
00009  * Shogun specific adjustments Written (W) 2007-2008,2010 Soeren Sonnenburg
00010  *
00011  * Reference:
00012  * H. Saigo, J.-P. Vert, T. Akutsu and N. Ueda, "Protein homology
00013  * detection using string alignment kernels", Bioinformatics,
00014  * vol.20, p.1682-1689, 2004.
00015  *
00016  * This program is free software; you can redistribute it and/or modify
00017  * it under the terms of the GNU General Public License as published by
00018  * the Free Software Foundation; either version 3 of the License, or
00019  * (at your option) any later version.
00020  */
00021 
00022 #include <stdlib.h>
00023 #include <stdio.h>
00024 #include <math.h>
00025 #include <ctype.h>
00026 #include <string.h>
00027 #include <shogun/kernel/string/LocalAlignmentStringKernel.h>
00028 
00029 using namespace shogun;
00030 
00031 /****************/
00032 /* The alphabet */
00033 /****************/
00034 
00035 const int32_t NAA=20;                                  /* Number of amino-acids */
00036 const int32_t NLET=26;                                 /* Number of letters in the alphabet */
00037 const char* CLocalAlignmentStringKernel::aaList="ARNDCQEGHILKMFPSTWYV";    /* The list of amino acids */
00038 
00039 /*****************/
00040 /* SW parameters */
00041 /*****************/
00042 
00043 /* mutation matrix */
00044 const int32_t CLocalAlignmentStringKernel::blosum[] = {
00045   6,
00046  -2,   8,
00047  -2,  -1,   9,
00048   -3,  -2,   2,   9,
00049   -1,  -5,  -4,  -5,  13,
00050   -1,   1,   0,   0,  -4,   8,
00051   -1,   0,   0,   2,  -5,   3,   7,
00052   0,  -3,  -1,  -2,  -4,  -3,  -3,   8,
00053   -2,   0,   1,  -2,  -4,   1,   0,  -3,  11,
00054   -2,  -5,  -5,  -5,  -2,  -4,  -5,  -6,  -5,   6,
00055   -2,  -3,  -5,  -5,  -2,  -3,  -4,  -5,  -4,   2,   6,
00056   -1,   3,   0,  -1,  -5,   2,   1,  -2,  -1,  -4,  -4,   7,
00057   -1,  -2,  -3,  -5,  -2,  -1,  -3,  -4,  -2,   2,   3,  -2,   8,
00058   -3,  -4,  -5,  -5,  -4,  -5,  -5,  -5,  -2,   0,   1,  -5,   0,   9,
00059   -1,  -3,  -3,  -2,  -4,  -2,  -2,  -3,  -3,  -4,  -4,  -2,  -4,  -5,  11,
00060   2,  -1,   1,   0,  -1,   0,   0,   0,  -1,  -4,  -4,   0,  -2,  -4,  -1,   6,
00061   0,  -2,   0,  -2,  -1,  -1,  -1,  -2,  -3,  -1,  -2,  -1,  -1,  -3,  -2,   2,   7,
00062   -4,  -4,  -6,  -6,  -3,  -3,  -4,  -4,  -4,  -4,  -2,  -4,  -2,   1,  -6,  -4,  -4,  16,
00063   -3,  -3,  -3,  -5,  -4,  -2,  -3,  -5,   3,  -2,  -2,  -3,  -1,   4,  -4,  -3,  -2,   3,  10,
00064   0,  -4,  -4,  -5,  -1,  -3,  -4,  -5,  -5,   4,   1,  -3,   1,  -1,  -4,  -2,   0,  -4,  -2,   6};
00065 
00066 /* Index corresponding to the (i,j) entry (i,j=0..19) in the blosum matrix */
00067 static int32_t BINDEX(int32_t i, int32_t j)
00068 {
00069     return (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2));
00070 }
00071 
00072 /*********************
00073  * Kernel parameters *
00074  *********************/
00075 
00076 const float64_t SCALING=0.1;           /* Factor to scale all SW parameters */
00077 
00078 /* If you want to compute the sum over all local alignments (to get a valid kernel), uncomment the following line : */
00079 /* If x=log(a) and y=log(b), compute log(a+b) : */
00080 /*
00081 #define LOGP(x,y) (((x)>(y))?(x)+log1p(exp((y)-(x))):(y)+log1p(exp((x)-(y))))
00082 */
00083 
00084 #define LOGP(x,y) LogSum(x,y)
00085 
00086 /* OR if you want to compute the score of the best local alignment (to get the SW score by Viterbi), uncomment the following line : */
00087 /*
00088 #define LOGP(x,y) (((x)>(y))?(x):(y))
00089 */
00090 
00091 /* Usefule constants */
00092 const float64_t LOG0=-10000;          /* log(0) */
00093 const float64_t INTSCALE=1000.0;      /* critical for speed and precise computation*/
00094 
00095 int32_t CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL];
00096 
00097 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(int32_t size)
00098 : CStringKernel<char>(size)
00099 {
00100     SG_UNSTABLE("LocalAlignmentStringKernel");
00101     init();
00102     init_static_variables();
00103 }
00104 
00105 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(
00106     CStringFeatures<char>* l, CStringFeatures<char>* r,
00107     float64_t opening, float64_t extension)
00108 : CStringKernel<char>()
00109 {
00110     SG_UNSTABLE("LocalAlignmentStringKernel");
00111     init();
00112     m_opening=opening;
00113     m_extension=extension;
00114     init_static_variables();
00115     init(l, r);
00116 }
00117 
00118 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel()
00119 {
00120     cleanup();
00121 }
00122 
00123 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
00124 {
00125     CStringKernel<char>::init(l, r);
00126     initialized=true;
00127     return init_normalizer();
00128 }
00129 
00130 void CLocalAlignmentStringKernel::cleanup()
00131 {
00132     SG_FREE(scaled_blosum);
00133     scaled_blosum=NULL;
00134 
00135     SG_FREE(isAA);
00136     isAA=NULL;
00137     SG_FREE(aaIndex);
00138     aaIndex=NULL;
00139 
00140     CKernel::cleanup();
00141 }
00142 
00143 /* LogSum - default log funciotion. fast, but not exact */
00144 /* LogSum2 - precise, but slow. Note that these two functions need different figure types  */
00145 
00146 void CLocalAlignmentStringKernel::init_logsum(){
00147     int32_t i;
00148     for (i=0; i<LOGSUM_TBL; i++)
00149         logsum_lookup[i]=int32_t(INTSCALE*
00150                 (log(1.+exp((float32_t)-i/INTSCALE))));
00151 }
00152 
00153 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2)
00154 {
00155     int32_t diff;
00156     static int32_t firsttime=1;
00157 
00158     if (firsttime)
00159     {
00160         init_logsum();
00161         firsttime =0;
00162     }
00163 
00164     diff=p1-p2;
00165     if (diff>=LOGSUM_TBL)
00166         return p1;
00167     else if (diff<=-LOGSUM_TBL)
00168         return p2;
00169     else if (diff>0)
00170         return p1+logsum_lookup[diff];
00171     else
00172         return p2+logsum_lookup[-diff];
00173 }
00174 
00175 
00176 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2)
00177 {
00178     if (p1>p2)
00179         return (p1-p2>50.) ? p1 : p1+log(1.+exp(p2-p1));
00180     else
00181         return (p2-p1>50.) ? p2 : p2+log(1.+exp(p1-p2));
00182 }
00183 
00184 
00185 void CLocalAlignmentStringKernel::init_static_variables()
00186      /* Initialize all static variables. This function should be called once before computing the first pair HMM score */
00187 {
00188     register int32_t i;
00189 
00190     /* Initialization of the array which gives the position of each amino-acid in the set of amino-acid */
00191     aaIndex = SG_CALLOC(int32_t, NLET);
00192     for (i=0;i<NAA;i++)
00193         aaIndex[aaList[i]-'A']=i;
00194 
00195     /* Initialization of the array which indicates whether a char is an amino-acid */
00196     isAA = SG_CALLOC(int32_t, 256);
00197     for (i=0;i<NAA;i++)
00198         isAA[(int32_t)aaList[i]]=1;
00199 
00200     /* Scale the blossum matrix */
00201     for (i=0 ; i<NAA*(NAA+1)/2; i++)
00202         scaled_blosum[i]=(int32_t)floor(blosum[i]*SCALING*INTSCALE);
00203 
00204 
00205     /* Scale of gap penalties */
00206     m_opening=(int32_t)floor(m_opening*SCALING*INTSCALE);
00207     m_extension=(int32_t)floor(m_extension*SCALING*INTSCALE);
00208 }
00209 
00210 
00211 
00212 /* Implementation of the
00213  * convolution kernel which generalizes the Smith-Waterman algorithm
00214  */
00215 float64_t CLocalAlignmentStringKernel::LAkernelcompute(
00216     int32_t* aaX, int32_t* aaY, /* the two amino-acid sequences (as sequences of indexes in [0..NAA-1] indicating the position of the amino-acid in the variable 'aaList') */
00217     int32_t nX, int32_t nY /* the lengths of both sequences */)
00218 {
00219     register int32_t
00220     i,j,                /* loop indexes */
00221     cur, old,           /* to indicate the array to use (0 or 1) */
00222     curpos, frompos;    /* position in an array */
00223 
00224     int32_t
00225     *logX,           /* arrays to store the log-values of each state */
00226     *logY,
00227     *logM,
00228     *logX2,
00229     *logY2;
00230 
00231     int32_t aux, aux2;/* , aux3 , aux4 , aux5;*/
00232     int32_t cl;/* length of a column for the dynamic programming */
00233 
00234     /*
00235     printf("now computing pairHMM between %d and %d:\n",nX,nY);
00236     for (i=0;i<nX;printf("%d ",aaX[i++]));
00237     printf("\n and \n");
00238     for (i=0;i<nY;printf("%d ",aaY[i++]));
00239     printf("\n");
00240     */
00241 
00242     /* Initialization of the arrays */
00243     /* Each array stores two successive columns of the (nX+1)x(nY+1) table used in dynamic programming */
00244     cl=nY+1;           /* each column stores the positions in the aaY sequence, plus a position at zero */
00245 
00246     logM=SG_CALLOC(int32_t, 2*cl);
00247     logX=SG_CALLOC(int32_t, 2*cl);
00248     logY=SG_CALLOC(int32_t, 2*cl);
00249     logX2=SG_CALLOC(int32_t, 2*cl);
00250     logY2=SG_CALLOC(int32_t, 2*cl);
00251 
00252     /************************************************/
00253     /* First iteration : initialization of column 0 */
00254     /************************************************/
00255     /* The log=proabilities of each state are initialized for the first column (x=0,y=0..nY) */
00256 
00257     for (j=0; j<cl; j++) {
00258         logM[j]=LOG0;
00259         logX[j]=LOG0;
00260         logY[j]=LOG0;
00261         logX2[j]=LOG0;
00262         logY2[j]=LOG0;
00263     }
00264 
00265     /* Update column order */
00266     cur=1;      /* Indexes [0..cl-1] are used to process the next column */
00267     old=0;      /* Indexes [cl..2*cl-1] were used for column 0 */
00268 
00269 
00270     /************************************************/
00271     /* Next iterations : processing columns 1 .. nX */
00272     /************************************************/
00273 
00274     /* Main loop to vary the position in aaX : i=1..nX */
00275     for (i=1; i<=nX; i++) {
00276 
00277         /* Special update for positions (i=1..nX,j=0) */
00278         curpos=cur*cl;                  /* index of the state (i,0) */
00279         logM[curpos]=LOG0;
00280         logX[curpos]=LOG0;
00281         logY[curpos]=LOG0;
00282         logX2[curpos]=LOG0;
00283         logY2[curpos]=LOG0;
00284 
00285         /* Secondary loop to vary the position in aaY : j=1..nY */
00286         for (j=1; j<=nY; j++) {
00287 
00288             curpos=cur*cl+j;            /* index of the state (i,j) */
00289 
00290             /* Update for states which emit X only */
00291             /***************************************/
00292 
00293             frompos=old*cl+j;            /* index of the state (i-1,j) */
00294 
00295             /* State RX */
00296             logX[curpos]=LOGP(-m_opening+logM[frompos], -m_extension+logX[frompos]);
00297             /*      printf("%.5f\n",logX[curpos]);*/
00298             /*      printf("%.5f\n",logX_B[curpos]);*/
00299             /* State RX2 */
00300             logX2[curpos]=LOGP(logM[frompos], logX2[frompos]);
00301 
00302             /* Update for states which emit Y only */
00303             /***************************************/
00304 
00305             frompos=cur*cl+j-1;          /* index of the state (i,j-1) */
00306 
00307             /* State RY */
00308             aux=LOGP(-m_opening+logM[frompos], -m_extension+logY[frompos]);
00309             logY[curpos] = LOGP(aux, -m_opening+logX[frompos]);
00310 
00311             /* State RY2 */
00312             aux=LOGP(logM[frompos], logY2[frompos]);
00313             logY2[curpos]=LOGP(aux, logX2[frompos]);
00314 
00315             /* Update for states which emit X and Y */
00316             /****************************************/
00317 
00318             frompos=old*cl+j-1;          /* index of the state (i-1,j-1) */
00319 
00320             aux=LOGP(logX[frompos], logY[frompos]);
00321             aux2=LOGP(0, logM[frompos]);
00322             logM[curpos]=LOGP(aux, aux2)+scaled_blosum[BINDEX(aaX[i-1], aaY[j-1])];
00323 
00324             /*
00325             printf("i=%d , j=%d\nM=%.5f\nX=%.5f\nY=%.5f\nX2=%.5f\nY2=%.5f\n",i,j,logM[curpos],logX[curpos],logY[curpos],logX2[curpos],logY2[curpos]);
00326             */
00327 
00328         }  /* end of j=1:nY loop */
00329 
00330 
00331         /* Update the culumn order */
00332         cur=1-cur;
00333         old=1-old;
00334 
00335     }  /* end of j=1:nX loop */
00336 
00337 
00338     /* Termination */
00339     /***************/
00340 
00341     curpos=old*cl+nY;                /* index of the state (nX,nY) */
00342     aux=LOGP(logX2[curpos], logY2[curpos]);
00343     aux2=LOGP(0, logM[curpos]);
00344     /*  kernel_value = LOGP( aux , aux2 );*/
00345 
00346     /* Memory release */
00347     SG_FREE(logM);
00348     SG_FREE(logX);
00349     SG_FREE(logY);
00350     SG_FREE(logX2);
00351     SG_FREE(logY2);
00352 
00353     /* Return the logarithm of the kernel */
00354     return (float32_t)LOGP(aux,aux2)/INTSCALE;
00355 }
00356 
00357 /********************/
00358 /* Public functions */
00359 /********************/
00360 
00361 
00362 /* Return the log-probability of two sequences x and y under a pair HMM model */
00363 /* x and y are strings of aminoacid letters, e.g., "AABRS" */
00364 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y)
00365 {
00366     int32_t *aax, *aay;  /* to convert x and y into sequences of amino-acid indexes */
00367     int32_t lx=0, ly=0;       /* lengths of x and y */
00368     int32_t i, j;
00369 
00370     bool free_x, free_y;
00371     char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x);
00372     char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y);
00373     ASSERT(x && y);
00374 
00375     if ( (lx<1) || (ly<1) )
00376         SG_ERROR("empty chain");
00377 
00378     /* Create aax and aay */
00379 
00380     aax = SG_CALLOC(int32_t, lx);
00381     aay = SG_CALLOC(int32_t, ly);
00382 
00383     /* Extract the characters corresponding to aminoacids and keep their indexes */
00384 
00385     j=0;
00386     for (i=0; i<lx; i++)
00387         if (isAA[toupper(x[i])])
00388             aax[j++]=aaIndex[toupper(x[i])-'A'];
00389     lx=j;
00390     j=0;
00391     for (i=0; i<ly; i++)
00392         if (isAA[toupper(y[i])])
00393             aay[j++]=aaIndex[toupper(y[i])-'A'];
00394     ly=j;
00395 
00396 
00397     /* Compute the pair HMM score */
00398     float64_t result=LAkernelcompute(aax, aay, lx, ly);
00399 
00400     /* Release memory */
00401     SG_FREE(aax);
00402     SG_FREE(aay);
00403 
00404     ((CStringFeatures<char>*)lhs)->free_feature_vector(x, idx_x, free_x);
00405     ((CStringFeatures<char>*)rhs)->free_feature_vector(y, idx_y, free_y);
00406 
00407     return result;
00408 }
00409 
00410 void CLocalAlignmentStringKernel::init()
00411 {
00412     initialized=false;
00413     isAA=NULL;
00414     aaIndex=NULL;
00415 
00416     m_opening=10;
00417     m_extension=2;
00418 
00419     scaled_blosum=SG_CALLOC(int32_t, sizeof(blosum));
00420     init_logsum();
00421 
00422     SG_ADD(&initialized, "initialized", "If kernel is initalized.",
00423         MS_NOT_AVAILABLE);
00424     SG_ADD(&m_opening, "opening", "Opening gap opening penalty.", MS_AVAILABLE);
00425     SG_ADD(&m_extension, "extension", "Extension gap extension penalty.",
00426         MS_AVAILABLE);
00427 }