00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdlib.h>
00023 #include <stdio.h>
00024 #include <math.h>
00025 #include <ctype.h>
00026 #include <string.h>
00027 #include <shogun/kernel/LocalAlignmentStringKernel.h>
00028
00029 using namespace shogun;
00030
00031
00032
00033
00034
00035 #define NAA 20
00036 #define NLET 26
00037 const char* CLocalAlignmentStringKernel::aaList= "ARNDCQEGHILKMFPSTWYV";
00038
00039
00040
00041
00042
00043
00044 const int32_t CLocalAlignmentStringKernel::blosum[] = {
00045 6,
00046 -2, 8,
00047 -2, -1, 9,
00048 -3, -2, 2, 9,
00049 -1, -5, -4, -5, 13,
00050 -1, 1, 0, 0, -4, 8,
00051 -1, 0, 0, 2, -5, 3, 7,
00052 0, -3, -1, -2, -4, -3, -3, 8,
00053 -2, 0, 1, -2, -4, 1, 0, -3, 11,
00054 -2, -5, -5, -5, -2, -4, -5, -6, -5, 6,
00055 -2, -3, -5, -5, -2, -3, -4, -5, -4, 2, 6,
00056 -1, 3, 0, -1, -5, 2, 1, -2, -1, -4, -4, 7,
00057 -1, -2, -3, -5, -2, -1, -3, -4, -2, 2, 3, -2, 8,
00058 -3, -4, -5, -5, -4, -5, -5, -5, -2, 0, 1, -5, 0, 9,
00059 -1, -3, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -4, -5, 11,
00060 2, -1, 1, 0, -1, 0, 0, 0, -1, -4, -4, 0, -2, -4, -1, 6,
00061 0, -2, 0, -2, -1, -1, -1, -2, -3, -1, -2, -1, -1, -3, -2, 2, 7,
00062 -4, -4, -6, -6, -3, -3, -4, -4, -4, -4, -2, -4, -2, 1, -6, -4, -4, 16,
00063 -3, -3, -3, -5, -4, -2, -3, -5, 3, -2, -2, -3, -1, 4, -4, -3, -2, 3, 10,
00064 0, -4, -4, -5, -1, -3, -4, -5, -5, 4, 1, -3, 1, -1, -4, -2, 0, -4, -2, 6};
00065
00066
00067 #define BINDEX(i,j) (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2))
00068
00069
00070
00071
00072
00073 #define SCALING 0.1
00074
00075
00076
00077
00078
00079
00080
00081 #define LOGP(x,y) LogSum(x,y)
00082
00083
00084
00085
00086
00087
00088
00089 #define LOG0 -10000
00090 #define INTSCALE 1000.0
00091
00092 int32_t CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL];
00093
00094 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(int32_t size)
00095 : CStringKernel<char>(size)
00096 {
00097 init();
00098 init_static_variables();
00099 }
00100
00101 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(
00102 CStringFeatures<char>* l, CStringFeatures<char>* r,
00103 float64_t opening, float64_t extension)
00104 : CStringKernel<char>()
00105 {
00106 init();
00107 m_opening=opening;
00108 m_extension=extension;
00109 init_static_variables();
00110 init(l, r);
00111 }
00112
00113 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel()
00114 {
00115 cleanup();
00116 }
00117
00118 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
00119 {
00120 CStringKernel<char>::init(l, r);
00121 initialized = true;
00122 return init_normalizer();
00123 }
00124
00125 void CLocalAlignmentStringKernel::cleanup()
00126 {
00127 SG_FREE(scaled_blosum);
00128 scaled_blosum=NULL;
00129
00130 SG_FREE(isAA);
00131 isAA=NULL;
00132 SG_FREE(aaIndex);
00133 aaIndex=NULL;
00134
00135 CKernel::cleanup();
00136 }
00137
00138
00139
00140
00141 void CLocalAlignmentStringKernel::init_logsum(void){
00142 int32_t i;
00143 for (i = 0; i < LOGSUM_TBL; i++)
00144 logsum_lookup[i] = (int32_t) (INTSCALE*
00145 (log(1.+exp( (float32_t) -i/INTSCALE))));
00146 }
00147
00148 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2)
00149 {
00150 int32_t diff;
00151 static int32_t firsttime=1;
00152
00153 if (firsttime)
00154 {
00155 init_logsum();
00156 firsttime =0;
00157 }
00158
00159 diff=p1-p2;
00160 if (diff>=LOGSUM_TBL) return p1;
00161 else if (diff<=-LOGSUM_TBL) return p2;
00162 else if (diff>0) return p1+logsum_lookup[diff];
00163 else return p2+logsum_lookup[-diff];
00164 }
00165
00166
00167 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2)
00168 {
00169 if (p1 > p2)
00170 return (p1-p2 > 50.) ? p1 : p1 + log(1. + exp(p2-p1));
00171 else
00172 return (p2-p1 > 50.) ? p2 : p2 + log(1. + exp(p1-p2));
00173 }
00174
00175
00176 void CLocalAlignmentStringKernel::init_static_variables()
00177
00178 {
00179 register int32_t i;
00180
00181
00182 if ((aaIndex=(int32_t *)calloc(NLET,sizeof(int32_t))) == NULL)
00183 SG_ERROR("run out o memory");
00184 for (i=0;i<NAA;i++)
00185 aaIndex[aaList[i]-'A']=i;
00186
00187
00188 if ((isAA=(int32_t *)calloc(256,sizeof(int32_t))) == NULL)
00189 SG_ERROR("run out of memory");
00190 for (i=0;i<NAA;i++)
00191 isAA[(int32_t)aaList[i]]=1;
00192
00193
00194 for (i=0 ; i<NAA*(NAA+1)/2; i++)
00195 scaled_blosum[i] = (int32_t) floor(blosum[i]*SCALING*INTSCALE);
00196
00197
00198
00199 m_opening = (int32_t) floor(m_opening * SCALING*INTSCALE);
00200 m_extension = (int32_t) floor(m_extension * SCALING*INTSCALE);
00201 }
00202
00203
00204
00205
00206
00207
00208 float64_t CLocalAlignmentStringKernel::LAkernelcompute(
00209 int32_t* aaX, int32_t* aaY,
00210 int32_t nX, int32_t nY )
00211 {
00212 register int32_t
00213 i,j,
00214 cur, old,
00215 curpos, frompos;
00216
00217 int32_t
00218 *logX,
00219 *logY,
00220 *logM,
00221 *logX2,
00222 *logY2,
00223
00224 aux , aux2;
00225 int32_t
00226 cl;
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238 cl = nY+1;
00239
00240 logM=SG_MALLOC(int32_t, 2*cl);
00241 logX=SG_MALLOC(int32_t, 2*cl);
00242 logY=SG_MALLOC(int32_t, 2*cl);
00243 logX2=SG_MALLOC(int32_t, 2*cl);
00244 logY2=SG_MALLOC(int32_t, 2*cl);
00245
00246
00247
00248
00249
00250
00251 for (j=0;j<cl;j++) {
00252 logM[j]=LOG0;
00253 logX[j]=LOG0;
00254 logY[j]=LOG0;
00255 logX2[j]=LOG0;
00256 logY2[j]=LOG0;
00257
00258 }
00259
00260
00261 cur = 1;
00262 old = 0;
00263
00264
00265
00266
00267
00268
00269
00270 for (i=1;i<=nX;i++) {
00271
00272
00273 curpos = cur*cl;
00274 logM[curpos] = LOG0;
00275 logX[curpos] = LOG0;
00276 logY[curpos] = LOG0;
00277 logX2[curpos] = LOG0;
00278 logY2[curpos] = LOG0;
00279
00280
00281 for (j=1;j<=nY;j++) {
00282
00283 curpos = cur*cl + j;
00284
00285
00286
00287
00288 frompos = old*cl + j;
00289
00290
00291 logX[curpos] = LOGP( - m_opening + logM[frompos] , - m_extension + logX[frompos] );
00292
00293
00294
00295 logX2[curpos] = LOGP( logM[frompos] , logX2[frompos] );
00296
00297
00298
00299
00300 frompos = cur*cl + j-1;
00301
00302
00303 aux = LOGP( - m_opening + logM[frompos] , - m_extension + logY[frompos] );
00304 logY[curpos] = LOGP( aux , - m_opening + logX[frompos] );
00305
00306
00307 aux = LOGP( logM[frompos] , logY2[frompos] );
00308 logY2[curpos] = LOGP( aux , logX2[frompos] );
00309
00310
00311
00312
00313 frompos = old*cl + j-1;
00314
00315 aux = LOGP( logX[frompos] , logY[frompos] );
00316 aux2 = LOGP( 0 , logM[frompos] );
00317 logM[curpos] = LOGP( aux , aux2 ) + scaled_blosum[ BINDEX( aaX[i-1] , aaY[j-1] ) ];
00318
00319
00320
00321
00322 }
00323
00324
00325
00326 cur = 1-cur;
00327 old = 1-old;
00328
00329 }
00330
00331
00332
00333
00334
00335 curpos = old*cl + nY;
00336 aux = LOGP( logX2[curpos] , logY2[curpos] );
00337 aux2 = LOGP( 0 , logM[curpos] );
00338
00339
00340
00341 SG_FREE(logM);
00342 SG_FREE(logX);
00343 SG_FREE(logY);
00344 SG_FREE(logX2);
00345 SG_FREE(logY2);
00346
00347
00348 return (float32_t)LOGP(aux,aux2)/INTSCALE;
00349 }
00350
00351
00352
00353
00354
00355
00356
00357
00358 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y)
00359 {
00360 int32_t *aax,*aay;
00361 int32_t lx=0,ly=0;
00362 int32_t i,j;
00363
00364 bool free_x, free_y;
00365 char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x);
00366 char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y);
00367 ASSERT(x && y);
00368
00369 if ((lx<1) || (ly<1))
00370 SG_ERROR("empty chain");
00371
00372
00373
00374 if ((aax=(int32_t *)calloc(lx,sizeof(int32_t))) == NULL)
00375 SG_ERROR("run out of memory");
00376 if ((aay=(int32_t *)calloc(ly,sizeof(int32_t))) == NULL)
00377 SG_ERROR("run out of memory");
00378
00379
00380
00381 j=0;
00382 for (i=0 ; i<lx ; i++)
00383 if (isAA[toupper(x[i])])
00384 aax[j++] = aaIndex[toupper(x[i])-'A'];
00385 lx = j;
00386 j=0;
00387 for (i=0 ; i<ly ; i++)
00388 if (isAA[toupper(y[i])])
00389 aay[j++] = aaIndex[toupper(y[i])-'A'];
00390 ly = j;
00391
00392
00393
00394 float64_t result=LAkernelcompute(aax,aay,lx,ly);
00395
00396
00397 SG_FREE(aax);
00398 SG_FREE(aay);
00399
00400 ((CStringFeatures<char>*) lhs)->free_feature_vector(x, idx_x, free_x);
00401 ((CStringFeatures<char>*) rhs)->free_feature_vector(y, idx_y, free_y);
00402
00403 return result;
00404 }
00405
00406 void CLocalAlignmentStringKernel::init()
00407 {
00408 initialized = false;
00409 isAA = NULL;
00410 aaIndex = NULL;
00411
00412 m_opening = 10;
00413 m_extension = 2;
00414
00415 scaled_blosum=SG_MALLOC(int32_t, sizeof(blosum));
00416 init_logsum();
00417
00418 m_parameters->add(&initialized, "initialized", "if kernel is initalized");
00419 m_parameters->add(&m_opening, "opening", "opening gap opening penalty");
00420 m_parameters->add(&m_extension, "extension", "extension gap extension penalty");
00421 }