00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdlib.h>
00023 #include <stdio.h>
00024 #include <math.h>
00025 #include <ctype.h>
00026 #include <string.h>
00027 #include <shogun/kernel/string/LocalAlignmentStringKernel.h>
00028
00029 using namespace shogun;
00030
00031
00032
00033
00034
00035 const int32_t NAA=20;
00036 const int32_t NLET=26;
00037 const char* CLocalAlignmentStringKernel::aaList="ARNDCQEGHILKMFPSTWYV";
00038
00039
00040
00041
00042
00043
00044 const int32_t CLocalAlignmentStringKernel::blosum[] = {
00045 6,
00046 -2, 8,
00047 -2, -1, 9,
00048 -3, -2, 2, 9,
00049 -1, -5, -4, -5, 13,
00050 -1, 1, 0, 0, -4, 8,
00051 -1, 0, 0, 2, -5, 3, 7,
00052 0, -3, -1, -2, -4, -3, -3, 8,
00053 -2, 0, 1, -2, -4, 1, 0, -3, 11,
00054 -2, -5, -5, -5, -2, -4, -5, -6, -5, 6,
00055 -2, -3, -5, -5, -2, -3, -4, -5, -4, 2, 6,
00056 -1, 3, 0, -1, -5, 2, 1, -2, -1, -4, -4, 7,
00057 -1, -2, -3, -5, -2, -1, -3, -4, -2, 2, 3, -2, 8,
00058 -3, -4, -5, -5, -4, -5, -5, -5, -2, 0, 1, -5, 0, 9,
00059 -1, -3, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -4, -5, 11,
00060 2, -1, 1, 0, -1, 0, 0, 0, -1, -4, -4, 0, -2, -4, -1, 6,
00061 0, -2, 0, -2, -1, -1, -1, -2, -3, -1, -2, -1, -1, -3, -2, 2, 7,
00062 -4, -4, -6, -6, -3, -3, -4, -4, -4, -4, -2, -4, -2, 1, -6, -4, -4, 16,
00063 -3, -3, -3, -5, -4, -2, -3, -5, 3, -2, -2, -3, -1, 4, -4, -3, -2, 3, 10,
00064 0, -4, -4, -5, -1, -3, -4, -5, -5, 4, 1, -3, 1, -1, -4, -2, 0, -4, -2, 6};
00065
00066
00067 static int32_t BINDEX(int32_t i, int32_t j)
00068 {
00069 return (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2));
00070 }
00071
00072
00073
00074
00075
00076 const float64_t SCALING=0.1;
00077
00078
00079
00080
00081
00082
00083
00084 #define LOGP(x,y) LogSum(x,y)
00085
00086
00087
00088
00089
00090
00091
00092 const float64_t LOG0=-10000;
00093 const float64_t INTSCALE=1000.0;
00094
00095 int32_t CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL];
00096
00097 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(int32_t size)
00098 : CStringKernel<char>(size)
00099 {
00100 SG_UNSTABLE("LocalAlignmentStringKernel");
00101 init();
00102 init_static_variables();
00103 }
00104
00105 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(
00106 CStringFeatures<char>* l, CStringFeatures<char>* r,
00107 float64_t opening, float64_t extension)
00108 : CStringKernel<char>()
00109 {
00110 SG_UNSTABLE("LocalAlignmentStringKernel");
00111 init();
00112 m_opening=opening;
00113 m_extension=extension;
00114 init_static_variables();
00115 init(l, r);
00116 }
00117
00118 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel()
00119 {
00120 cleanup();
00121 }
00122
00123 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
00124 {
00125 CStringKernel<char>::init(l, r);
00126 initialized=true;
00127 return init_normalizer();
00128 }
00129
00130 void CLocalAlignmentStringKernel::cleanup()
00131 {
00132 SG_FREE(scaled_blosum);
00133 scaled_blosum=NULL;
00134
00135 SG_FREE(isAA);
00136 isAA=NULL;
00137 SG_FREE(aaIndex);
00138 aaIndex=NULL;
00139
00140 CKernel::cleanup();
00141 }
00142
00143
00144
00145
00146 void CLocalAlignmentStringKernel::init_logsum(){
00147 int32_t i;
00148 for (i=0; i<LOGSUM_TBL; i++)
00149 logsum_lookup[i]=int32_t(INTSCALE*
00150 (log(1.+exp((float32_t)-i/INTSCALE))));
00151 }
00152
00153 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2)
00154 {
00155 int32_t diff;
00156 static int32_t firsttime=1;
00157
00158 if (firsttime)
00159 {
00160 init_logsum();
00161 firsttime =0;
00162 }
00163
00164 diff=p1-p2;
00165 if (diff>=LOGSUM_TBL)
00166 return p1;
00167 else if (diff<=-LOGSUM_TBL)
00168 return p2;
00169 else if (diff>0)
00170 return p1+logsum_lookup[diff];
00171 else
00172 return p2+logsum_lookup[-diff];
00173 }
00174
00175
00176 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2)
00177 {
00178 if (p1>p2)
00179 return (p1-p2>50.) ? p1 : p1+log(1.+exp(p2-p1));
00180 else
00181 return (p2-p1>50.) ? p2 : p2+log(1.+exp(p1-p2));
00182 }
00183
00184
00185 void CLocalAlignmentStringKernel::init_static_variables()
00186
00187 {
00188 register int32_t i;
00189
00190
00191 aaIndex = SG_CALLOC(int32_t, NLET);
00192 for (i=0;i<NAA;i++)
00193 aaIndex[aaList[i]-'A']=i;
00194
00195
00196 isAA = SG_CALLOC(int32_t, 256);
00197 for (i=0;i<NAA;i++)
00198 isAA[(int32_t)aaList[i]]=1;
00199
00200
00201 for (i=0 ; i<NAA*(NAA+1)/2; i++)
00202 scaled_blosum[i]=(int32_t)floor(blosum[i]*SCALING*INTSCALE);
00203
00204
00205
00206 m_opening=(int32_t)floor(m_opening*SCALING*INTSCALE);
00207 m_extension=(int32_t)floor(m_extension*SCALING*INTSCALE);
00208 }
00209
00210
00211
00212
00213
00214
00215 float64_t CLocalAlignmentStringKernel::LAkernelcompute(
00216 int32_t* aaX, int32_t* aaY,
00217 int32_t nX, int32_t nY )
00218 {
00219 register int32_t
00220 i,j,
00221 cur, old,
00222 curpos, frompos;
00223
00224 int32_t
00225 *logX,
00226 *logY,
00227 *logM,
00228 *logX2,
00229 *logY2;
00230
00231 int32_t aux, aux2;
00232 int32_t cl;
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244 cl=nY+1;
00245
00246 logM=SG_CALLOC(int32_t, 2*cl);
00247 logX=SG_CALLOC(int32_t, 2*cl);
00248 logY=SG_CALLOC(int32_t, 2*cl);
00249 logX2=SG_CALLOC(int32_t, 2*cl);
00250 logY2=SG_CALLOC(int32_t, 2*cl);
00251
00252
00253
00254
00255
00256
00257 for (j=0; j<cl; j++) {
00258 logM[j]=LOG0;
00259 logX[j]=LOG0;
00260 logY[j]=LOG0;
00261 logX2[j]=LOG0;
00262 logY2[j]=LOG0;
00263 }
00264
00265
00266 cur=1;
00267 old=0;
00268
00269
00270
00271
00272
00273
00274
00275 for (i=1; i<=nX; i++) {
00276
00277
00278 curpos=cur*cl;
00279 logM[curpos]=LOG0;
00280 logX[curpos]=LOG0;
00281 logY[curpos]=LOG0;
00282 logX2[curpos]=LOG0;
00283 logY2[curpos]=LOG0;
00284
00285
00286 for (j=1; j<=nY; j++) {
00287
00288 curpos=cur*cl+j;
00289
00290
00291
00292
00293 frompos=old*cl+j;
00294
00295
00296 logX[curpos]=LOGP(-m_opening+logM[frompos], -m_extension+logX[frompos]);
00297
00298
00299
00300 logX2[curpos]=LOGP(logM[frompos], logX2[frompos]);
00301
00302
00303
00304
00305 frompos=cur*cl+j-1;
00306
00307
00308 aux=LOGP(-m_opening+logM[frompos], -m_extension+logY[frompos]);
00309 logY[curpos] = LOGP(aux, -m_opening+logX[frompos]);
00310
00311
00312 aux=LOGP(logM[frompos], logY2[frompos]);
00313 logY2[curpos]=LOGP(aux, logX2[frompos]);
00314
00315
00316
00317
00318 frompos=old*cl+j-1;
00319
00320 aux=LOGP(logX[frompos], logY[frompos]);
00321 aux2=LOGP(0, logM[frompos]);
00322 logM[curpos]=LOGP(aux, aux2)+scaled_blosum[BINDEX(aaX[i-1], aaY[j-1])];
00323
00324
00325
00326
00327
00328 }
00329
00330
00331
00332 cur=1-cur;
00333 old=1-old;
00334
00335 }
00336
00337
00338
00339
00340
00341 curpos=old*cl+nY;
00342 aux=LOGP(logX2[curpos], logY2[curpos]);
00343 aux2=LOGP(0, logM[curpos]);
00344
00345
00346
00347 SG_FREE(logM);
00348 SG_FREE(logX);
00349 SG_FREE(logY);
00350 SG_FREE(logX2);
00351 SG_FREE(logY2);
00352
00353
00354 return (float32_t)LOGP(aux,aux2)/INTSCALE;
00355 }
00356
00357
00358
00359
00360
00361
00362
00363
00364 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y)
00365 {
00366 int32_t *aax, *aay;
00367 int32_t lx=0, ly=0;
00368 int32_t i, j;
00369
00370 bool free_x, free_y;
00371 char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x);
00372 char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y);
00373 ASSERT(x && y);
00374
00375 if ( (lx<1) || (ly<1) )
00376 SG_ERROR("empty chain");
00377
00378
00379
00380 aax = SG_CALLOC(int32_t, lx);
00381 aay = SG_CALLOC(int32_t, ly);
00382
00383
00384
00385 j=0;
00386 for (i=0; i<lx; i++)
00387 if (isAA[toupper(x[i])])
00388 aax[j++]=aaIndex[toupper(x[i])-'A'];
00389 lx=j;
00390 j=0;
00391 for (i=0; i<ly; i++)
00392 if (isAA[toupper(y[i])])
00393 aay[j++]=aaIndex[toupper(y[i])-'A'];
00394 ly=j;
00395
00396
00397
00398 float64_t result=LAkernelcompute(aax, aay, lx, ly);
00399
00400
00401 SG_FREE(aax);
00402 SG_FREE(aay);
00403
00404 ((CStringFeatures<char>*)lhs)->free_feature_vector(x, idx_x, free_x);
00405 ((CStringFeatures<char>*)rhs)->free_feature_vector(y, idx_y, free_y);
00406
00407 return result;
00408 }
00409
00410 void CLocalAlignmentStringKernel::init()
00411 {
00412 initialized=false;
00413 isAA=NULL;
00414 aaIndex=NULL;
00415
00416 m_opening=10;
00417 m_extension=2;
00418
00419 scaled_blosum=SG_CALLOC(int32_t, sizeof(blosum));
00420 init_logsum();
00421
00422 SG_ADD(&initialized, "initialized", "If kernel is initalized.",
00423 MS_NOT_AVAILABLE);
00424 SG_ADD(&m_opening, "opening", "Opening gap opening penalty.", MS_AVAILABLE);
00425 SG_ADD(&m_extension, "extension", "Extension gap extension penalty.",
00426 MS_AVAILABLE);
00427 }