00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include "lib/common.h"
00016 #include "lib/io.h"
00017 #include "lib/Mathematics.h"
00018 #include "kernel/SparseSpatialSampleStringKernel.h"
00019 #include "features/StringFeatures.h"
00020
00021 using namespace shogun;
00022
00023 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel()
00024 : CStringKernel<char>(0), t(2), d(5)
00025 {
00026 }
00027
00028 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel(CStringFeatures<char>* l,
00029 CStringFeatures<char>* r) : CStringKernel<char>(0), t(2), d(5)
00030 {
00031 init(l, r);
00032 }
00033
00034 bool CSparseSpatialSampleStringKernel::init(CFeatures* l, CFeatures* r)
00035 {
00036 CStringKernel<char>::init(l, r);
00037 return init_normalizer();
00038 }
00039
00040 void CSparseSpatialSampleStringKernel::cleanup()
00041 {
00042 CKernel::cleanup();
00043 }
00044
00045 CSparseSpatialSampleStringKernel::~CSparseSpatialSampleStringKernel()
00046 {
00047 }
00048
00049 SSKFeatures *CSparseSpatialSampleStringKernel::extractTriple(int **S, int *len, int nStr, int d1, int d2)
00050 {
00051 int i, j;
00052 int n, nfeat;
00053 int *group;
00054 int *features;
00055 int *s;
00056 int c;
00057 SSKFeatures *F;
00058
00059 nfeat = 0;
00060 for (i = 0; i < nStr; ++i)
00061 nfeat += len[i] - d1 -d2;
00062 group = (int *)malloc(nfeat*sizeof(int));
00063 features = (int *)malloc(nfeat*3*sizeof(int *));
00064 c = 0;
00065 for (i = 0; i < nStr; ++i)
00066 {
00067 n = len[i] - d1 - d2;
00068 s = S[i];
00069 for (j = 0; j < n; ++j)
00070 {
00071 features[c] = s[j];
00072 features[c+nfeat] = s[j+d1];
00073 features[c+2*nfeat] = s[j+d1+d2];
00074 group[c] = i;
00075 c++;
00076 }
00077 }
00078 ASSERT(nfeat==c);
00079 F = (SSKFeatures *)malloc(sizeof(SSKFeatures));
00080 (*F).features = features;
00081 (*F).group = group;
00082 (*F).n = nfeat;
00083 return F;
00084 }
00085
00086
00087 SSKFeatures *CSparseSpatialSampleStringKernel::extractDouble(int **S, int *len, int nStr, int d1)
00088 {
00089 int i, j;
00090 int n, nfeat;
00091 int *group;
00092 int *features;
00093 int *s;
00094 int c;
00095 SSKFeatures *F;
00096
00097 nfeat = 0;
00098 for (i = 0; i < nStr; ++i)
00099 nfeat += len[i] - d1;
00100 group = (int *)malloc(nfeat*sizeof(int));
00101 features = (int *)malloc(nfeat*2*sizeof(int *));
00102 c = 0;
00103 for (i = 0; i < nStr; ++i)
00104 {
00105 n = len[i] - d1;
00106 s = S[i];
00107 for (j = 0; j < n; ++j)
00108 {
00109 features[c] = s[j];
00110 features[c+nfeat] = s[j+d1];
00111 group[c] = i;
00112 c++;
00113 }
00114 }
00115 if (nfeat!=c)
00116 printf("Something is wrong...\n");
00117 F = (SSKFeatures *)malloc(sizeof(SSKFeatures));
00118 (*F).features = features;
00119 (*F).group = group;
00120 (*F).n = nfeat;
00121 return F;
00122 }
00123
00124
00125 void CSparseSpatialSampleStringKernel::compute_double(int32_t idx_a, int32_t idx_b)
00126 {
00127 int d1, d2;
00128 SSKFeatures *features;
00129 int *sortIdx;
00130 int *features_srt;
00131 int *group_srt;
00132 int maxIdx;
00133 char *kernelfilename;
00134 int **S;
00135 int *len;
00136 int nStr, nfeat;
00137 int i;
00138 int na;
00139 int *K;
00140
00141 for (d1 = 1; d1 <= d; ++d1)
00142 {
00143 if ( isVerbose ) printf("Extracting features..."), fflush( stdout );
00144 features = extractDouble(S,len,nStr,d1);
00145 nfeat = (*features).n;
00146 printf("d=%d: %d features\n", d1, nfeat);
00147 maxIdx = 0;
00148 for (i = 0; i < nfeat*2; ++i)
00149 maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i];
00150 if (na < maxIdx+1)
00151 {
00152 printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na);
00153 printf("\tUsing [0,%d] instead\n", maxIdx);
00154 na = maxIdx+1;
00155 }
00156 if (isVerbose)
00157 {
00158 printf("done.\n");
00159 printf("Sorting...");
00160 fflush( stdout );
00161 }
00162 sortIdx = cntsrtna((*features).features,2,(*features).n,na);
00163 if (isVerbose) printf("done.\n");
00164 features_srt = (int *)malloc(nfeat*2*sizeof(int *));
00165 group_srt = (int *)malloc(nfeat*sizeof(int));
00166 for (i = 0; i < nfeat; ++i)
00167 {
00168 features_srt[i]=(*features).features[sortIdx[i]];
00169 features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat];
00170 group_srt[i] = (*features).group[sortIdx[i]];
00171 }
00172 free(sortIdx);
00173 free((*features).features);
00174 free((*features).group);
00175 free(features);
00176 if (isVerbose)
00177 {
00178 printf("Counting...");
00179 fflush( stdout );
00180 }
00181 countAndUpdate(K,features_srt,group_srt,2,nfeat,nStr);
00182 free(features_srt);
00183 free(group_srt);
00184 if (isVerbose)
00185 {
00186 printf("done.");
00187 }
00188 }
00189 }
00190
00191 void CSparseSpatialSampleStringKernel::compute_triple(int32_t idx_a, int32_t idx_b)
00192 {
00193 int d1, d2;
00194 SSKFeatures *features;
00195 int *sortIdx;
00196 int *features_srt;
00197 int *group_srt;
00198 int maxIdx;
00199 char *kernelfilename;
00200 int **S;
00201 int *len;
00202 int nStr, nfeat;
00203 int i;
00204 int na;
00205 int *K;
00206
00207 for (d1 = 1; d1 <= d; ++d1)
00208 {
00209 for (d2 = 1; d2 <= d; ++d2)
00210 {
00211 if (isVerbose)
00212 {
00213 printf("Extracting features...");
00214 fflush( stdout );
00215 }
00216 features = extractTriple(S,len,nStr,d1,d2);
00217 nfeat = (*features).n;
00218 printf("(%d,%d): %d features\n", d1, d2, nfeat);
00219 maxIdx = 0;
00220 for (i = 0; i < nfeat*3; ++i)
00221 maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i];
00222 if (na < maxIdx+1)
00223 {
00224 printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na);
00225 printf("\tUsing [0,%d] instead\n", maxIdx);
00226 na = maxIdx+1;
00227 }
00228 if (isVerbose)
00229 {
00230 printf("done.\n");
00231 printf("Sorting...");
00232 fflush( stdout );
00233 }
00234 sortIdx = cntsrtna((*features).features,3,(*features).n,na);
00235 if (isVerbose)
00236 {
00237 printf("done.\n");
00238 }
00239 features_srt = (int *)malloc(nfeat*3*sizeof(int *));
00240 group_srt = (int *)malloc(nfeat*sizeof(int));
00241 for (i = 0; i < nfeat; ++i)
00242 {
00243 features_srt[i]=(*features).features[sortIdx[i]];
00244 features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat];
00245 features_srt[i+2*nfeat]=(*features).features[sortIdx[i]+2*nfeat];
00246 group_srt[i] = (*features).group[sortIdx[i]];
00247 }
00248 free(sortIdx);
00249 free((*features).features);
00250 free((*features).group);
00251 free(features);
00252 if (isVerbose)
00253 {
00254 printf("Counting...");
00255 fflush( stdout );
00256 }
00257 countAndUpdate(K,features_srt,group_srt,3,nfeat,nStr);
00258 free(features_srt);
00259 free(group_srt);
00260 if (isVerbose)
00261 {
00262 printf("done.\n");
00263 }
00264 }
00265 }
00266 }
00267
00268 void CSparseSpatialSampleStringKernel::countAndUpdate(int *outK, int *sx, int *g, int k, int r, int nStr)
00269 {
00270 char same;
00271 int i, j;
00272 int cu;
00273 long int ucnt;
00274 long int startInd, endInd, j1;
00275 int *curfeat, *ucnts, *updind;
00276
00277 curfeat = (int *)malloc(k*sizeof(int));
00278 ucnts= (int *)malloc(nStr*sizeof(int));
00279 updind = (int *)malloc(nStr*sizeof(int));
00280 i = 0;
00281 ucnt = 0;
00282 while (i<r)
00283 {
00284 for (j = 0; j < k; ++j)
00285 curfeat[j]=sx[i+j*r];
00286 same=1;
00287 for (j = 0;j < k; ++j)
00288 if (curfeat[j]!=sx[i+j*r])
00289 {
00290 same=0;
00291 break;
00292 }
00293 startInd=i;
00294 while (same && i<r)
00295 {
00296 i++;
00297 if (i >= r) break;
00298 same = 1;
00299 for (j = 0; j < k; ++j)
00300 if (curfeat[j]!=sx[i+j*r])
00301 {
00302 same=0;
00303 break;
00304 }
00305 }
00306 endInd= (i<r) ? (i-1):(r-1);
00307 ucnt++;
00308 if ((long int)endInd-startInd+1>2*nStr)
00309 {
00310 for (j = 0; j < nStr; ++j) ucnts[j]=0;
00311 for (j = startInd;j <= endInd; ++j) ucnts[g[j]]++;
00312 cu=0;
00313 for (j=0;j<nStr;j++)
00314 {
00315 if (ucnts[j]>0)
00316 {
00317 updind[cu] = j;
00318 cu++;
00319 }
00320 }
00321 for (j=0;j<cu;j++)
00322 for (j1=0;j1<cu;j1++)
00323 outK[updind[j]+updind[j1]*nStr]+=ucnts[updind[j]]*ucnts[updind[j1]];
00324 }
00325 else
00326 {
00327 for (j = startInd;j <= endInd; ++j)
00328 for (j1 = startInd;j1 <= endInd; ++j1)
00329 outK[ g[j]+nStr*g[j1] ]++;
00330 }
00331 }
00332 free(updind);
00333 free(ucnts);
00334 free(curfeat);
00335 }
00336
00337 int *CSparseSpatialSampleStringKernel::cntsrtna(int *sx, int k, int r, int na)
00338 {
00339 int *sxc, *bc, *sxl, *cc, *regroup;
00340 int i, j;
00341
00342 sxc = (int *)malloc(na*sizeof(int));
00343 bc = (int *)malloc(na*sizeof(int));
00344 sxl = (int *)malloc(r*sizeof(int));
00345 cc = (int *)malloc(r*sizeof(int));
00346 regroup = (int *)malloc(r*sizeof(int));
00347
00348 for (i = 0; i < r; ++i)
00349 regroup[i]=i;
00350 for (j = k-1; j >= 0; --j)
00351 {
00352 for(i = 0; i < na; ++i)
00353 sxc[i]=0;
00354 for (i = 0; i < r; ++i)
00355 {
00356 cc[i]=sx[regroup[i]+j*r];
00357 sxc[ cc[i] ]++;
00358 }
00359 bc[0]=0;
00360 for (i = 1;i < na; ++i)
00361 bc[i]=bc[i-1]+sxc[i-1];
00362 for (i = 0; i < r; ++i)
00363 sxl[bc[ cc[i] ]++] = regroup[i];
00364 for (i = 0; i < r; ++i)
00365 regroup[i] = sxl[i];
00366 }
00367 free(sxl); free(bc); free(sxc); free(cc);
00368
00369 return regroup;
00370 }
00371
00372 float64_t CSparseSpatialSampleStringKernel::compute(int32_t idx_a, int32_t idx_b)
00373 {
00374 if (t==2)
00375 compute_double(idx_a, idx_b);
00376 if (t==3)
00377 compute_triple(idx_a, idx_b);
00378
00379 SG_ERROR("t out of range - shouldn't happen\n");
00380 return -1;
00381 }