00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include <shogun/lib/common.h>
00016 #include <shogun/io/SGIO.h>
00017 #include <shogun/mathematics/Math.h>
00018 #include <shogun/kernel/SparseSpatialSampleStringKernel.h>
00019 #include <shogun/features/StringFeatures.h>
00020
00021 using namespace shogun;
00022
00023 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel()
00024 : CStringKernel<char>(0), t(2), d(5)
00025 {
00026 }
00027
00028 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel(CStringFeatures<char>* l,
00029 CStringFeatures<char>* r) : CStringKernel<char>(0), t(2), d(5)
00030 {
00031 init(l, r);
00032 }
00033
00034 bool CSparseSpatialSampleStringKernel::init(CFeatures* l, CFeatures* r)
00035 {
00036 CStringKernel<char>::init(l, r);
00037 return init_normalizer();
00038 }
00039
00040 void CSparseSpatialSampleStringKernel::cleanup()
00041 {
00042 CKernel::cleanup();
00043 }
00044
00045 CSparseSpatialSampleStringKernel::~CSparseSpatialSampleStringKernel()
00046 {
00047 }
00048
00049 SSKFeatures *CSparseSpatialSampleStringKernel::extractTriple(int **S, int *len, int nStr, int d1, int d2)
00050 {
00051 int i, j;
00052 int n, nfeat;
00053 int *group;
00054 int *features;
00055 int *s;
00056 int c;
00057 SSKFeatures *F;
00058
00059 nfeat = 0;
00060 for (i = 0; i < nStr; ++i)
00061 nfeat += len[i] - d1 -d2;
00062 group = SG_MALLOC(int, nfeat);
00063 features = SG_MALLOC(int, nfeat*3);
00064 c = 0;
00065 for (i = 0; i < nStr; ++i)
00066 {
00067 n = len[i] - d1 - d2;
00068 s = S[i];
00069 for (j = 0; j < n; ++j)
00070 {
00071 features[c] = s[j];
00072 features[c+nfeat] = s[j+d1];
00073 features[c+2*nfeat] = s[j+d1+d2];
00074 group[c] = i;
00075 c++;
00076 }
00077 }
00078 ASSERT(nfeat==c);
00079 F = SG_MALLOC(SSKFeatures, 1);
00080 (*F).features = features;
00081 (*F).group = group;
00082 (*F).n = nfeat;
00083 return F;
00084 }
00085
00086
00087 SSKFeatures *CSparseSpatialSampleStringKernel::extractDouble(int **S, int *len, int nStr, int d1)
00088 {
00089 int i, j;
00090 int n, nfeat;
00091 int *group;
00092 int *features;
00093 int *s;
00094 int c;
00095 SSKFeatures *F;
00096
00097 nfeat = 0;
00098 for (i = 0; i < nStr; ++i)
00099 nfeat += len[i] - d1;
00100 group = SG_MALLOC(int, nfeat);
00101 features = SG_MALLOC(int, nfeat*2);
00102 c = 0;
00103 for (i = 0; i < nStr; ++i)
00104 {
00105 n = len[i] - d1;
00106 s = S[i];
00107 for (j = 0; j < n; ++j)
00108 {
00109 features[c] = s[j];
00110 features[c+nfeat] = s[j+d1];
00111 group[c] = i;
00112 c++;
00113 }
00114 }
00115 if (nfeat!=c)
00116 printf("Something is wrong...\n");
00117 F = SG_MALLOC(SSKFeatures, 1);
00118 (*F).features = features;
00119 (*F).group = group;
00120 (*F).n = nfeat;
00121 return F;
00122 }
00123
00124
00125 void CSparseSpatialSampleStringKernel::compute_double(int32_t idx_a, int32_t idx_b)
00126 {
00127 int d1;
00128 SSKFeatures *features;
00129 int *sortIdx;
00130 int *features_srt;
00131 int *group_srt;
00132 int maxIdx;
00133 int **S=NULL;
00134 int *len=NULL;
00135 int nStr=0, nfeat;
00136 int i;
00137 int na=0;
00138 int *K=NULL;
00139
00140 for (d1 = 1; d1 <= d; ++d1)
00141 {
00142 if ( isVerbose ) printf("Extracting features..."), fflush( stdout );
00143 features = extractDouble(S,len,nStr,d1);
00144 nfeat = (*features).n;
00145 printf("d=%d: %d features\n", d1, nfeat);
00146 maxIdx = 0;
00147 for (i = 0; i < nfeat*2; ++i)
00148 maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i];
00149 if (na < maxIdx+1)
00150 {
00151 printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na);
00152 printf("\tUsing [0,%d] instead\n", maxIdx);
00153 na = maxIdx+1;
00154 }
00155 if (isVerbose)
00156 {
00157 printf("done.\n");
00158 printf("Sorting...");
00159 fflush( stdout );
00160 }
00161 sortIdx = cntsrtna((*features).features,2,(*features).n,na);
00162 if (isVerbose) printf("done.\n");
00163 features_srt = SG_MALLOC(int, nfeat*2);
00164 group_srt = SG_MALLOC(int, nfeat);
00165 for (i = 0; i < nfeat; ++i)
00166 {
00167 features_srt[i]=(*features).features[sortIdx[i]];
00168 features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat];
00169 group_srt[i] = (*features).group[sortIdx[i]];
00170 }
00171 SG_FREE(sortIdx);
00172 SG_FREE((*features).features);
00173 SG_FREE((*features).group);
00174 SG_FREE(features);
00175 if (isVerbose)
00176 {
00177 printf("Counting...");
00178 fflush( stdout );
00179 }
00180 countAndUpdate(K,features_srt,group_srt,2,nfeat,nStr);
00181 SG_FREE(features_srt);
00182 SG_FREE(group_srt);
00183 if (isVerbose)
00184 {
00185 printf("done.");
00186 }
00187 }
00188 }
00189
00190 void CSparseSpatialSampleStringKernel::compute_triple(int32_t idx_a, int32_t idx_b)
00191 {
00192 int d1, d2;
00193 SSKFeatures *features;
00194 int *sortIdx;
00195 int *features_srt;
00196 int *group_srt;
00197 int maxIdx;
00198 int **S=NULL;
00199 int *len=NULL;
00200 int nStr=0, nfeat;
00201 int i;
00202 int na=0;
00203 int *K=NULL;
00204
00205 for (d1 = 1; d1 <= d; ++d1)
00206 {
00207 for (d2 = 1; d2 <= d; ++d2)
00208 {
00209 if (isVerbose)
00210 {
00211 printf("Extracting features...");
00212 fflush( stdout );
00213 }
00214 features = extractTriple(S,len,nStr,d1,d2);
00215 nfeat = (*features).n;
00216 printf("(%d,%d): %d features\n", d1, d2, nfeat);
00217 maxIdx = 0;
00218 for (i = 0; i < nfeat*3; ++i)
00219 maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i];
00220 if (na < maxIdx+1)
00221 {
00222 printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na);
00223 printf("\tUsing [0,%d] instead\n", maxIdx);
00224 na = maxIdx+1;
00225 }
00226 if (isVerbose)
00227 {
00228 printf("done.\n");
00229 printf("Sorting...");
00230 fflush( stdout );
00231 }
00232 sortIdx = cntsrtna((*features).features,3,(*features).n,na);
00233 if (isVerbose)
00234 {
00235 printf("done.\n");
00236 }
00237 features_srt = SG_MALLOC(int, nfeat*3);
00238 group_srt = SG_MALLOC(int, nfeat);
00239 for (i = 0; i < nfeat; ++i)
00240 {
00241 features_srt[i]=(*features).features[sortIdx[i]];
00242 features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat];
00243 features_srt[i+2*nfeat]=(*features).features[sortIdx[i]+2*nfeat];
00244 group_srt[i] = (*features).group[sortIdx[i]];
00245 }
00246 SG_FREE(sortIdx);
00247 SG_FREE((*features).features);
00248 SG_FREE((*features).group);
00249 SG_FREE(features);
00250 if (isVerbose)
00251 {
00252 printf("Counting...");
00253 fflush( stdout );
00254 }
00255 countAndUpdate(K,features_srt,group_srt,3,nfeat,nStr);
00256 SG_FREE(features_srt);
00257 SG_FREE(group_srt);
00258 if (isVerbose)
00259 {
00260 printf("done.\n");
00261 }
00262 }
00263 }
00264 }
00265
00266 void CSparseSpatialSampleStringKernel::countAndUpdate(int *outK, int *sx, int *g, int k, int r, int nStr)
00267 {
00268 char same;
00269 int i, j;
00270 int cu;
00271 long int ucnt;
00272 long int startInd, endInd, j1;
00273 int *curfeat, *ucnts, *updind;
00274
00275 curfeat = SG_MALLOC(int, k);
00276 ucnts = SG_MALLOC(int, nStr);
00277 updind = SG_MALLOC(int, nStr);
00278 i = 0;
00279 ucnt = 0;
00280 while (i<r)
00281 {
00282 for (j = 0; j < k; ++j)
00283 curfeat[j]=sx[i+j*r];
00284 same=1;
00285 for (j = 0;j < k; ++j)
00286 if (curfeat[j]!=sx[i+j*r])
00287 {
00288 same=0;
00289 break;
00290 }
00291 startInd=i;
00292 while (same && i<r)
00293 {
00294 i++;
00295 if (i >= r) break;
00296 same = 1;
00297 for (j = 0; j < k; ++j)
00298 if (curfeat[j]!=sx[i+j*r])
00299 {
00300 same=0;
00301 break;
00302 }
00303 }
00304 endInd= (i<r) ? (i-1):(r-1);
00305 ucnt++;
00306 if ((long int)endInd-startInd+1>2*nStr)
00307 {
00308 for (j = 0; j < nStr; ++j) ucnts[j]=0;
00309 for (j = startInd;j <= endInd; ++j) ucnts[g[j]]++;
00310 cu=0;
00311 for (j=0;j<nStr;j++)
00312 {
00313 if (ucnts[j]>0)
00314 {
00315 updind[cu] = j;
00316 cu++;
00317 }
00318 }
00319 for (j=0;j<cu;j++)
00320 for (j1=0;j1<cu;j1++)
00321 outK[updind[j]+updind[j1]*nStr]+=ucnts[updind[j]]*ucnts[updind[j1]];
00322 }
00323 else
00324 {
00325 for (j = startInd;j <= endInd; ++j)
00326 for (j1 = startInd;j1 <= endInd; ++j1)
00327 outK[ g[j]+nStr*g[j1] ]++;
00328 }
00329 }
00330 SG_FREE(updind);
00331 SG_FREE(ucnts);
00332 SG_FREE(curfeat);
00333 }
00334
00335 int *CSparseSpatialSampleStringKernel::cntsrtna(int *sx, int k, int r, int na)
00336 {
00337 int *sxc, *bc, *sxl, *cc, *regroup;
00338 int i, j;
00339
00340 sxc = SG_MALLOC(int, na);
00341 bc = SG_MALLOC(int, na);
00342 sxl = SG_MALLOC(int, r);
00343 cc = SG_MALLOC(int, r);
00344 regroup = SG_MALLOC(int, r);
00345
00346 for (i = 0; i < r; ++i)
00347 regroup[i]=i;
00348 for (j = k-1; j >= 0; --j)
00349 {
00350 for(i = 0; i < na; ++i)
00351 sxc[i]=0;
00352 for (i = 0; i < r; ++i)
00353 {
00354 cc[i]=sx[regroup[i]+j*r];
00355 sxc[ cc[i] ]++;
00356 }
00357 bc[0]=0;
00358 for (i = 1;i < na; ++i)
00359 bc[i]=bc[i-1]+sxc[i-1];
00360 for (i = 0; i < r; ++i)
00361 sxl[bc[ cc[i] ]++] = regroup[i];
00362 for (i = 0; i < r; ++i)
00363 regroup[i] = sxl[i];
00364 }
00365 SG_FREE(sxl); SG_FREE(bc); SG_FREE(sxc); SG_FREE(cc);
00366
00367 return regroup;
00368 }
00369
00370 float64_t CSparseSpatialSampleStringKernel::compute(int32_t idx_a, int32_t idx_b)
00371 {
00372 if (t==2)
00373 compute_double(idx_a, idx_b);
00374 if (t==3)
00375 compute_triple(idx_a, idx_b);
00376
00377 SG_ERROR("t out of range - shouldn't happen\n");
00378 return -1;
00379 }