SparseSpatialSampleStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2010 Soeren Sonnenburg
00008  * Copyright (C) 2010 Berlin Institute of Technology
00009  *
00010  * Based on code from Pavel Kuksa <pkuksa@cs.rutgers.edu> and
00011  * Vladimir Pavlovic <vladimir@cs.rutgers.edu> originally
00012  * released under the new BSD License.
00013  */
00014 
00015 #include "lib/common.h"
00016 #include "lib/io.h"
00017 #include "lib/Mathematics.h"
00018 #include "kernel/SparseSpatialSampleStringKernel.h"
00019 #include "features/StringFeatures.h"
00020 
00021 using namespace shogun;
00022 
00023 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel()
00024 : CStringKernel<char>(0), t(2), d(5)
00025 {
00026 }
00027 
00028 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel(CStringFeatures<char>* l,
00029         CStringFeatures<char>* r) : CStringKernel<char>(0), t(2), d(5)
00030 {
00031     init(l, r);
00032 }
00033 
00034 bool CSparseSpatialSampleStringKernel::init(CFeatures* l, CFeatures* r)
00035 {
00036     CStringKernel<char>::init(l, r);
00037     return init_normalizer();
00038 }
00039 
00040 void CSparseSpatialSampleStringKernel::cleanup()
00041 {
00042     CKernel::cleanup();
00043 }
00044 
00045 CSparseSpatialSampleStringKernel::~CSparseSpatialSampleStringKernel()
00046 {
00047 }
00048 
00049 SSKFeatures *CSparseSpatialSampleStringKernel::extractTriple(int **S, int *len, int nStr, int d1, int d2)
00050 {
00051     int i, j;
00052     int n, nfeat;
00053     int *group;
00054     int *features;
00055     int *s;
00056     int c;
00057     SSKFeatures *F;
00058 
00059     nfeat = 0;
00060     for (i = 0; i < nStr; ++i)
00061         nfeat += len[i] - d1 -d2;
00062     group = (int *)malloc(nfeat*sizeof(int));
00063     features = (int *)malloc(nfeat*3*sizeof(int *));
00064     c = 0;
00065     for (i = 0; i < nStr; ++i)
00066     {
00067         n = len[i] - d1 - d2;
00068         s = S[i];
00069         for (j = 0; j < n; ++j)
00070         {
00071             features[c] = s[j];
00072             features[c+nfeat] = s[j+d1];
00073             features[c+2*nfeat] = s[j+d1+d2];
00074             group[c] = i;
00075             c++;
00076         }
00077     }
00078     ASSERT(nfeat==c);
00079     F = (SSKFeatures *)malloc(sizeof(SSKFeatures));
00080     (*F).features = features;
00081     (*F).group = group;
00082     (*F).n = nfeat;
00083     return F;
00084 }
00085 
00086 
00087 SSKFeatures *CSparseSpatialSampleStringKernel::extractDouble(int **S, int *len, int nStr, int d1)
00088 {
00089     int i, j;
00090     int n, nfeat;
00091     int *group;
00092     int *features;
00093     int *s;
00094     int c;
00095     SSKFeatures *F;
00096 
00097     nfeat = 0;
00098     for (i = 0; i < nStr; ++i)
00099         nfeat += len[i] - d1;
00100     group = (int *)malloc(nfeat*sizeof(int));
00101     features = (int *)malloc(nfeat*2*sizeof(int *));
00102     c = 0;
00103     for (i = 0; i < nStr; ++i)
00104     {
00105         n = len[i] - d1;
00106         s = S[i];
00107         for (j = 0; j < n; ++j)
00108         {
00109             features[c] = s[j];
00110             features[c+nfeat] = s[j+d1];
00111             group[c] = i;
00112             c++;
00113         }
00114     }
00115     if (nfeat!=c)
00116         printf("Something is wrong...\n");
00117     F = (SSKFeatures *)malloc(sizeof(SSKFeatures));
00118     (*F).features = features;
00119     (*F).group = group;
00120     (*F).n = nfeat;
00121     return F;
00122 }
00123 
00124 
00125 void CSparseSpatialSampleStringKernel::compute_double(int32_t idx_a, int32_t idx_b)
00126 {
00127     int d1, d2;
00128     SSKFeatures *features;
00129     int *sortIdx;
00130     int *features_srt;
00131     int *group_srt;
00132     int maxIdx;
00133     char *kernelfilename;
00134     int **S;
00135     int *len;
00136     int nStr, nfeat;
00137     int i;
00138     int na;
00139     int *K;
00140 
00141     for (d1 = 1; d1 <= d; ++d1)
00142     {
00143         if ( isVerbose ) printf("Extracting features..."), fflush( stdout );
00144         features = extractDouble(S,len,nStr,d1);
00145         nfeat = (*features).n;
00146         printf("d=%d: %d features\n", d1, nfeat);
00147         maxIdx = 0;
00148         for (i = 0; i < nfeat*2; ++i)
00149             maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i];
00150         if (na < maxIdx+1)
00151         {
00152             printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na);
00153             printf("\tUsing [0,%d] instead\n", maxIdx);
00154             na = maxIdx+1;
00155         }
00156         if (isVerbose)
00157         {
00158             printf("done.\n");
00159             printf("Sorting...");
00160             fflush( stdout );
00161         }
00162         sortIdx = cntsrtna((*features).features,2,(*features).n,na);
00163         if (isVerbose)  printf("done.\n");
00164         features_srt = (int *)malloc(nfeat*2*sizeof(int *));
00165         group_srt = (int *)malloc(nfeat*sizeof(int));
00166         for (i = 0; i < nfeat; ++i)
00167         {
00168             features_srt[i]=(*features).features[sortIdx[i]];
00169             features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat];
00170             group_srt[i] = (*features).group[sortIdx[i]];
00171         }
00172         free(sortIdx);
00173         free((*features).features);
00174         free((*features).group);
00175         free(features);
00176         if (isVerbose)
00177         {
00178             printf("Counting...");
00179             fflush( stdout );
00180         }
00181         countAndUpdate(K,features_srt,group_srt,2,nfeat,nStr);
00182         free(features_srt);
00183         free(group_srt);
00184         if (isVerbose)
00185         {
00186             printf("done.");
00187         }
00188     }
00189 }
00190 
00191 void CSparseSpatialSampleStringKernel::compute_triple(int32_t idx_a, int32_t idx_b)
00192 {
00193     int d1, d2;
00194     SSKFeatures *features;
00195     int *sortIdx;
00196     int *features_srt;
00197     int *group_srt;
00198     int maxIdx;
00199     char *kernelfilename;
00200     int **S;
00201     int *len;
00202     int nStr, nfeat;
00203     int i;
00204     int na;
00205     int *K;
00206 
00207     for (d1 = 1; d1 <= d; ++d1)
00208     {
00209         for (d2 = 1; d2 <= d; ++d2)
00210         {
00211             if (isVerbose)
00212             {
00213                 printf("Extracting features...");
00214                 fflush( stdout );
00215             }
00216             features = extractTriple(S,len,nStr,d1,d2);
00217             nfeat = (*features).n;
00218             printf("(%d,%d): %d features\n", d1, d2, nfeat);
00219             maxIdx = 0;
00220             for (i = 0; i < nfeat*3; ++i)
00221                 maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i];
00222             if (na < maxIdx+1)
00223             {
00224                 printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na);
00225                 printf("\tUsing [0,%d] instead\n", maxIdx);
00226                 na = maxIdx+1;
00227             }
00228             if (isVerbose)
00229             {
00230                 printf("done.\n");
00231                 printf("Sorting...");
00232                 fflush( stdout );
00233             }
00234             sortIdx = cntsrtna((*features).features,3,(*features).n,na);
00235             if (isVerbose)
00236             {
00237                 printf("done.\n");
00238             }
00239             features_srt = (int *)malloc(nfeat*3*sizeof(int *));
00240             group_srt = (int *)malloc(nfeat*sizeof(int));
00241             for (i = 0; i < nfeat; ++i)
00242             {
00243                 features_srt[i]=(*features).features[sortIdx[i]];
00244                 features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat];
00245                 features_srt[i+2*nfeat]=(*features).features[sortIdx[i]+2*nfeat];
00246                 group_srt[i] = (*features).group[sortIdx[i]];
00247             }
00248             free(sortIdx);
00249             free((*features).features);
00250             free((*features).group);
00251             free(features);
00252             if (isVerbose)
00253             {
00254                 printf("Counting...");
00255                 fflush( stdout );
00256             }
00257             countAndUpdate(K,features_srt,group_srt,3,nfeat,nStr);
00258             free(features_srt);
00259             free(group_srt);
00260             if (isVerbose)
00261             {
00262                 printf("done.\n");
00263             }
00264         }
00265     }
00266 }
00267 
00268 void CSparseSpatialSampleStringKernel::countAndUpdate(int *outK, int *sx, int *g, int k, int r, int nStr)
00269 {  
00270     char same;
00271     int i, j; 
00272     int cu;
00273     long int ucnt;
00274     long int startInd, endInd, j1;
00275     int *curfeat, *ucnts, *updind;
00276 
00277     curfeat = (int *)malloc(k*sizeof(int));
00278     ucnts= (int *)malloc(nStr*sizeof(int));
00279     updind = (int *)malloc(nStr*sizeof(int));
00280     i = 0;
00281     ucnt = 0;
00282     while (i<r)
00283     {  
00284         for (j = 0; j < k; ++j)
00285             curfeat[j]=sx[i+j*r];
00286         same=1;
00287         for (j = 0;j < k; ++j)
00288             if (curfeat[j]!=sx[i+j*r])
00289             { 
00290                 same=0;
00291                 break;
00292             }   
00293         startInd=i;
00294         while (same && i<r)
00295         {
00296             i++;
00297             if (i >= r) break;
00298             same = 1;
00299             for (j = 0; j < k; ++j)
00300                 if (curfeat[j]!=sx[i+j*r])
00301                 {
00302                     same=0;
00303                     break;
00304                 }
00305         }
00306         endInd= (i<r) ? (i-1):(r-1);
00307         ucnt++;
00308         if ((long int)endInd-startInd+1>2*nStr)
00309         {
00310             for (j = 0; j < nStr; ++j) ucnts[j]=0;
00311             for (j = startInd;j <= endInd; ++j)  ucnts[g[j]]++;
00312             cu=0;
00313             for (j=0;j<nStr;j++)
00314             {
00315                 if (ucnts[j]>0)
00316                 {
00317                     updind[cu] = j;
00318                     cu++;
00319                 }
00320             }
00321             for (j=0;j<cu;j++)
00322                 for (j1=0;j1<cu;j1++)
00323                     outK[updind[j]+updind[j1]*nStr]+=ucnts[updind[j]]*ucnts[updind[j1]];
00324         }
00325         else
00326         {
00327             for (j = startInd;j <= endInd; ++j)
00328                 for (j1 = startInd;j1 <= endInd; ++j1)
00329                     outK[ g[j]+nStr*g[j1] ]++;
00330         }
00331     }
00332     free(updind);
00333     free(ucnts);
00334     free(curfeat);
00335 }
00336 
00337 int *CSparseSpatialSampleStringKernel::cntsrtna(int *sx, int k, int r, int na)
00338 {      
00339     int *sxc, *bc, *sxl, *cc, *regroup;
00340     int i, j;
00341 
00342     sxc = (int *)malloc(na*sizeof(int));
00343     bc  = (int *)malloc(na*sizeof(int));
00344     sxl = (int *)malloc(r*sizeof(int));
00345     cc  = (int *)malloc(r*sizeof(int));
00346     regroup = (int *)malloc(r*sizeof(int));
00347 
00348     for (i = 0; i < r; ++i)
00349         regroup[i]=i;
00350     for (j = k-1; j >= 0; --j)
00351     { 
00352         for(i = 0; i < na; ++i)
00353             sxc[i]=0;
00354         for (i = 0; i < r; ++i)
00355         {
00356             cc[i]=sx[regroup[i]+j*r];
00357             sxc[ cc[i] ]++;
00358         }
00359         bc[0]=0;
00360         for (i = 1;i < na; ++i)
00361             bc[i]=bc[i-1]+sxc[i-1];
00362         for (i = 0; i < r; ++i)
00363             sxl[bc[ cc[i] ]++] = regroup[i];
00364         for (i = 0; i < r; ++i)
00365             regroup[i] = sxl[i];
00366     }
00367     free(sxl); free(bc); free(sxc); free(cc);
00368 
00369     return regroup;
00370 }
00371 
00372 float64_t CSparseSpatialSampleStringKernel::compute(int32_t idx_a, int32_t idx_b)
00373 {
00374     if (t==2)
00375         compute_double(idx_a, idx_b);
00376     if (t==3)
00377         compute_triple(idx_a, idx_b);
00378 
00379     SG_ERROR("t out of range - shouldn't happen\n");
00380     return -1;
00381 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation