SparseSpatialSampleStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2010 Soeren Sonnenburg
00008  * Copyright (C) 2010 Berlin Institute of Technology
00009  *
00010  * Based on code from Pavel Kuksa <pkuksa@cs.rutgers.edu> and
00011  * Vladimir Pavlovic <vladimir@cs.rutgers.edu> originally
00012  * released under the new BSD License.
00013  */
00014 
00015 #include <shogun/lib/common.h>
00016 #include <shogun/io/SGIO.h>
00017 #include <shogun/mathematics/Math.h>
00018 #include <shogun/kernel/SparseSpatialSampleStringKernel.h>
00019 #include <shogun/features/StringFeatures.h>
00020 
00021 using namespace shogun;
00022 
00023 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel()
00024 : CStringKernel<char>(0), t(2), d(5)
00025 {
00026 }
00027 
00028 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel(CStringFeatures<char>* l,
00029         CStringFeatures<char>* r) : CStringKernel<char>(0), t(2), d(5)
00030 {
00031     init(l, r);
00032 }
00033 
00034 bool CSparseSpatialSampleStringKernel::init(CFeatures* l, CFeatures* r)
00035 {
00036     CStringKernel<char>::init(l, r);
00037     return init_normalizer();
00038 }
00039 
00040 void CSparseSpatialSampleStringKernel::cleanup()
00041 {
00042     CKernel::cleanup();
00043 }
00044 
00045 CSparseSpatialSampleStringKernel::~CSparseSpatialSampleStringKernel()
00046 {
00047 }
00048 
00049 SSKFeatures *CSparseSpatialSampleStringKernel::extractTriple(int **S, int *len, int nStr, int d1, int d2)
00050 {
00051     int i, j;
00052     int n, nfeat;
00053     int *group;
00054     int *features;
00055     int *s;
00056     int c;
00057     SSKFeatures *F;
00058 
00059     nfeat = 0;
00060     for (i = 0; i < nStr; ++i)
00061         nfeat += len[i] - d1 -d2;
00062     group = SG_MALLOC(int, nfeat);
00063     features = SG_MALLOC(int, nfeat*3);
00064     c = 0;
00065     for (i = 0; i < nStr; ++i)
00066     {
00067         n = len[i] - d1 - d2;
00068         s = S[i];
00069         for (j = 0; j < n; ++j)
00070         {
00071             features[c] = s[j];
00072             features[c+nfeat] = s[j+d1];
00073             features[c+2*nfeat] = s[j+d1+d2];
00074             group[c] = i;
00075             c++;
00076         }
00077     }
00078     ASSERT(nfeat==c);
00079     F = SG_MALLOC(SSKFeatures, 1);
00080     (*F).features = features;
00081     (*F).group = group;
00082     (*F).n = nfeat;
00083     return F;
00084 }
00085 
00086 
00087 SSKFeatures *CSparseSpatialSampleStringKernel::extractDouble(int **S, int *len, int nStr, int d1)
00088 {
00089     int i, j;
00090     int n, nfeat;
00091     int *group;
00092     int *features;
00093     int *s;
00094     int c;
00095     SSKFeatures *F;
00096 
00097     nfeat = 0;
00098     for (i = 0; i < nStr; ++i)
00099         nfeat += len[i] - d1;
00100     group = SG_MALLOC(int, nfeat);
00101     features = SG_MALLOC(int, nfeat*2);
00102     c = 0;
00103     for (i = 0; i < nStr; ++i)
00104     {
00105         n = len[i] - d1;
00106         s = S[i];
00107         for (j = 0; j < n; ++j)
00108         {
00109             features[c] = s[j];
00110             features[c+nfeat] = s[j+d1];
00111             group[c] = i;
00112             c++;
00113         }
00114     }
00115     if (nfeat!=c)
00116         printf("Something is wrong...\n");
00117     F = SG_MALLOC(SSKFeatures, 1);
00118     (*F).features = features;
00119     (*F).group = group;
00120     (*F).n = nfeat;
00121     return F;
00122 }
00123 
00124 
00125 void CSparseSpatialSampleStringKernel::compute_double(int32_t idx_a, int32_t idx_b)
00126 {
00127     int d1;
00128     SSKFeatures *features;
00129     int *sortIdx;
00130     int *features_srt;
00131     int *group_srt;
00132     int maxIdx;
00133     int **S=NULL;
00134     int *len=NULL;
00135     int nStr=0, nfeat;
00136     int i;
00137     int na=0;
00138     int *K=NULL;
00139 
00140     for (d1 = 1; d1 <= d; ++d1)
00141     {
00142         if ( isVerbose ) printf("Extracting features..."), fflush( stdout );
00143         features = extractDouble(S,len,nStr,d1);
00144         nfeat = (*features).n;
00145         printf("d=%d: %d features\n", d1, nfeat);
00146         maxIdx = 0;
00147         for (i = 0; i < nfeat*2; ++i)
00148             maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i];
00149         if (na < maxIdx+1)
00150         {
00151             printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na);
00152             printf("\tUsing [0,%d] instead\n", maxIdx);
00153             na = maxIdx+1;
00154         }
00155         if (isVerbose)
00156         {
00157             printf("done.\n");
00158             printf("Sorting...");
00159             fflush( stdout );
00160         }
00161         sortIdx = cntsrtna((*features).features,2,(*features).n,na);
00162         if (isVerbose)  printf("done.\n");
00163         features_srt = SG_MALLOC(int, nfeat*2);
00164         group_srt = SG_MALLOC(int, nfeat);
00165         for (i = 0; i < nfeat; ++i)
00166         {
00167             features_srt[i]=(*features).features[sortIdx[i]];
00168             features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat];
00169             group_srt[i] = (*features).group[sortIdx[i]];
00170         }
00171         SG_FREE(sortIdx);
00172         SG_FREE((*features).features);
00173         SG_FREE((*features).group);
00174         SG_FREE(features);
00175         if (isVerbose)
00176         {
00177             printf("Counting...");
00178             fflush( stdout );
00179         }
00180         countAndUpdate(K,features_srt,group_srt,2,nfeat,nStr);
00181         SG_FREE(features_srt);
00182         SG_FREE(group_srt);
00183         if (isVerbose)
00184         {
00185             printf("done.");
00186         }
00187     }
00188 }
00189 
00190 void CSparseSpatialSampleStringKernel::compute_triple(int32_t idx_a, int32_t idx_b)
00191 {
00192     int d1, d2;
00193     SSKFeatures *features;
00194     int *sortIdx;
00195     int *features_srt;
00196     int *group_srt;
00197     int maxIdx;
00198     int **S=NULL;
00199     int *len=NULL;
00200     int nStr=0, nfeat;
00201     int i;
00202     int na=0;
00203     int *K=NULL;
00204 
00205     for (d1 = 1; d1 <= d; ++d1)
00206     {
00207         for (d2 = 1; d2 <= d; ++d2)
00208         {
00209             if (isVerbose)
00210             {
00211                 printf("Extracting features...");
00212                 fflush( stdout );
00213             }
00214             features = extractTriple(S,len,nStr,d1,d2);
00215             nfeat = (*features).n;
00216             printf("(%d,%d): %d features\n", d1, d2, nfeat);
00217             maxIdx = 0;
00218             for (i = 0; i < nfeat*3; ++i)
00219                 maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i];
00220             if (na < maxIdx+1)
00221             {
00222                 printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na);
00223                 printf("\tUsing [0,%d] instead\n", maxIdx);
00224                 na = maxIdx+1;
00225             }
00226             if (isVerbose)
00227             {
00228                 printf("done.\n");
00229                 printf("Sorting...");
00230                 fflush( stdout );
00231             }
00232             sortIdx = cntsrtna((*features).features,3,(*features).n,na);
00233             if (isVerbose)
00234             {
00235                 printf("done.\n");
00236             }
00237             features_srt = SG_MALLOC(int, nfeat*3);
00238             group_srt = SG_MALLOC(int, nfeat);
00239             for (i = 0; i < nfeat; ++i)
00240             {
00241                 features_srt[i]=(*features).features[sortIdx[i]];
00242                 features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat];
00243                 features_srt[i+2*nfeat]=(*features).features[sortIdx[i]+2*nfeat];
00244                 group_srt[i] = (*features).group[sortIdx[i]];
00245             }
00246             SG_FREE(sortIdx);
00247             SG_FREE((*features).features);
00248             SG_FREE((*features).group);
00249             SG_FREE(features);
00250             if (isVerbose)
00251             {
00252                 printf("Counting...");
00253                 fflush( stdout );
00254             }
00255             countAndUpdate(K,features_srt,group_srt,3,nfeat,nStr);
00256             SG_FREE(features_srt);
00257             SG_FREE(group_srt);
00258             if (isVerbose)
00259             {
00260                 printf("done.\n");
00261             }
00262         }
00263     }
00264 }
00265 
00266 void CSparseSpatialSampleStringKernel::countAndUpdate(int *outK, int *sx, int *g, int k, int r, int nStr)
00267 {  
00268     char same;
00269     int i, j; 
00270     int cu;
00271     long int ucnt;
00272     long int startInd, endInd, j1;
00273     int *curfeat, *ucnts, *updind;
00274 
00275     curfeat = SG_MALLOC(int, k);
00276     ucnts = SG_MALLOC(int, nStr);
00277     updind = SG_MALLOC(int, nStr);
00278     i = 0;
00279     ucnt = 0;
00280     while (i<r)
00281     {  
00282         for (j = 0; j < k; ++j)
00283             curfeat[j]=sx[i+j*r];
00284         same=1;
00285         for (j = 0;j < k; ++j)
00286             if (curfeat[j]!=sx[i+j*r])
00287             { 
00288                 same=0;
00289                 break;
00290             }   
00291         startInd=i;
00292         while (same && i<r)
00293         {
00294             i++;
00295             if (i >= r) break;
00296             same = 1;
00297             for (j = 0; j < k; ++j)
00298                 if (curfeat[j]!=sx[i+j*r])
00299                 {
00300                     same=0;
00301                     break;
00302                 }
00303         }
00304         endInd= (i<r) ? (i-1):(r-1);
00305         ucnt++;
00306         if ((long int)endInd-startInd+1>2*nStr)
00307         {
00308             for (j = 0; j < nStr; ++j) ucnts[j]=0;
00309             for (j = startInd;j <= endInd; ++j)  ucnts[g[j]]++;
00310             cu=0;
00311             for (j=0;j<nStr;j++)
00312             {
00313                 if (ucnts[j]>0)
00314                 {
00315                     updind[cu] = j;
00316                     cu++;
00317                 }
00318             }
00319             for (j=0;j<cu;j++)
00320                 for (j1=0;j1<cu;j1++)
00321                     outK[updind[j]+updind[j1]*nStr]+=ucnts[updind[j]]*ucnts[updind[j1]];
00322         }
00323         else
00324         {
00325             for (j = startInd;j <= endInd; ++j)
00326                 for (j1 = startInd;j1 <= endInd; ++j1)
00327                     outK[ g[j]+nStr*g[j1] ]++;
00328         }
00329     }
00330     SG_FREE(updind);
00331     SG_FREE(ucnts);
00332     SG_FREE(curfeat);
00333 }
00334 
00335 int *CSparseSpatialSampleStringKernel::cntsrtna(int *sx, int k, int r, int na)
00336 {      
00337     int *sxc, *bc, *sxl, *cc, *regroup;
00338     int i, j;
00339 
00340     sxc = SG_MALLOC(int, na);
00341     bc  = SG_MALLOC(int, na);
00342     sxl = SG_MALLOC(int, r);
00343     cc  = SG_MALLOC(int, r);
00344     regroup = SG_MALLOC(int, r);
00345 
00346     for (i = 0; i < r; ++i)
00347         regroup[i]=i;
00348     for (j = k-1; j >= 0; --j)
00349     { 
00350         for(i = 0; i < na; ++i)
00351             sxc[i]=0;
00352         for (i = 0; i < r; ++i)
00353         {
00354             cc[i]=sx[regroup[i]+j*r];
00355             sxc[ cc[i] ]++;
00356         }
00357         bc[0]=0;
00358         for (i = 1;i < na; ++i)
00359             bc[i]=bc[i-1]+sxc[i-1];
00360         for (i = 0; i < r; ++i)
00361             sxl[bc[ cc[i] ]++] = regroup[i];
00362         for (i = 0; i < r; ++i)
00363             regroup[i] = sxl[i];
00364     }
00365     SG_FREE(sxl); SG_FREE(bc); SG_FREE(sxc); SG_FREE(cc);
00366 
00367     return regroup;
00368 }
00369 
00370 float64_t CSparseSpatialSampleStringKernel::compute(int32_t idx_a, int32_t idx_b)
00371 {
00372     if (t==2)
00373         compute_double(idx_a, idx_b);
00374     if (t==3)
00375         compute_triple(idx_a, idx_b);
00376 
00377     SG_ERROR("t out of range - shouldn't happen\n");
00378     return -1;
00379 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation