SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
SparseSpatialSampleStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2010 Soeren Sonnenburg
8  * Copyright (C) 2010 Berlin Institute of Technology
9  *
10  * Based on code from Pavel Kuksa <pkuksa@cs.rutgers.edu> and
11  * Vladimir Pavlovic <vladimir@cs.rutgers.edu> originally
12  * released under the new BSD License.
13  */
14 
15 #include <shogun/lib/common.h>
16 #include <shogun/io/SGIO.h>
20 
21 using namespace shogun;
22 
24 : CStringKernel<char>(0), t(2), d(5)
25 {
26 }
27 
29  CStringFeatures<char>* r) : CStringKernel<char>(0), t(2), d(5)
30 {
31  init(l, r);
32 }
33 
34 bool CSparseSpatialSampleStringKernel::init(CFeatures* l, CFeatures* r)
35 {
37  return init_normalizer();
38 }
39 
41 {
43 }
44 
46 {
47 }
48 
49 SSKFeatures *CSparseSpatialSampleStringKernel::extractTriple(int **S, int *len, int nStr, int d1, int d2)
50 {
51  int i, j;
52  int n, nfeat;
53  int *group;
54  int *features;
55  int *s;
56  int c;
57  SSKFeatures *F;
58 
59  nfeat = 0;
60  for (i = 0; i < nStr; ++i)
61  nfeat += len[i] - d1 -d2;
62  group = SG_MALLOC(int, nfeat);
63  features = SG_MALLOC(int, nfeat*3);
64  c = 0;
65  for (i = 0; i < nStr; ++i)
66  {
67  n = len[i] - d1 - d2;
68  s = S[i];
69  for (j = 0; j < n; ++j)
70  {
71  features[c] = s[j];
72  features[c+nfeat] = s[j+d1];
73  features[c+2*nfeat] = s[j+d1+d2];
74  group[c] = i;
75  c++;
76  }
77  }
78  ASSERT(nfeat==c)
79  F = SG_MALLOC(SSKFeatures, 1);
80  (*F).features = features;
81  (*F).group = group;
82  (*F).n = nfeat;
83  return F;
84 }
85 
86 
87 SSKFeatures *CSparseSpatialSampleStringKernel::extractDouble(int **S, int *len, int nStr, int d1)
88 {
89  int i, j;
90  int n, nfeat;
91  int *group;
92  int *features;
93  int *s;
94  int c;
95  SSKFeatures *F;
96 
97  nfeat = 0;
98  for (i = 0; i < nStr; ++i)
99  nfeat += len[i] - d1;
100  group = SG_MALLOC(int, nfeat);
101  features = SG_MALLOC(int, nfeat*2);
102  c = 0;
103  for (i = 0; i < nStr; ++i)
104  {
105  n = len[i] - d1;
106  s = S[i];
107  for (j = 0; j < n; ++j)
108  {
109  features[c] = s[j];
110  features[c+nfeat] = s[j+d1];
111  group[c] = i;
112  c++;
113  }
114  }
115  if (nfeat!=c)
116  printf("Something is wrong...\n");
117  F = SG_MALLOC(SSKFeatures, 1);
118  (*F).features = features;
119  (*F).group = group;
120  (*F).n = nfeat;
121  return F;
122 }
123 
124 
125 void CSparseSpatialSampleStringKernel::compute_double(int32_t idx_a, int32_t idx_b)
126 {
127  int d1;
128  SSKFeatures *features;
129  int *sortIdx;
130  int *features_srt;
131  int *group_srt;
132  int maxIdx;
133  int **S=NULL;
134  int *len=NULL;
135  int nStr=0, nfeat;
136  int i;
137  int na=0;
138  int *K=NULL;
139 
140  for (d1 = 1; d1 <= d; ++d1)
141  {
142  if ( isVerbose ) printf("Extracting features..."), fflush( stdout );
143  features = extractDouble(S,len,nStr,d1);
144  nfeat = (*features).n;
145  printf("d=%d: %d features\n", d1, nfeat);
146  maxIdx = 0;
147  for (i = 0; i < nfeat*2; ++i)
148  maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i];
149  if (na < maxIdx+1)
150  {
151  printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na);
152  printf("\tUsing [0,%d] instead\n", maxIdx);
153  na = maxIdx+1;
154  }
155  if (isVerbose)
156  {
157  printf("done.\n");
158  printf("Sorting...");
159  fflush( stdout );
160  }
161  sortIdx = cntsrtna((*features).features,2,(*features).n,na);
162  if (isVerbose) printf("done.\n");
163  features_srt = SG_MALLOC(int, nfeat*2);
164  group_srt = SG_MALLOC(int, nfeat);
165  for (i = 0; i < nfeat; ++i)
166  {
167  features_srt[i]=(*features).features[sortIdx[i]];
168  features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat];
169  group_srt[i] = (*features).group[sortIdx[i]];
170  }
171  SG_FREE(sortIdx);
172  SG_FREE((*features).features);
173  SG_FREE((*features).group);
174  SG_FREE(features);
175  if (isVerbose)
176  {
177  printf("Counting...");
178  fflush( stdout );
179  }
180  countAndUpdate(K,features_srt,group_srt,2,nfeat,nStr);
181  SG_FREE(features_srt);
182  SG_FREE(group_srt);
183  if (isVerbose)
184  {
185  printf("done.");
186  }
187  }
188 }
189 
190 void CSparseSpatialSampleStringKernel::compute_triple(int32_t idx_a, int32_t idx_b)
191 {
192  int d1, d2;
193  SSKFeatures *features;
194  int *sortIdx;
195  int *features_srt;
196  int *group_srt;
197  int maxIdx;
198  int **S=NULL;
199  int *len=NULL;
200  int nStr=0, nfeat;
201  int i;
202  int na=0;
203  int *K=NULL;
204 
205  for (d1 = 1; d1 <= d; ++d1)
206  {
207  for (d2 = 1; d2 <= d; ++d2)
208  {
209  if (isVerbose)
210  {
211  printf("Extracting features...");
212  fflush( stdout );
213  }
214  features = extractTriple(S,len,nStr,d1,d2);
215  nfeat = (*features).n;
216  printf("(%d,%d): %d features\n", d1, d2, nfeat);
217  maxIdx = 0;
218  for (i = 0; i < nfeat*3; ++i)
219  maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i];
220  if (na < maxIdx+1)
221  {
222  printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na);
223  printf("\tUsing [0,%d] instead\n", maxIdx);
224  na = maxIdx+1;
225  }
226  if (isVerbose)
227  {
228  printf("done.\n");
229  printf("Sorting...");
230  fflush( stdout );
231  }
232  sortIdx = cntsrtna((*features).features,3,(*features).n,na);
233  if (isVerbose)
234  {
235  printf("done.\n");
236  }
237  features_srt = SG_MALLOC(int, nfeat*3);
238  group_srt = SG_MALLOC(int, nfeat);
239  for (i = 0; i < nfeat; ++i)
240  {
241  features_srt[i]=(*features).features[sortIdx[i]];
242  features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat];
243  features_srt[i+2*nfeat]=(*features).features[sortIdx[i]+2*nfeat];
244  group_srt[i] = (*features).group[sortIdx[i]];
245  }
246  SG_FREE(sortIdx);
247  SG_FREE((*features).features);
248  SG_FREE((*features).group);
249  SG_FREE(features);
250  if (isVerbose)
251  {
252  printf("Counting...");
253  fflush( stdout );
254  }
255  countAndUpdate(K,features_srt,group_srt,3,nfeat,nStr);
256  SG_FREE(features_srt);
257  SG_FREE(group_srt);
258  if (isVerbose)
259  {
260  printf("done.\n");
261  }
262  }
263  }
264 }
265 
266 void CSparseSpatialSampleStringKernel::countAndUpdate(int *outK, int *sx, int *g, int k, int r, int nStr)
267 {
268  char same;
269  int i, j;
270  int cu;
271  long int ucnt;
272  long int startInd, endInd, j1;
273  int *curfeat, *ucnts, *updind;
274 
275  curfeat = SG_MALLOC(int, k);
276  ucnts = SG_MALLOC(int, nStr);
277  updind = SG_MALLOC(int, nStr);
278  i = 0;
279  ucnt = 0;
280  while (i<r)
281  {
282  for (j = 0; j < k; ++j)
283  curfeat[j]=sx[i+j*r];
284  same=1;
285  for (j = 0;j < k; ++j)
286  if (curfeat[j]!=sx[i+j*r])
287  {
288  same=0;
289  break;
290  }
291  startInd=i;
292  while (same && i<r)
293  {
294  i++;
295  if (i >= r) break;
296  same = 1;
297  for (j = 0; j < k; ++j)
298  if (curfeat[j]!=sx[i+j*r])
299  {
300  same=0;
301  break;
302  }
303  }
304  endInd= (i<r) ? (i-1):(r-1);
305  ucnt++;
306  if ((long int)endInd-startInd+1>2*nStr)
307  {
308  for (j = 0; j < nStr; ++j) ucnts[j]=0;
309  for (j = startInd;j <= endInd; ++j) ucnts[g[j]]++;
310  cu=0;
311  for (j=0;j<nStr;j++)
312  {
313  if (ucnts[j]>0)
314  {
315  updind[cu] = j;
316  cu++;
317  }
318  }
319  for (j=0;j<cu;j++)
320  for (j1=0;j1<cu;j1++)
321  outK[updind[j]+updind[j1]*nStr]+=ucnts[updind[j]]*ucnts[updind[j1]];
322  }
323  else
324  {
325  for (j = startInd;j <= endInd; ++j)
326  for (j1 = startInd;j1 <= endInd; ++j1)
327  outK[ g[j]+nStr*g[j1] ]++;
328  }
329  }
330  SG_FREE(updind);
331  SG_FREE(ucnts);
332  SG_FREE(curfeat);
333 }
334 
335 int *CSparseSpatialSampleStringKernel::cntsrtna(int *sx, int k, int r, int na)
336 {
337  int *sxc, *bc, *sxl, *cc, *regroup;
338  int i, j;
339 
340  sxc = SG_MALLOC(int, na);
341  bc = SG_MALLOC(int, na);
342  sxl = SG_MALLOC(int, r);
343  cc = SG_MALLOC(int, r);
344  regroup = SG_MALLOC(int, r);
345 
346  for (i = 0; i < r; ++i)
347  regroup[i]=i;
348  for (j = k-1; j >= 0; --j)
349  {
350  for(i = 0; i < na; ++i)
351  sxc[i]=0;
352  for (i = 0; i < r; ++i)
353  {
354  cc[i]=sx[regroup[i]+j*r];
355  sxc[ cc[i] ]++;
356  }
357  bc[0]=0;
358  for (i = 1;i < na; ++i)
359  bc[i]=bc[i-1]+sxc[i-1];
360  for (i = 0; i < r; ++i)
361  sxl[bc[ cc[i] ]++] = regroup[i];
362  for (i = 0; i < r; ++i)
363  regroup[i] = sxl[i];
364  }
365  SG_FREE(sxl); SG_FREE(bc); SG_FREE(sxc); SG_FREE(cc);
366 
367  return regroup;
368 }
369 
371 {
372  if (t==2)
373  compute_double(idx_a, idx_b);
374  if (t==3)
375  compute_triple(idx_a, idx_b);
376 
377  SG_ERROR("t out of range - shouldn't happen\n")
378  return -1;
379 }

SHOGUN Machine Learning Toolbox - Documentation