SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CommWordStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
11 #include <shogun/lib/common.h>
12 #include <shogun/io/SGIO.h>
13 
14 #include <shogun/base/Parameter.h>
15 
19 
20 using namespace shogun;
21 
23 : CStringKernel<uint16_t>()
24 {
25  init();
26 }
27 
29 : CStringKernel<uint16_t>(size)
30 {
31  init();
32  use_sign=s;
33 }
34 
37  bool s, int32_t size) : CStringKernel<uint16_t>(size)
38 {
39  init();
40  use_sign=s;
41 
42  init(l,r);
43 }
44 
45 
47 {
48  dictionary_size= size;
49  SG_FREE(dictionary_weights);
50  dictionary_weights=SG_MALLOC(float64_t, size);
51  SG_DEBUG("using dictionary of %d words\n", size)
52  clear_normal();
53 
54  return dictionary_weights!=NULL;
55 }
56 
58 {
59  cleanup();
60 
61  SG_FREE(dictionary_weights);
63 }
64 
65 bool CCommWordStringKernel::init(CFeatures* l, CFeatures* r)
66 {
68 
70  {
72  dict_diagonal_optimization=SG_MALLOC(int32_t, int32_t(((CStringFeatures<uint16_t>*)l)->get_num_symbols()));
73  ASSERT(((CStringFeatures<uint16_t>*)l)->get_num_symbols() == ((CStringFeatures<uint16_t>*)r)->get_num_symbols())
74  }
75 
76  return init_normalizer();
77 }
78 
80 {
83 }
84 
86 {
87  int32_t alen;
90 
91  bool free_av;
92  uint16_t* av=l->get_feature_vector(idx_a, alen, free_av);
93 
94  float64_t result=0.0 ;
95  ASSERT(l==r)
96  ASSERT(sizeof(uint16_t)<=sizeof(float64_t))
97  ASSERT((1<<(sizeof(uint16_t)*8)) > alen)
98 
99  int32_t num_symbols=(int32_t) l->get_num_symbols();
100  ASSERT(num_symbols<=dictionary_size)
101 
102  int32_t* dic = dict_diagonal_optimization;
103  memset(dic, 0, num_symbols*sizeof(int32_t));
104 
105  for (int32_t i=0; i<alen; i++)
106  dic[av[i]]++;
107 
108  if (use_sign)
109  {
110  for (int32_t i=0; i<(int32_t) l->get_num_symbols(); i++)
111  {
112  if (dic[i]!=0)
113  result++;
114  }
115  }
116  else
117  {
118  for (int32_t i=0; i<num_symbols; i++)
119  {
120  if (dic[i]!=0)
121  result+=dic[i]*dic[i];
122  }
123  }
124  l->free_feature_vector(av, idx_a, free_av);
125 
126  return result;
127 }
128 
130  int32_t idx_a, int32_t idx_b, bool do_sort)
131 {
132  int32_t alen, blen;
133  bool free_av, free_bv;
134 
137 
138  uint16_t* av=l->get_feature_vector(idx_a, alen, free_av);
139  uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bv);
140 
141  uint16_t* avec=av;
142  uint16_t* bvec=bv;
143 
144  if (do_sort)
145  {
146  if (alen>0)
147  {
148  avec=SG_MALLOC(uint16_t, alen);
149  memcpy(avec, av, sizeof(uint16_t)*alen);
150  CMath::radix_sort(avec, alen);
151  }
152  else
153  avec=NULL;
154 
155  if (blen>0)
156  {
157  bvec=SG_MALLOC(uint16_t, blen);
158  memcpy(bvec, bv, sizeof(uint16_t)*blen);
159  CMath::radix_sort(bvec, blen);
160  }
161  else
162  bvec=NULL;
163  }
164  else
165  {
166  if ( (l->get_num_preprocessors() != l->get_num_preprocessed()) ||
168  {
169  SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
170  " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preprocessors(),
172  }
173  }
174 
175  float64_t result=0;
176 
177  int32_t left_idx=0;
178  int32_t right_idx=0;
179 
180  if (use_sign)
181  {
182  while (left_idx < alen && right_idx < blen)
183  {
184  if (avec[left_idx]==bvec[right_idx])
185  {
186  uint16_t sym=avec[left_idx];
187 
188  while (left_idx< alen && avec[left_idx]==sym)
189  left_idx++;
190 
191  while (right_idx< blen && bvec[right_idx]==sym)
192  right_idx++;
193 
194  result++;
195  }
196  else if (avec[left_idx]<bvec[right_idx])
197  left_idx++;
198  else
199  right_idx++;
200  }
201  }
202  else
203  {
204  while (left_idx < alen && right_idx < blen)
205  {
206  if (avec[left_idx]==bvec[right_idx])
207  {
208  int32_t old_left_idx=left_idx;
209  int32_t old_right_idx=right_idx;
210 
211  uint16_t sym=avec[left_idx];
212 
213  while (left_idx< alen && avec[left_idx]==sym)
214  left_idx++;
215 
216  while (right_idx< blen && bvec[right_idx]==sym)
217  right_idx++;
218 
219  result+=((float64_t) (left_idx-old_left_idx))*
220  ((float64_t) (right_idx-old_right_idx));
221  }
222  else if (avec[left_idx]<bvec[right_idx])
223  left_idx++;
224  else
225  right_idx++;
226  }
227  }
228 
229  if (do_sort)
230  {
231  SG_FREE(avec);
232  SG_FREE(bvec);
233  }
234 
235  l->free_feature_vector(av, idx_a, free_av);
236  r->free_feature_vector(bv, idx_b, free_bv);
237 
238  return result;
239 }
240 
241 void CCommWordStringKernel::add_to_normal(int32_t vec_idx, float64_t weight)
242 {
243  int32_t len=-1;
244  bool free_vec;
245  uint16_t* vec=((CStringFeatures<uint16_t>*) lhs)->
246  get_feature_vector(vec_idx, len, free_vec);
247 
248  if (len>0)
249  {
250  int32_t j, last_j=0;
251  if (use_sign)
252  {
253  for (j=1; j<len; j++)
254  {
255  if (vec[j]==vec[j-1])
256  continue;
257 
258  dictionary_weights[(int32_t) vec[j-1]]+=normalizer->
259  normalize_lhs(weight, vec_idx);
260  }
261 
262  dictionary_weights[(int32_t) vec[len-1]]+=normalizer->
263  normalize_lhs(weight, vec_idx);
264  }
265  else
266  {
267  for (j=1; j<len; j++)
268  {
269  if (vec[j]==vec[j-1])
270  continue;
271 
272  dictionary_weights[(int32_t) vec[j-1]]+=normalizer->
273  normalize_lhs(weight*(j-last_j), vec_idx);
274  last_j = j;
275  }
276 
277  dictionary_weights[(int32_t) vec[len-1]]+=normalizer->
278  normalize_lhs(weight*(len-last_j), vec_idx);
279  }
280  set_is_initialized(true);
281  }
282 
283  ((CStringFeatures<uint16_t>*) lhs)->free_feature_vector(vec, vec_idx, free_vec);
284 }
285 
287 {
288  memset(dictionary_weights, 0, dictionary_size*sizeof(float64_t));
289  set_is_initialized(false);
290 }
291 
293  int32_t count, int32_t* IDX, float64_t* weights)
294 {
296 
297  if (count<=0)
298  {
299  set_is_initialized(true);
300  SG_DEBUG("empty set of SVs\n")
301  return true;
302  }
303 
304  SG_DEBUG("initializing CCommWordStringKernel optimization\n")
305 
306  for (int32_t i=0; i<count; i++)
307  {
308  if ( (i % (count/10+1)) == 0)
309  SG_PROGRESS(i, 0, count)
310 
311  add_to_normal(IDX[i], weights[i]);
312  }
313 
314  set_is_initialized(true);
315  return true;
316 }
317 
319 {
320  SG_DEBUG("deleting CCommWordStringKernel optimization\n")
321 
322  clear_normal();
323  return true;
324 }
325 
327 {
328  if (!get_is_initialized())
329  {
330  SG_ERROR("CCommWordStringKernel optimization not initialized\n")
331  return 0 ;
332  }
333 
334  float64_t result = 0;
335  int32_t len = -1;
336  bool free_vec;
337  uint16_t* vec=((CStringFeatures<uint16_t>*) rhs)->
338  get_feature_vector(i, len, free_vec);
339 
340  int32_t j, last_j=0;
341  if (vec && len>0)
342  {
343  if (use_sign)
344  {
345  for (j=1; j<len; j++)
346  {
347  if (vec[j]==vec[j-1])
348  continue;
349 
350  result += dictionary_weights[(int32_t) vec[j-1]];
351  }
352 
353  result += dictionary_weights[(int32_t) vec[len-1]];
354  }
355  else
356  {
357  for (j=1; j<len; j++)
358  {
359  if (vec[j]==vec[j-1])
360  continue;
361 
362  result += dictionary_weights[(int32_t) vec[j-1]]*(j-last_j);
363  last_j = j;
364  }
365 
366  result += dictionary_weights[(int32_t) vec[len-1]]*(len-last_j);
367  }
368 
369  result=normalizer->normalize_rhs(result, i);
370  }
371  ((CStringFeatures<uint16_t>*) rhs)->free_feature_vector(vec, i, free_vec);
372  return result;
373 }
374 
376  int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target,
377  int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init)
378 {
379  ASSERT(lhs)
381  num_feat=1;//str->get_max_vector_length();
382  CAlphabet* alpha=str->get_alphabet();
383  ASSERT(alpha)
384  int32_t num_bits=alpha->get_num_bits();
385  int32_t order=str->get_order();
386  ASSERT(max_degree<=order)
387  //int32_t num_words=(int32_t) str->get_num_symbols();
388  int32_t num_words=(int32_t) str->get_original_num_symbols();
389  int32_t offset=0;
390 
391  num_sym=0;
392 
393  for (int32_t i=0; i<order; i++)
394  num_sym+=CMath::pow((int32_t) num_words,i+1);
395 
396  SG_DEBUG("num_words:%d, order:%d, len:%d sz:%d (len*sz:%d)\n", num_words, order,
397  num_feat, num_sym, num_feat*num_sym);
398 
399  if (!target)
400  target=SG_MALLOC(float64_t, num_feat*num_sym);
401  memset(target, 0, num_feat*num_sym*sizeof(float64_t));
402 
403  if (do_init)
404  init_optimization(num_suppvec, IDX, alphas);
405 
406  uint32_t kmer_mask=0;
407  uint32_t words=CMath::pow((int32_t) num_words,(int32_t) order);
408 
409  for (int32_t o=0; o<max_degree; o++)
410  {
411  float64_t* contrib=&target[offset];
412  offset+=CMath::pow((int32_t) num_words,(int32_t) o+1);
413 
414  kmer_mask=(kmer_mask<<(num_bits)) | str->get_masked_symbols(0xffff, 1);
415 
416  for (int32_t p=-o; p<order; p++)
417  {
418  int32_t o_sym=0, m_sym=0, il=0,ir=0, jl=0;
419  uint32_t imer_mask=kmer_mask;
420  uint32_t jmer_mask=kmer_mask;
421 
422  if (p<0)
423  {
424  il=-p;
425  m_sym=order-o-p-1;
426  o_sym=-p;
427  }
428  else if (p<order-o)
429  {
430  ir=p;
431  m_sym=order-o-1;
432  }
433  else
434  {
435  ir=p;
436  m_sym=p;
437  o_sym=p-order+o+1;
438  jl=order-ir;
439  imer_mask=(kmer_mask>>(num_bits*o_sym));
440  jmer_mask=(kmer_mask>>(num_bits*jl));
441  }
442 
443  float64_t marginalizer=
444  1.0/CMath::pow((int32_t) num_words,(int32_t) m_sym);
445 
446  for (uint32_t i=0; i<words; i++)
447  {
448  uint16_t x= ((i << (num_bits*il)) >> (num_bits*ir)) & imer_mask;
449 
450  if (p>=0 && p<order-o)
451  {
452 //#define DEBUG_COMMSCORING
453 #ifdef DEBUG_COMMSCORING
454  SG_PRINT("o=%d/%d p=%d/%d i=0x%x x=0x%x imask=%x jmask=%x kmask=%x il=%d ir=%d marg=%g o_sym:%d m_sym:%d weight(",
455  o,order, p,order, i, x, imer_mask, jmer_mask, kmer_mask, il, ir, marginalizer, o_sym, m_sym);
456 
457  SG_PRINT("%c%c%c%c/%c%c%c%c)+=%g/%g\n",
458  alpha->remap_to_char((x>>(3*num_bits))&0x03), alpha->remap_to_char((x>>(2*num_bits))&0x03),
459  alpha->remap_to_char((x>>num_bits)&0x03), alpha->remap_to_char(x&0x03),
460  alpha->remap_to_char((i>>(3*num_bits))&0x03), alpha->remap_to_char((i>>(2*num_bits))&0x03),
461  alpha->remap_to_char((i>>(1*num_bits))&0x03), alpha->remap_to_char(i&0x03),
462  dictionary_weights[i]*marginalizer, dictionary_weights[i]);
463 #endif
464  contrib[x]+=dictionary_weights[i]*marginalizer;
465  }
466  else
467  {
468  for (uint32_t j=0; j< (uint32_t) CMath::pow((int32_t) num_words, (int32_t) o_sym); j++)
469  {
470  uint32_t c=x | ((j & jmer_mask) << (num_bits*jl));
471 #ifdef DEBUG_COMMSCORING
472 
473  SG_PRINT("o=%d/%d p=%d/%d i=0x%x j=0x%x x=0x%x c=0x%x imask=%x jmask=%x kmask=%x il=%d ir=%d jl=%d marg=%g o_sym:%d m_sym:%d weight(",
474  o,order, p,order, i, j, x, c, imer_mask, jmer_mask, kmer_mask, il, ir, jl, marginalizer, o_sym, m_sym);
475  SG_PRINT("%c%c%c%c/%c%c%c%c)+=%g/%g\n",
476  alpha->remap_to_char((c>>(3*num_bits))&0x03), alpha->remap_to_char((c>>(2*num_bits))&0x03),
477  alpha->remap_to_char((c>>num_bits)&0x03), alpha->remap_to_char(c&0x03),
478  alpha->remap_to_char((i>>(3*num_bits))&0x03), alpha->remap_to_char((i>>(2*num_bits))&0x03),
479  alpha->remap_to_char((i>>(1*num_bits))&0x03), alpha->remap_to_char(i&0x03),
480  dictionary_weights[i]*marginalizer, dictionary_weights[i]);
481 #endif
482  contrib[c]+=dictionary_weights[i]*marginalizer;
483  }
484  }
485  }
486  }
487  }
488 
489  for (int32_t i=1; i<num_feat; i++)
490  memcpy(&target[num_sym*i], target, num_sym*sizeof(float64_t));
491 
492  SG_UNREF(alpha);
493 
494  return target;
495 }
496 
497 
499  int32_t &result_len, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
500 {
501  ASSERT(lhs)
502  ASSERT(IDX)
503  ASSERT(alphas)
504 
506  int32_t num_words=(int32_t) str->get_num_symbols();
507  int32_t num_feat=str->get_max_vector_length();
508  int64_t total_len=((int64_t) num_feat) * num_words;
510  ASSERT(alpha)
511  int32_t num_bits=alpha->get_num_bits();
512  int32_t order=str->get_order();
513  int32_t max_idx=-1;
514  float64_t max_score=0;
515  result_len=num_feat+order-1;
516 
517  //init
518  init_optimization(num_suppvec, IDX, alphas);
519 
520  char* result=SG_MALLOC(char, result_len);
521  int32_t* bt=SG_MALLOC(int32_t, total_len);
522  float64_t* score=SG_MALLOC(float64_t, total_len);
523 
524  for (int64_t i=0; i<total_len; i++)
525  {
526  bt[i]=-1;
527  score[i]=0;
528  }
529 
530  for (int32_t t=0; t<num_words; t++)
531  score[t]=dictionary_weights[t];
532 
533  //dynamic program
534  for (int32_t i=1; i<num_feat; i++)
535  {
536  for (int32_t t1=0; t1<num_words; t1++)
537  {
538  max_idx=-1;
539  max_score=0;
540 
541  /* ignore weights the svm does not care about
542  * (has not seen in training). note that this assumes that zero
543  * weights are very unlikely to appear elsewise */
544 
545  //if (dictionary_weights[t1]==0.0)
546  //continue;
547 
548  /* iterate over words t ending on t1 and find the highest scoring
549  * pair */
550  uint16_t suffix=(uint16_t) t1 >> num_bits;
551 
552  for (int32_t sym=0; sym<str->get_original_num_symbols(); sym++)
553  {
554  uint16_t t=suffix | sym << (num_bits*(order-1));
555 
556  //if (dictionary_weights[t]==0.0)
557  // continue;
558 
559  float64_t sc=score[num_words*(i-1) + t]+dictionary_weights[t1];
560  if (sc > max_score || max_idx==-1)
561  {
562  max_idx=t;
563  max_score=sc;
564  }
565  }
566  ASSERT(max_idx!=-1)
567 
568  score[num_words*i + t1]=max_score;
569  bt[num_words*i + t1]=max_idx;
570  }
571  }
572 
573  //backtracking
574  max_idx=0;
575  max_score=score[num_words*(num_feat-1) + 0];
576  for (int32_t t=1; t<num_words; t++)
577  {
578  float64_t sc=score[num_words*(num_feat-1) + t];
579  if (sc>max_score)
580  {
581  max_idx=t;
582  max_score=sc;
583  }
584  }
585 
586  SG_DEBUG("max_idx:%i, max_score:%f\n", max_idx, max_score)
587 
588  for (int32_t i=result_len-1; i>=num_feat; i--)
589  result[i]=alpha->remap_to_char( (uint8_t) str->get_masked_symbols( (uint16_t) max_idx >> (num_bits*(result_len-1-i)), 1) );
590 
591  for (int32_t i=num_feat-1; i>=0; i--)
592  {
593  result[i]=alpha->remap_to_char( (uint8_t) str->get_masked_symbols( (uint16_t) max_idx >> (num_bits*(order-1)), 1) );
594  max_idx=bt[num_words*i + max_idx];
595  }
596 
597  SG_FREE(bt);
598  SG_FREE(score);
599  SG_UNREF(alpha);
600  return result;
601 }
602 
603 void CCommWordStringKernel::init()
604 {
605  dictionary_size=0;
606  dictionary_weights=NULL;
607 
608  use_sign=false;
611 
613  init_dictionary(1<<(sizeof(uint16_t)*8));
615 
616  m_parameters->add_vector(&dictionary_weights, &dictionary_size, "dictionary_weights",
617  "Dictionary for applying kernel.");
618  SG_ADD(&use_sign, "use_sign",
619  "If signum(counts) is used instead of counts.", MS_AVAILABLE);
621  "use_dict_diagonal_optimization", "If K(x,x) is computed potentially "
622  "more efficiently.", MS_NOT_AVAILABLE);
623 }

SHOGUN Machine Learning Toolbox - Documentation