SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
WeightedDegreePositionStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #include <shogun/lib/common.h>
13 #include <shogun/io/SGIO.h>
14 #include <shogun/lib/Signal.h>
15 #include <shogun/lib/Trie.h>
16 #include <shogun/base/Parallel.h>
17 
22 
24 
25 #ifdef HAVE_PTHREAD
26 #include <pthread.h>
27 #endif
28 
29 using namespace shogun;
30 
31 #define TRIES(X) ((use_poim_tries) ? (poim_tries.X) : (tries.X))
32 
33 #ifndef DOXYGEN_SHOULD_SKIP_THIS
34 template <class Trie> struct S_THREAD_PARAM_WDS
35 {
36  int32_t* vec;
37  float64_t* result;
38  float64_t* weights;
40  CTrie<Trie>* tries;
41  float64_t factor;
42  int32_t j;
43  int32_t start;
44  int32_t end;
45  int32_t length;
46  int32_t max_shift;
47  int32_t* shift;
48  int32_t* vec_idx;
49 };
50 #endif // DOXYGEN_SHOULD_SKIP_THIS
51 
53  void)
54 : CStringKernel<char>()
55 {
56  init();
57 }
58 
60  int32_t size, int32_t d, int32_t mm, int32_t mkls)
61 : CStringKernel<char>(size)
62 {
63  init();
64 
65  mkl_stepsize=mkls;
66  degree=d;
67  max_mismatch=mm;
68 
71 
74 }
75 
77  int32_t size, SGVector<float64_t> w, int32_t d, int32_t mm, SGVector<int32_t> s,
78  int32_t mkls)
79 : CStringKernel<char>(size)
80 {
81  init();
82 
83  mkl_stepsize=mkls;
84  degree=d;
85  max_mismatch=mm;
86 
89 
90  weights=SG_MALLOC(float64_t, d*(1+max_mismatch));
93 
94  for (int32_t i=0; i<d*(1+max_mismatch); i++)
95  weights[i]=w[i];
96 
97  set_shifts(s);
98 }
99 
102 : CStringKernel<char>()
103 {
104  init();
105 
106  mkl_stepsize=1;
107  degree=d;
108 
111 
112  set_wd_weights();
113  ASSERT(weights)
114 
115  init(l, r);
116 }
117 
118 
120 {
121  cleanup();
122  cleanup_POIM2();
123 
124  SG_FREE(shift);
125  shift=NULL;
126 
127  SG_FREE(weights);
128  weights=NULL;
129  weights_degree=0;
130  weights_length=0;
131 
132  SG_FREE(block_weights);
133  block_weights=NULL;
134 
135  SG_FREE(position_weights);
136  position_weights=NULL;
137 
138  SG_FREE(position_weights_lhs);
140 
141  SG_FREE(position_weights_rhs);
143 
144  SG_FREE(weights_buffer);
145  weights_buffer=NULL;
146 }
147 
149 {
150  SG_DEBUG("deleting CWeightedDegreePositionStringKernel optimization\n")
152 
153  tries.destroy();
155 
157 }
158 
160 {
161  ASSERT(lhs)
162  seq_length = ((CStringFeatures<char>*) lhs)->get_max_vector_length();
163 
165  {
166  tries.create(seq_length, true);
168  }
169  else if (opt_type==FASTBUTMEMHUNGRY)
170  {
171  tries.create(seq_length, false); // still buggy
172  poim_tries.create(seq_length, false); // still buggy
173  }
174  else
175  SG_ERROR("unknown optimization type\n")
176 }
177 
178 bool CWeightedDegreePositionStringKernel::init(CFeatures* l, CFeatures* r)
179 {
180  int32_t lhs_changed = (lhs!=l) ;
181  int32_t rhs_changed = (rhs!=r) ;
182 
184 
185  SG_DEBUG("lhs_changed: %i\n", lhs_changed)
186  SG_DEBUG("rhs_changed: %i\n", rhs_changed)
187 
190 
191  /* set shift */
192  if (shift_len==0) {
193  shift_len=sf_l->get_vector_length(0);
194  int32_t *shifts=SG_MALLOC(int32_t, shift_len);
195  for (int32_t i=0; i<shift_len; i++) {
196  shifts[i]=1;
197  }
198  set_shifts(SGVector<int32_t>(shifts, shift_len, false));
199  SG_FREE(shifts);
200  }
201 
202 
203  int32_t len=sf_l->get_max_vector_length();
204  if (lhs_changed && !sf_l->have_same_length(len))
205  SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n")
206 
207  if (rhs_changed && !sf_r->have_same_length(len))
208  SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n")
209 
211  alphabet= sf_l->get_alphabet();
212  CAlphabet* ralphabet=sf_r->get_alphabet();
213 
214  if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
215  properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
216 
217  ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet())
218  SG_UNREF(ralphabet);
219 
220  //whenever init is called also init tries and block weights
223 
224  return init_normalizer();
225 }
226 
228 {
229  SG_DEBUG("deleting CWeightedDegreePositionStringKernel optimization\n")
231 
232  SG_FREE(block_weights);
233  block_weights=NULL;
234 
235  tries.destroy();
237 
238  seq_length = 0;
239  tree_initialized = false;
240 
242  alphabet=NULL;
243 
245 }
246 
248  int32_t p_count, int32_t * IDX, float64_t * alphas, int32_t tree_num,
249  int32_t upto_tree)
250 {
253 
254  if (upto_tree<0)
255  upto_tree=tree_num;
256 
257  if (max_mismatch!=0)
258  {
259  SG_ERROR("CWeightedDegreePositionStringKernel optimization not implemented for mismatch!=0\n")
260  return false ;
261  }
262 
263  if (tree_num<0)
264  SG_DEBUG("deleting CWeightedDegreePositionStringKernel optimization\n")
265 
267 
268  if (tree_num<0)
269  SG_DEBUG("initializing CWeightedDegreePositionStringKernel optimization\n")
270 
271  for (int32_t i=0; i<p_count; i++)
272  {
273  if (tree_num<0)
274  {
275  if ( (i % (p_count/10+1)) == 0)
276  SG_PROGRESS(i,0,p_count)
277  add_example_to_tree(IDX[i], alphas[i]);
278  }
279  else
280  {
281  for (int32_t t=tree_num; t<=upto_tree; t++)
282  add_example_to_single_tree(IDX[i], alphas[i], t);
283  }
284  }
285 
286  if (tree_num<0)
287  SG_DONE()
288 
289  set_is_initialized(true) ;
290  return true ;
291 }
292 
294 {
296  {
298  SG_DEBUG("disabling compact trie nodes with FASTBUTMEMHUNGRY\n")
299  }
300 
301  if (get_is_initialized())
302  {
304  tries.delete_trees(true);
305  else if (opt_type==FASTBUTMEMHUNGRY)
306  tries.delete_trees(false); // still buggy
307  else {
308  SG_ERROR("unknown optimization type\n")
309  }
310  set_is_initialized(false);
311 
312  return true;
313  }
314 
315  return false;
316 }
317 
319  char* avec, int32_t alen, char* bvec, int32_t blen)
320 {
321  float64_t* max_shift_vec= SG_MALLOC(float64_t, max_shift);
322  float64_t sum0=0 ;
323  for (int32_t i=0; i<max_shift; i++)
324  max_shift_vec[i]=0 ;
325 
326  // no shift
327  for (int32_t i=0; i<alen; i++)
328  {
329  if ((position_weights!=NULL) && (position_weights[i]==0.0))
330  continue ;
331 
332  int32_t mismatches=0;
333  float64_t sumi = 0.0 ;
334  for (int32_t j=0; (j<degree) && (i+j<alen); j++)
335  {
336  if (avec[i+j]!=bvec[i+j])
337  {
338  mismatches++ ;
339  if (mismatches>max_mismatch)
340  break ;
341  } ;
342  sumi += weights[j+degree*mismatches];
343  }
344  if (position_weights!=NULL)
345  sum0 += position_weights[i]*sumi ;
346  else
347  sum0 += sumi ;
348  } ;
349 
350  for (int32_t i=0; i<alen; i++)
351  {
352  for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
353  {
354  if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
355  continue ;
356 
357  float64_t sumi1 = 0.0 ;
358  // shift in sequence a
359  int32_t mismatches=0;
360  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
361  {
362  if (avec[i+j+k]!=bvec[i+j])
363  {
364  mismatches++ ;
365  if (mismatches>max_mismatch)
366  break ;
367  } ;
368  sumi1 += weights[j+degree*mismatches];
369  }
370  float64_t sumi2 = 0.0 ;
371  // shift in sequence b
372  mismatches=0;
373  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
374  {
375  if (avec[i+j]!=bvec[i+j+k])
376  {
377  mismatches++ ;
378  if (mismatches>max_mismatch)
379  break ;
380  } ;
381  sumi2 += weights[j+degree*mismatches];
382  }
383  if (position_weights!=NULL)
384  max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
385  else
386  max_shift_vec[k-1] += sumi1 + sumi2 ;
387  } ;
388  }
389 
390  float64_t result = sum0 ;
391  for (int32_t i=0; i<max_shift; i++)
392  result += max_shift_vec[i]/(2*(i+1)) ;
393 
394  SG_FREE(max_shift_vec);
395  return result ;
396 }
397 
399  char* avec, int32_t alen, char* bvec, int32_t blen)
400 {
401  float64_t* max_shift_vec = SG_MALLOC(float64_t, max_shift);
402  float64_t sum0=0 ;
403  for (int32_t i=0; i<max_shift; i++)
404  max_shift_vec[i]=0 ;
405 
406  // no shift
407  for (int32_t i=0; i<alen; i++)
408  {
409  if ((position_weights!=NULL) && (position_weights[i]==0.0))
410  continue ;
411 
412  float64_t sumi = 0.0 ;
413  for (int32_t j=0; (j<degree) && (i+j<alen); j++)
414  {
415  if (avec[i+j]!=bvec[i+j])
416  break ;
417  sumi += weights[j];
418  }
419  if (position_weights!=NULL)
420  sum0 += position_weights[i]*sumi ;
421  else
422  sum0 += sumi ;
423  } ;
424 
425  for (int32_t i=0; i<alen; i++)
426  {
427  for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
428  {
429  if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
430  continue ;
431 
432  float64_t sumi1 = 0.0 ;
433  // shift in sequence a
434  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
435  {
436  if (avec[i+j+k]!=bvec[i+j])
437  break ;
438  sumi1 += weights[j];
439  }
440  float64_t sumi2 = 0.0 ;
441  // shift in sequence b
442  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
443  {
444  if (avec[i+j]!=bvec[i+j+k])
445  break ;
446  sumi2 += weights[j];
447  }
448  if (position_weights!=NULL)
449  max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
450  else
451  max_shift_vec[k-1] += sumi1 + sumi2 ;
452  } ;
453  }
454 
455  float64_t result = sum0 ;
456  for (int32_t i=0; i<max_shift; i++)
457  result += max_shift_vec[i]/(2*(i+1)) ;
458 
459  SG_FREE(max_shift_vec);
460 
461  return result ;
462 }
463 
465  char* avec, int32_t alen, char* bvec, int32_t blen)
466 {
467  float64_t* max_shift_vec = SG_MALLOC(float64_t, max_shift);
468  float64_t sum0=0 ;
469  for (int32_t i=0; i<max_shift; i++)
470  max_shift_vec[i]=0 ;
471 
472  // no shift
473  for (int32_t i=0; i<alen; i++)
474  {
475  if ((position_weights!=NULL) && (position_weights[i]==0.0))
476  continue ;
477  float64_t sumi = 0.0 ;
478  for (int32_t j=0; (j<degree) && (i+j<alen); j++)
479  {
480  if (avec[i+j]!=bvec[i+j])
481  break ;
482  sumi += weights[i*degree+j];
483  }
484  if (position_weights!=NULL)
485  sum0 += position_weights[i]*sumi ;
486  else
487  sum0 += sumi ;
488  } ;
489 
490  for (int32_t i=0; i<alen; i++)
491  {
492  for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
493  {
494  if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
495  continue ;
496 
497  float64_t sumi1 = 0.0 ;
498  // shift in sequence a
499  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
500  {
501  if (avec[i+j+k]!=bvec[i+j])
502  break ;
503  sumi1 += weights[i*degree+j];
504  }
505  float64_t sumi2 = 0.0 ;
506  // shift in sequence b
507  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
508  {
509  if (avec[i+j]!=bvec[i+j+k])
510  break ;
511  sumi2 += weights[i*degree+j];
512  }
513  if (position_weights!=NULL)
514  max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
515  else
516  max_shift_vec[k-1] += sumi1 + sumi2 ;
517  } ;
518  }
519 
520  float64_t result = sum0 ;
521  for (int32_t i=0; i<max_shift; i++)
522  result += max_shift_vec[i]/(2*(i+1)) ;
523 
524  SG_FREE(max_shift_vec);
525  return result ;
526 }
527 
529  char* avec, float64_t* pos_weights_lhs, int32_t alen, char* bvec,
530  float64_t* pos_weights_rhs, int32_t blen)
531 {
532  float64_t* max_shift_vec = SG_MALLOC(float64_t, max_shift);
533  float64_t sum0=0 ;
534  for (int32_t i=0; i<max_shift; i++)
535  max_shift_vec[i]=0 ;
536 
537  // no shift
538  for (int32_t i=0; i<alen; i++)
539  {
540  if ((position_weights!=NULL) && (position_weights[i]==0.0))
541  continue ;
542 
543  float64_t sumi = 0.0 ;
544  float64_t posweight_lhs = 0.0 ;
545  float64_t posweight_rhs = 0.0 ;
546  for (int32_t j=0; (j<degree) && (i+j<alen); j++)
547  {
548  posweight_lhs += pos_weights_lhs[i+j] ;
549  posweight_rhs += pos_weights_rhs[i+j] ;
550 
551  if (avec[i+j]!=bvec[i+j])
552  break ;
553  sumi += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
554  }
555  if (position_weights!=NULL)
556  sum0 += position_weights[i]*sumi ;
557  else
558  sum0 += sumi ;
559  } ;
560 
561  for (int32_t i=0; i<alen; i++)
562  {
563  for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
564  {
565  if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
566  continue ;
567 
568  // shift in sequence a
569  float64_t sumi1 = 0.0 ;
570  float64_t posweight_lhs = 0.0 ;
571  float64_t posweight_rhs = 0.0 ;
572  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
573  {
574  posweight_lhs += pos_weights_lhs[i+j+k] ;
575  posweight_rhs += pos_weights_rhs[i+j] ;
576  if (avec[i+j+k]!=bvec[i+j])
577  break ;
578  sumi1 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
579  }
580  // shift in sequence b
581  float64_t sumi2 = 0.0 ;
582  posweight_lhs = 0.0 ;
583  posweight_rhs = 0.0 ;
584  for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
585  {
586  posweight_lhs += pos_weights_lhs[i+j] ;
587  posweight_rhs += pos_weights_rhs[i+j+k] ;
588  if (avec[i+j]!=bvec[i+j+k])
589  break ;
590  sumi2 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
591  }
592  if (position_weights!=NULL)
593  max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
594  else
595  max_shift_vec[k-1] += sumi1 + sumi2 ;
596  } ;
597  }
598 
599  float64_t result = sum0 ;
600  for (int32_t i=0; i<max_shift; i++)
601  result += max_shift_vec[i]/(2*(i+1)) ;
602 
603  SG_FREE(max_shift_vec);
604  return result ;
605 }
606 
607 
609  int32_t idx_a, int32_t idx_b)
610 {
611  int32_t alen, blen;
612  bool free_avec, free_bvec;
613 
614  char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
615  char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
616  // can only deal with strings of same length
617  ASSERT(alen==blen)
618  ASSERT(shift_len==alen)
619 
620  float64_t result = 0 ;
621  if (position_weights_lhs!=NULL || position_weights_rhs!=NULL)
622  {
623  ASSERT(max_mismatch==0)
624  float64_t* position_weights_rhs_ = position_weights_rhs ;
625  if (lhs==rhs)
626  position_weights_rhs_ = position_weights_lhs ;
627 
628  result = compute_without_mismatch_position_weights(avec, &position_weights_lhs[idx_a*alen], alen, bvec, &position_weights_rhs_[idx_b*blen], blen) ;
629  }
630  else if (max_mismatch > 0)
631  result = compute_with_mismatch(avec, alen, bvec, blen) ;
632  else if (length==0)
633  result = compute_without_mismatch(avec, alen, bvec, blen) ;
634  else
635  result = compute_without_mismatch_matrix(avec, alen, bvec, blen) ;
636 
637  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
638  ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
639 
640  return result ;
641 }
642 
643 
645  int32_t idx, float64_t alpha)
646 {
651 
652  int32_t len=0;
653  bool free_vec;
654  char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
655  ASSERT(max_mismatch==0)
656  int32_t *vec = SG_MALLOC(int32_t, len);
657 
658  for (int32_t i=0; i<len; i++)
659  vec[i]=alphabet->remap_to_bin(char_vec[i]);
660  ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
661 
663  {
664  //TRIES(set_use_compact_terminal_nodes(false)) ;
665  ASSERT(!TRIES(get_use_compact_terminal_nodes()))
666  }
667 
668  for (int32_t i=0; i<len; i++)
669  {
670  int32_t max_s=-1;
671 
673  max_s=0;
674  else if (opt_type==FASTBUTMEMHUNGRY)
675  max_s=shift[i];
676  else {
677  SG_ERROR("unknown optimization type\n")
678  }
679 
680  for (int32_t s=max_s; s>=0; s--)
681  {
682  float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
683  TRIES(add_to_trie(i, s, vec, alpha_pw, weights, (length!=0))) ;
684  if ((s==0) || (i+s>=len))
685  continue;
686 
687  TRIES(add_to_trie(i+s, -s, vec, alpha_pw, weights, (length!=0))) ;
688  }
689  }
690 
691  SG_FREE(vec);
692  tree_initialized=true ;
693 }
694 
696  int32_t idx, float64_t alpha, int32_t tree_num)
697 {
702 
703  int32_t len=0;
704  bool free_vec;
705  char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
706  ASSERT(max_mismatch==0)
707  int32_t *vec=SG_MALLOC(int32_t, len);
708  int32_t max_s=-1;
709 
711  max_s=0;
712  else if (opt_type==FASTBUTMEMHUNGRY)
713  {
715  max_s=shift[tree_num];
716  }
717  else {
718  SG_ERROR("unknown optimization type\n")
719  }
720  for (int32_t i=CMath::max(0,tree_num-max_shift);
721  i<CMath::min(len,tree_num+degree+max_shift); i++)
722  {
723  vec[i]=alphabet->remap_to_bin(char_vec[i]);
724  }
725  ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
726 
727  for (int32_t s=max_s; s>=0; s--)
728  {
729  float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
730  tries.add_to_trie(tree_num, s, vec, alpha_pw, weights, (length!=0)) ;
731  }
732 
734  {
735  for (int32_t i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+max_shift+1); i++)
736  {
737  int32_t s=tree_num-i;
738  if ((i+s<len) && (s>=1) && (s<=shift[i]))
739  {
740  float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
741  tries.add_to_trie(tree_num, -s, vec, alpha_pw, weights, (length!=0)) ;
742  }
743  }
744  }
745  SG_FREE(vec);
746  tree_initialized=true ;
747 }
748 
750 {
755 
756  float64_t sum=0;
757  int32_t len=0;
758  bool free_vec;
759  char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
760  ASSERT(max_mismatch==0)
761  int32_t *vec=SG_MALLOC(int32_t, len);
762 
763  for (int32_t i=0; i<len; i++)
764  vec[i]=alphabet->remap_to_bin(char_vec[i]);
765 
766  ((CStringFeatures<char>*) rhs)->free_feature_vector(char_vec, idx, free_vec);
767 
768  for (int32_t i=0; i<len; i++)
769  sum += tries.compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)) ;
770 
772  {
773  for (int32_t i=0; i<len; i++)
774  {
775  for (int32_t s=1; (s<=shift[i]) && (i+s<len); s++)
776  {
777  sum+=tries.compute_by_tree_helper(vec, len, i, i+s, i, weights, (length!=0))/(2*s) ;
778  sum+=tries.compute_by_tree_helper(vec, len, i+s, i, i+s, weights, (length!=0))/(2*s) ;
779  }
780  }
781  }
782 
783  SG_FREE(vec);
784 
785  return normalizer->normalize_rhs(sum, idx);
786 }
787 
789  int32_t idx, float64_t* LevelContrib)
790 {
795 
796  int32_t len=0;
797  bool free_vec;
798  char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
799  ASSERT(max_mismatch==0)
800  int32_t *vec=SG_MALLOC(int32_t, len);
801 
802  for (int32_t i=0; i<len; i++)
803  vec[i]=alphabet->remap_to_bin(char_vec[i]);
804 
805  ((CStringFeatures<char>*) rhs)->free_feature_vector(char_vec, idx, free_vec);
806 
807  for (int32_t i=0; i<len; i++)
808  {
809  tries.compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
811  (length!=0));
812  }
813 
815  {
816  for (int32_t i=0; i<len; i++)
817  for (int32_t k=1; (k<=shift[i]) && (i+k<len); k++)
818  {
819  tries.compute_by_tree_helper(vec, len, i, i+k, i, LevelContrib,
820  normalizer->normalize_rhs(1.0/(2*k), idx), mkl_stepsize,
821  weights, (length!=0)) ;
822  tries.compute_by_tree_helper(vec, len, i+k, i, i+k,
823  LevelContrib, normalizer->normalize_rhs(1.0/(2*k), idx),
824  mkl_stepsize, weights, (length!=0)) ;
825  }
826  }
827 
828  SG_FREE(vec);
829 }
830 
832  int32_t &len)
833 {
834  return tries.compute_abs_weights(len);
835 }
836 
838 {
839  SG_FREE(shift);
840 
841  shift_len = shifts.vlen;
842  shift = SG_MALLOC(int32_t, shift_len);
843 
844  if (shift)
845  {
846  max_shift = 0 ;
847 
848  for (int32_t i=0; i<shift_len; i++)
849  {
850  shift[i] = shifts.vector[i] ;
852  }
853 
854  ASSERT(max_shift>=0 && max_shift<=shift_len)
855  }
856 }
857 
859 {
860  ASSERT(degree>0)
861 
862  SG_FREE(weights);
863  weights=SG_MALLOC(float64_t, degree);
865  weights_length=1;
866 
867  if (weights)
868  {
869  int32_t i;
870  float64_t sum=0;
871  for (i=0; i<degree; i++)
872  {
873  weights[i]=degree-i;
874  sum+=weights[i];
875  }
876  for (i=0; i<degree; i++)
877  weights[i]/=sum;
878 
879  for (i=0; i<degree; i++)
880  {
881  for (int32_t j=1; j<=max_mismatch; j++)
882  {
883  if (j<i+1)
884  {
885  int32_t nk=CMath::nchoosek(i+1, j);
886  weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
887  }
888  else
889  weights[i+j*degree]= 0;
890  }
891  }
892 
893  return true;
894  }
895  else
896  return false;
897 }
898 
900 {
901  float64_t* ws=new_weights.matrix;
902  int32_t d=new_weights.num_rows;
903  int32_t len=new_weights.num_cols;
904 
905  if (d!=degree || len<0)
906  SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree)
907 
908  degree=d;
909  length=len;
910 
911  if (len <= 0)
912  len=1;
913 
916 
917  SG_DEBUG("Creating weights of size %dx%d\n", weights_degree, weights_length)
918  int32_t num_weights=weights_degree*weights_length;
919  SG_FREE(weights);
920  weights=SG_MALLOC(float64_t, num_weights);
921 
922  for (int32_t i=0; i<degree*len; i++)
923  weights[i]=ws[i];
924 
925  return true;
926 }
927 
929 {
930  if (seq_length==0)
931  seq_length=pws.vlen;
932 
933  if (seq_length!=pws.vlen)
934  SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, pws.vlen)
935 
936  SG_FREE(position_weights);
937  position_weights=SG_MALLOC(float64_t, pws.vlen);
940 
941  for (int32_t i=0; i<pws.vlen; i++)
942  position_weights[i]=pws.vector[i];
943 }
944 
946 {
949  else
951 
952  if (len==0)
953  {
955  }
956 
957  if (seq_length!=len)
958  {
959  SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len)
960  return false;
961  }
962 
963  SG_FREE(position_weights_lhs);
964  position_weights_lhs=SG_MALLOC(float64_t, len*num);
965  position_weights_lhs_len=len*num;
966 
967  for (int32_t i=0; i<len*num; i++)
968  position_weights_lhs[i]=pws[i];
969 
970  return true;
971 }
972 
974  float64_t* pws, int32_t len, int32_t num)
975 {
976  if (len==0)
977  {
979  {
981  return true;
982  }
984  }
985 
986  if (seq_length!=len)
987  {
988  SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len)
989  return false;
990  }
991 
992  SG_FREE(position_weights_rhs);
993  position_weights_rhs=SG_MALLOC(float64_t, len*num);
994  position_weights_rhs_len=len*num;
995 
996  for (int32_t i=0; i<len*num; i++)
997  position_weights_rhs[i]=pws[i];
998 
999  return true;
1000 }
1001 
1003 {
1004  SG_FREE(block_weights);
1006 
1007  if (block_weights)
1008  {
1009  int32_t k;
1010  float64_t d=degree; // use float to evade rounding errors below
1011 
1012  for (k=0; k<degree; k++)
1013  block_weights[k]=
1014  (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
1015  for (k=degree; k<seq_length; k++)
1016  block_weights[k]=(-d+3*k+4)/3;
1017  }
1018 
1019  return (block_weights!=NULL);
1020 }
1021 
1023 {
1024  ASSERT(weights)
1025  SG_FREE(block_weights);
1027 
1028  if (block_weights)
1029  {
1030  int32_t i=0;
1031  block_weights[0]=weights[0];
1032  for (i=1; i<CMath::max(seq_length,degree); i++)
1033  block_weights[i]=0;
1034 
1035  for (i=1; i<CMath::max(seq_length,degree); i++)
1036  {
1037  block_weights[i]=block_weights[i-1];
1038 
1039  float64_t contrib=0;
1040  for (int32_t j=0; j<CMath::min(degree,i+1); j++)
1041  contrib+=weights[j];
1042 
1043  block_weights[i]+=contrib;
1044  }
1045  }
1046 
1047  return (block_weights!=NULL);
1048 }
1049 
1051 {
1052  SG_FREE(block_weights);
1053  block_weights=SG_MALLOC(float64_t, seq_length);
1054 
1055  if (block_weights)
1056  {
1057  for (int32_t i=1; i<seq_length+1 ; i++)
1058  block_weights[i-1]=1.0/seq_length;
1059  }
1060 
1061  return (block_weights!=NULL);
1062 }
1063 
1065 {
1066  SG_FREE(block_weights);
1067  block_weights=SG_MALLOC(float64_t, seq_length);
1068 
1069  if (block_weights)
1070  {
1071  for (int32_t i=1; i<seq_length+1 ; i++)
1072  block_weights[i-1]=degree*i;
1073  }
1074 
1075  return (block_weights!=NULL);
1076 }
1077 
1079 {
1080  SG_FREE(block_weights);
1081  block_weights=SG_MALLOC(float64_t, seq_length);
1082 
1083  if (block_weights)
1084  {
1085  for (int32_t i=1; i<degree+1 ; i++)
1086  block_weights[i-1]=((float64_t) i)*i;
1087 
1088  for (int32_t i=degree+1; i<seq_length+1 ; i++)
1089  block_weights[i-1]=i;
1090  }
1091 
1092  return (block_weights!=NULL);
1093 }
1094 
1096 {
1097  SG_FREE(block_weights);
1098  block_weights=SG_MALLOC(float64_t, seq_length);
1099 
1100  if (block_weights)
1101  {
1102  for (int32_t i=1; i<degree+1 ; i++)
1103  block_weights[i-1]=((float64_t) i)*i*i;
1104 
1105  for (int32_t i=degree+1; i<seq_length+1 ; i++)
1106  block_weights[i-1]=i;
1107  }
1108 
1109  return (block_weights!=NULL);
1110 }
1111 
1113 {
1114  SG_FREE(block_weights);
1115  block_weights=SG_MALLOC(float64_t, seq_length);
1116 
1117  if (block_weights)
1118  {
1119  for (int32_t i=1; i<degree+1 ; i++)
1120  block_weights[i-1]=exp(((float64_t) i/10.0));
1121 
1122  for (int32_t i=degree+1; i<seq_length+1 ; i++)
1123  block_weights[i-1]=i;
1124  }
1125 
1126  return (block_weights!=NULL);
1127 }
1128 
1130 {
1131  SG_FREE(block_weights);
1132  block_weights=SG_MALLOC(float64_t, seq_length);
1133 
1134  if (block_weights)
1135  {
1136  for (int32_t i=1; i<degree+1 ; i++)
1138 
1139  for (int32_t i=degree+1; i<seq_length+1 ; i++)
1140  block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
1141  }
1142 
1143  return (block_weights!=NULL);
1144 }
1145 
1147 {
1148  switch (type)
1149  {
1150  case E_WD:
1151  return init_block_weights_from_wd();
1152  case E_EXTERNAL:
1154  case E_BLOCK_CONST:
1155  return init_block_weights_const();
1156  case E_BLOCK_LINEAR:
1157  return init_block_weights_linear();
1158  case E_BLOCK_SQPOLY:
1159  return init_block_weights_sqpoly();
1160  case E_BLOCK_CUBICPOLY:
1162  case E_BLOCK_EXP:
1163  return init_block_weights_exp();
1164  case E_BLOCK_LOG:
1165  return init_block_weights_log();
1166  };
1167  return false;
1168 }
1169 
1170 
1171 
1173 {
1174  S_THREAD_PARAM_WDS<DNATrie>* params = (S_THREAD_PARAM_WDS<DNATrie>*) p;
1175  int32_t j=params->j;
1176  CWeightedDegreePositionStringKernel* wd=params->kernel;
1177  CTrie<DNATrie>* tries=params->tries;
1178  float64_t* weights=params->weights;
1179  int32_t length=params->length;
1180  int32_t max_shift=params->max_shift;
1181  int32_t* vec=params->vec;
1182  float64_t* result=params->result;
1183  float64_t factor=params->factor;
1184  int32_t* shift=params->shift;
1185  int32_t* vec_idx=params->vec_idx;
1186 
1187  for (int32_t i=params->start; i<params->end; i++)
1188  {
1189  int32_t len=0;
1191  CAlphabet* alpha=wd->alphabet;
1192 
1193  bool free_vec;
1194  char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
1195  for (int32_t k=CMath::max(0,j-max_shift); k<CMath::min(len,j+wd->get_degree()+max_shift); k++)
1196  vec[k]=alpha->remap_to_bin(char_vec[k]);
1197  rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
1198 
1199  SG_UNREF(rhs_feat);
1200 
1201  result[i] += factor*wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
1202 
1204  {
1205  for (int32_t q=CMath::max(0,j-max_shift); q<CMath::min(len,j+max_shift+1); q++)
1206  {
1207  int32_t s=j-q ;
1208  if ((s>=1) && (s<=shift[q]) && (q+s<len))
1209  {
1210  result[i] +=
1212  len, q, q+s, q, weights, (length!=0)),
1213  vec_idx[i])/(2.0*s);
1214  }
1215  }
1216 
1217  for (int32_t s=1; (s<=shift[j]) && (j+s<len); s++)
1218  {
1219  result[i] +=
1221  len, j+s, j, j+s, weights, (length!=0)),
1222  vec_idx[i])/(2.0*s);
1223  }
1224  }
1225  }
1226 
1227  return NULL;
1228 }
1229 
1231  int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
1232  int32_t* IDX, float64_t* alphas, float64_t factor)
1233 {
1234  ASSERT(alphabet)
1238  ASSERT(rhs)
1239  ASSERT(num_vec<=rhs->get_num_vectors())
1240  ASSERT(num_vec>0)
1241  ASSERT(vec_idx)
1242  ASSERT(result)
1244 
1245  int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
1246  ASSERT(num_feat>0)
1247  int32_t num_threads=parallel->get_num_threads();
1248  ASSERT(num_threads>0)
1249  int32_t* vec=SG_MALLOC(int32_t, num_threads*num_feat);
1250 
1251  if (num_threads < 2)
1252  {
1253 #ifdef WIN32
1254  for (int32_t j=0; j<num_feat; j++)
1255 #else
1257  for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
1258 #endif
1259  {
1260  init_optimization(num_suppvec, IDX, alphas, j);
1261  S_THREAD_PARAM_WDS<DNATrie> params;
1262  params.vec=vec;
1263  params.result=result;
1264  params.weights=weights;
1265  params.kernel=this;
1266  params.tries=&tries;
1267  params.factor=factor;
1268  params.j=j;
1269  params.start=0;
1270  params.end=num_vec;
1271  params.length=length;
1272  params.max_shift=max_shift;
1273  params.shift=shift;
1274  params.vec_idx=vec_idx;
1275  compute_batch_helper((void*) &params);
1276 
1277  SG_PROGRESS(j,0,num_feat)
1278  }
1279  }
1280 #ifdef HAVE_PTHREAD
1281  else
1282  {
1283 
1285  for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
1286  {
1287  init_optimization(num_suppvec, IDX, alphas, j);
1288  pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
1289  S_THREAD_PARAM_WDS<DNATrie>* params = SG_MALLOC(S_THREAD_PARAM_WDS<DNATrie>, num_threads);
1290  int32_t step= num_vec/num_threads;
1291  int32_t t;
1292 
1293  for (t=0; t<num_threads-1; t++)
1294  {
1295  params[t].vec=&vec[num_feat*t];
1296  params[t].result=result;
1297  params[t].weights=weights;
1298  params[t].kernel=this;
1299  params[t].tries=&tries;
1300  params[t].factor=factor;
1301  params[t].j=j;
1302  params[t].start = t*step;
1303  params[t].end = (t+1)*step;
1304  params[t].length=length;
1305  params[t].max_shift=max_shift;
1306  params[t].shift=shift;
1307  params[t].vec_idx=vec_idx;
1308  pthread_create(&threads[t], NULL, CWeightedDegreePositionStringKernel::compute_batch_helper, (void*)&params[t]);
1309  }
1310 
1311  params[t].vec=&vec[num_feat*t];
1312  params[t].result=result;
1313  params[t].weights=weights;
1314  params[t].kernel=this;
1315  params[t].tries=&tries;
1316  params[t].factor=factor;
1317  params[t].j=j;
1318  params[t].start=t*step;
1319  params[t].end=num_vec;
1320  params[t].length=length;
1321  params[t].max_shift=max_shift;
1322  params[t].shift=shift;
1323  params[t].vec_idx=vec_idx;
1324  compute_batch_helper((void*) &params[t]);
1325 
1326  for (t=0; t<num_threads-1; t++)
1327  pthread_join(threads[t], NULL);
1328  SG_PROGRESS(j,0,num_feat)
1329 
1330  SG_FREE(params);
1331  SG_FREE(threads);
1332  }
1333  }
1334 #endif
1335 
1336  SG_FREE(vec);
1337 
1338  //really also free memory as this can be huge on testing especially when
1339  //using the combined kernel
1341 }
1342 
1344  int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* result,
1345  int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
1346 {
1349 
1350  num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
1351  ASSERT(num_feat>0)
1352  ASSERT(alphabet)
1354 
1355  num_sym=4; //for now works only w/ DNA
1356 
1357  ASSERT(max_degree>0)
1358 
1359  // === variables
1360  int32_t* nofsKmers=SG_MALLOC(int32_t, max_degree);
1361  float64_t** C=SG_MALLOC(float64_t*, max_degree);
1362  float64_t** L=SG_MALLOC(float64_t*, max_degree);
1363  float64_t** R=SG_MALLOC(float64_t*, max_degree);
1364 
1365  int32_t i;
1366  int32_t k;
1367 
1368  // --- return table
1369  int32_t bigtabSize=0;
1370  for (k=0; k<max_degree; ++k )
1371  {
1372  nofsKmers[k]=(int32_t) CMath::pow(num_sym, k+1);
1373  const int32_t tabSize=nofsKmers[k]*num_feat;
1374  bigtabSize+=tabSize;
1375  }
1376  result=SG_MALLOC(float64_t, bigtabSize);
1377 
1378  // --- auxilliary tables
1379  int32_t tabOffs=0;
1380  for( k = 0; k < max_degree; ++k )
1381  {
1382  const int32_t tabSize = nofsKmers[k] * num_feat;
1383  C[k] = &result[tabOffs];
1384  L[k] = SG_MALLOC(float64_t, tabSize );
1385  R[k] = SG_MALLOC(float64_t, tabSize );
1386  tabOffs+=tabSize;
1387  for(i = 0; i < tabSize; i++ )
1388  {
1389  C[k][i] = 0.0;
1390  L[k][i] = 0.0;
1391  R[k][i] = 0.0;
1392  }
1393  }
1394 
1395  // --- tree parsing info
1396  float64_t* margFactors=SG_MALLOC(float64_t, degree);
1397 
1398  int32_t* x = SG_MALLOC(int32_t, degree+1 );
1399  int32_t* substrs = SG_MALLOC(int32_t, degree+1 );
1400  // - fill arrays
1401  margFactors[0] = 1.0;
1402  substrs[0] = 0;
1403  for( k=1; k < degree; ++k ) {
1404  margFactors[k] = 0.25 * margFactors[k-1];
1405  substrs[k] = -1;
1406  }
1407  substrs[degree] = -1;
1408  // - fill struct
1409  struct TreeParseInfo info;
1410  info.num_sym = num_sym;
1411  info.num_feat = num_feat;
1412  info.p = -1;
1413  info.k = -1;
1414  info.nofsKmers = nofsKmers;
1415  info.margFactors = margFactors;
1416  info.x = x;
1417  info.substrs = substrs;
1418  info.y0 = 0;
1419  info.C_k = NULL;
1420  info.L_k = NULL;
1421  info.R_k = NULL;
1422 
1423  // === main loop
1424  i = 0; // total progress
1425  for( k = 0; k < max_degree; ++k )
1426  {
1427  const int32_t nofKmers = nofsKmers[ k ];
1428  info.C_k = C[k];
1429  info.L_k = L[k];
1430  info.R_k = R[k];
1431 
1432  // --- run over all trees
1433  for(int32_t p = 0; p < num_feat; ++p )
1434  {
1435  init_optimization( num_suppvec, IDX, alphas, p );
1436  int32_t tree = p ;
1437  for(int32_t j = 0; j < degree+1; j++ ) {
1438  x[j] = -1;
1439  }
1440  tries.traverse( tree, p, info, 0, x, k );
1441  SG_PROGRESS(i++,0,num_feat*max_degree)
1442  }
1443 
1444  // --- add partial overlap scores
1445  if( k > 0 ) {
1446  const int32_t j = k - 1;
1447  const int32_t nofJmers = (int32_t) CMath::pow( num_sym, j+1 );
1448  for(int32_t p = 0; p < num_feat; ++p ) {
1449  const int32_t offsetJ = nofJmers * p;
1450  const int32_t offsetJ1 = nofJmers * (p+1);
1451  const int32_t offsetK = nofKmers * p;
1452  int32_t y;
1453  int32_t sym;
1454  for( y = 0; y < nofJmers; ++y ) {
1455  for( sym = 0; sym < num_sym; ++sym ) {
1456  const int32_t y_sym = num_sym*y + sym;
1457  const int32_t sym_y = nofJmers*sym + y;
1458  ASSERT(0<=y_sym && y_sym<nofKmers)
1459  ASSERT(0<=sym_y && sym_y<nofKmers)
1460  C[k][ y_sym + offsetK ] += L[j][ y + offsetJ ];
1461  if( p < num_feat-1 ) {
1462  C[k][ sym_y + offsetK ] += R[j][ y + offsetJ1 ];
1463  }
1464  }
1465  }
1466  }
1467  }
1468  // if( k > 1 )
1469  // j = k-1
1470  // for all positions p
1471  // for all j-mers y
1472  // for n in {A,C,G,T}
1473  // C_k[ p, [y,n] ] += L_j[ p, y ]
1474  // C_k[ p, [n,y] ] += R_j[ p+1, y ]
1475  // end;
1476  // end;
1477  // end;
1478  // end;
1479  }
1480 
1481  // === return a vector
1482  num_feat=1;
1483  num_sym = bigtabSize;
1484  // --- clean up
1485  SG_FREE(nofsKmers);
1486  SG_FREE(margFactors);
1487  SG_FREE(substrs);
1488  SG_FREE(x);
1489  SG_FREE(C);
1490  for( k = 0; k < max_degree; ++k ) {
1491  SG_FREE(L[k]);
1492  SG_FREE(R[k]);
1493  }
1494  SG_FREE(L);
1495  SG_FREE(R);
1496  return result;
1497 }
1498 
1500  int32_t &num_feat, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
1501 {
1504  //only works for order <= 32
1505  ASSERT(degree<=32)
1507  num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
1508  ASSERT(num_feat>0)
1509  ASSERT(alphabet)
1511 
1512  //consensus
1513  char* result=SG_MALLOC(char, num_feat);
1514 
1515  //backtracking and scoring table
1516  int32_t num_tables=CMath::max(1,num_feat-degree+1);
1517  DynArray<ConsensusEntry>** table=SG_MALLOC(DynArray<ConsensusEntry>*, num_tables);
1518 
1519  for (int32_t i=0; i<num_tables; i++)
1520  table[i]=new DynArray<ConsensusEntry>(num_suppvec/10);
1521 
1522  //compute consensus via dynamic programming
1523  for (int32_t i=0; i<num_tables; i++)
1524  {
1525  bool cumulative=false;
1526 
1527  if (i<num_tables-1)
1528  init_optimization(num_suppvec, IDX, alphas, i);
1529  else
1530  {
1531  init_optimization(num_suppvec, IDX, alphas, i, num_feat-1);
1532  cumulative=true;
1533  }
1534 
1535  if (i==0)
1536  tries.fill_backtracking_table(i, NULL, table[i], cumulative, weights);
1537  else
1538  tries.fill_backtracking_table(i, table[i-1], table[i], cumulative, weights);
1539 
1540  SG_PROGRESS(i,0,num_feat)
1541  }
1542 
1543 
1544  //int32_t n=table[0]->get_num_elements();
1545 
1546  //for (int32_t i=0; i<n; i++)
1547  //{
1548  // ConsensusEntry e= table[0]->get_element(i);
1549  // SG_PRint32_t("first: str:0%0llx sc:%f bt:%d\n",e.string,e.score,e.bt);
1550  //}
1551 
1552  //n=table[num_tables-1]->get_num_elements();
1553  //for (int32_t i=0; i<n; i++)
1554  //{
1555  // ConsensusEntry e= table[num_tables-1]->get_element(i);
1556  // SG_PRint32_t("last: str:0%0llx sc:%f bt:%d\n",e.string,e.score,e.bt);
1557  //}
1558  //n=table[num_tables-2]->get_num_elements();
1559  //for (int32_t i=0; i<n; i++)
1560  //{
1561  // ConsensusEntry e= table[num_tables-2]->get_element(i);
1562  // SG_PRINT("second last: str:0%0llx sc:%f bt:%d\n",e.string,e.score,e.bt)
1563  //}
1564 
1565  const char* acgt="ACGT";
1566 
1567  //backtracking start
1568  int32_t max_idx=-1;
1569  float32_t max_score=0;
1570  int32_t num_elements=table[num_tables-1]->get_num_elements();
1571 
1572  for (int32_t i=0; i<num_elements; i++)
1573  {
1574  float64_t sc=table[num_tables-1]->get_element(i).score;
1575  if (sc>max_score || max_idx==-1)
1576  {
1577  max_idx=i;
1578  max_score=sc;
1579  }
1580  }
1581  uint64_t endstr=table[num_tables-1]->get_element(max_idx).string;
1582 
1583  SG_INFO("max_idx:%d num_el:%d num_feat:%d num_tables:%d max_score:%f\n", max_idx, num_elements, num_feat, num_tables, max_score)
1584 
1585  for (int32_t i=0; i<degree; i++)
1586  result[num_feat-1-i]=acgt[(endstr >> (2*i)) & 3];
1587 
1588  if (num_tables>1)
1589  {
1590  for (int32_t i=num_tables-1; i>=0; i--)
1591  {
1592  //SG_PRINT("max_idx: %d, i:%d\n", max_idx, i)
1593  result[i]=acgt[table[i]->get_element(max_idx).string >> (2*(degree-1)) & 3];
1594  max_idx=table[i]->get_element(max_idx).bt;
1595  }
1596  }
1597 
1598  //for (int32_t t=0; t<num_tables; t++)
1599  //{
1600  // n=table[t]->get_num_elements();
1601  // for (int32_t i=0; i<n; i++)
1602  // {
1603  // ConsensusEntry e= table[t]->get_element(i);
1604  // SG_PRINT("table[%d,%d]: str:0%0llx sc:%+f bt:%d\n",t,i, e.string,e.score,e.bt)
1605  // }
1606  //}
1607 
1608  for (int32_t i=0; i<num_tables; i++)
1609  delete table[i];
1610 
1611  SG_FREE(table);
1612  return result;
1613 }
1614 
1615 
1617  int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
1618  float64_t* w_result, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
1619 {
1621  use_poim_tries=true;
1622  poim_tries.delete_trees(false);
1623 
1624  // === check
1627  num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
1628  ASSERT(num_feat>0)
1630  ASSERT(max_degree>0)
1631 
1632  // === general variables
1633  static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
1634  const int32_t seqLen = num_feat;
1635  float64_t** subs;
1636  int32_t i;
1637  int32_t k;
1638  //int32_t y;
1639 
1640  // === init tables "subs" for substring scores / POIMs
1641  // --- compute table sizes
1642  int32_t* offsets;
1643  int32_t offset;
1644  offsets = SG_MALLOC(int32_t, max_degree );
1645  offset = 0;
1646  for( k = 0; k < max_degree; ++k ) {
1647  offsets[k] = offset;
1648  const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
1649  const int32_t tabSize = nofsKmers * seqLen;
1650  offset += tabSize;
1651  }
1652  // --- allocate memory
1653  const int32_t bigTabSize = offset;
1654  w_result=SG_MALLOC(float64_t, bigTabSize);
1655  for (i=0; i<bigTabSize; ++i)
1656  w_result[i]=0;
1657 
1658  // --- set pointers for tables
1659  subs = SG_MALLOC(float64_t*, max_degree );
1660  ASSERT( subs != NULL )
1661  for( k = 0; k < max_degree; ++k ) {
1662  subs[k] = &w_result[ offsets[k] ];
1663  }
1664  SG_FREE(offsets);
1665 
1666  // === init trees; extract "w"
1667  init_optimization( num_suppvec, IDX, alphas, -1);
1668  poim_tries.POIMs_extract_W( subs, max_degree );
1669 
1670  // === clean; return "subs" as vector
1671  SG_FREE(subs);
1672  num_feat = 1;
1673  num_sym = bigTabSize;
1674  use_poim_tries=false;
1675  poim_tries.delete_trees(false);
1676  return w_result;
1677 }
1678 
1680  int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
1681  float64_t* poim_result, int32_t num_suppvec, int32_t* IDX,
1682  float64_t* alphas, float64_t* distrib )
1683 {
1685  use_poim_tries=true;
1686  poim_tries.delete_trees(false);
1687 
1688  // === check
1691  num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
1692  ASSERT(num_feat>0)
1694  ASSERT(max_degree!=0)
1695  ASSERT(distrib)
1696 
1697  // === general variables
1698  static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
1699  const int32_t seqLen = num_feat;
1700  float64_t** subs;
1701  int32_t i;
1702  int32_t k;
1703 
1704  // === DEBUGGING mode
1705  //
1706  // Activated if "max_degree" < 0.
1707  // Allows to output selected partial score.
1708  //
1709  // |max_degree| mod 4
1710  // 0: substring
1711  // 1: superstring
1712  // 2: left overlap
1713  // 3: right overlap
1714  //
1715  const int32_t debug = ( max_degree < 0 ) ? ( abs(max_degree) % 4 + 1 ) : 0;
1716  if( debug ) {
1717  max_degree = abs(max_degree) / 4;
1718  switch( debug ) {
1719  case 1: {
1720  printf( "POIM DEBUGGING: substring only (max order=%d)\n", max_degree );
1721  break;
1722  }
1723  case 2: {
1724  printf( "POIM DEBUGGING: superstring only (max order=%d)\n", max_degree );
1725  break;
1726  }
1727  case 3: {
1728  printf( "POIM DEBUGGING: left overlap only (max order=%d)\n", max_degree );
1729  break;
1730  }
1731  case 4: {
1732  printf( "POIM DEBUGGING: right overlap only (max order=%d)\n", max_degree );
1733  break;
1734  }
1735  default: {
1736  printf( "POIM DEBUGGING: something is wrong (max order=%d)\n", max_degree );
1737  ASSERT(0)
1738  break;
1739  }
1740  }
1741  }
1742 
1743  // --- compute table sizes
1744  int32_t* offsets;
1745  int32_t offset;
1746  offsets = SG_MALLOC(int32_t, max_degree );
1747  offset = 0;
1748  for( k = 0; k < max_degree; ++k ) {
1749  offsets[k] = offset;
1750  const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
1751  const int32_t tabSize = nofsKmers * seqLen;
1752  offset += tabSize;
1753  }
1754  // --- allocate memory
1755  const int32_t bigTabSize=offset;
1756  poim_result=SG_MALLOC(float64_t, bigTabSize);
1757  for (i=0; i<bigTabSize; ++i )
1758  poim_result[i]=0;
1759 
1760  // --- set pointers for tables
1761  subs=SG_MALLOC(float64_t*, max_degree);
1762  for (k=0; k<max_degree; ++k)
1763  subs[k]=&poim_result[offsets[k]];
1764 
1765  SG_FREE(offsets);
1766 
1767  // === init trees; precalc S, L and R
1768  init_optimization( num_suppvec, IDX, alphas, -1);
1769  poim_tries.POIMs_precalc_SLR( distrib );
1770 
1771  // === compute substring scores
1772  if( debug==0 || debug==1 ) {
1773  poim_tries.POIMs_extract_W( subs, max_degree );
1774  for( k = 1; k < max_degree; ++k ) {
1775  const int32_t nofKmers2 = ( k > 1 ) ? (int32_t) CMath::pow(NUM_SYMS,k-1) : 0;
1776  const int32_t nofKmers1 = (int32_t) CMath::pow( NUM_SYMS, k );
1777  const int32_t nofKmers0 = nofKmers1 * NUM_SYMS;
1778  for( i = 0; i < seqLen; ++i ) {
1779  float64_t* const subs_k2i1 = ( k>1 && i<seqLen-1 ) ? &subs[k-2][(i+1)*nofKmers2] : NULL;
1780  float64_t* const subs_k1i1 = ( i < seqLen-1 ) ? &subs[k-1][(i+1)*nofKmers1] : NULL;
1781  float64_t* const subs_k1i0 = & subs[ k-1 ][ i*nofKmers1 ];
1782  float64_t* const subs_k0i = & subs[ k-0 ][ i*nofKmers0 ];
1783  int32_t y0;
1784  for( y0 = 0; y0 < nofKmers0; ++y0 ) {
1785  const int32_t y1l = y0 / NUM_SYMS;
1786  const int32_t y1r = y0 % nofKmers1;
1787  const int32_t y2 = y1r / NUM_SYMS;
1788  subs_k0i[ y0 ] += subs_k1i0[ y1l ];
1789  if( i < seqLen-1 ) {
1790  subs_k0i[ y0 ] += subs_k1i1[ y1r ];
1791  if( k > 1 ) {
1792  subs_k0i[ y0 ] -= subs_k2i1[ y2 ];
1793  }
1794  }
1795  }
1796  }
1797  }
1798  }
1799 
1800  // === compute POIMs
1801  poim_tries.POIMs_add_SLR( subs, max_degree, debug );
1802 
1803  // === clean; return "subs" as vector
1804  SG_FREE(subs);
1805  num_feat = 1;
1806  num_sym = bigTabSize;
1807 
1808  use_poim_tries=false;
1809  poim_tries.delete_trees(false);
1810 
1811  return poim_result;
1812 }
1813 
1814 
1816 {
1817  SG_FREE(m_poim_distrib);
1818  int32_t num_sym=distrib.num_cols;
1819  int32_t num_feat=distrib.num_rows;
1820  m_poim_distrib=SG_MALLOC(float64_t, num_sym*num_feat);
1821  memcpy(m_poim_distrib, distrib.matrix, num_sym*num_feat*sizeof(float64_t));
1822  m_poim_num_sym=num_sym;
1823  m_poim_num_feat=num_feat;
1824 }
1825 
1827  int32_t max_degree, CSVM* svm)
1828 {
1829  ASSERT(svm)
1830  int32_t num_suppvec=svm->get_num_support_vectors();
1831  int32_t* sv_idx=SG_MALLOC(int32_t, num_suppvec);
1832  float64_t* sv_weight=SG_MALLOC(float64_t, num_suppvec);
1833 
1834  for (int32_t i=0; i<num_suppvec; i++)
1835  {
1836  sv_idx[i]=svm->get_support_vector(i);
1837  sv_weight[i]=svm->get_alpha(i);
1838  }
1839 
1840  if ((max_degree < 1) || (max_degree > 12))
1841  {
1842  //SG_WARNING("max_degree out of range 1..12 (%d).\n", max_degree)
1843  SG_WARNING("max_degree out of range 1..12 (%d). setting to 1.\n", max_degree)
1844  max_degree=1;
1845  }
1846 
1847  int32_t num_feat = m_poim_num_feat;
1848  int32_t num_sym = m_poim_num_sym;
1849  SG_FREE(m_poim);
1850 
1851  m_poim = compute_POIM(max_degree, num_feat, num_sym, NULL, num_suppvec, sv_idx,
1852  sv_weight, m_poim_distrib);
1853 
1854  ASSERT(num_feat==1)
1855  m_poim_result_len=num_sym;
1856 
1857  SG_FREE(sv_weight);
1858  SG_FREE(sv_idx);
1859 }
1860 
1862 {
1864  return poim;
1865 }
1866 
1868 {
1869  SG_FREE(m_poim) ;
1870  m_poim=NULL ;
1871  SG_FREE(m_poim_distrib) ;
1872  m_poim_distrib=NULL ;
1873  m_poim_num_sym=0 ;
1874  m_poim_num_sym=0 ;
1875  m_poim_result_len=0 ;
1876 }
1877 
1879 {
1881 
1884 
1885  if (weights)
1887 }
1888 
1889 void CWeightedDegreePositionStringKernel::init()
1890 {
1891  weights=NULL;
1892  weights_length=0;
1893  weights_degree=0;
1894  position_weights=NULL;
1896 
1897  position_weights_lhs=NULL;
1899  position_weights_rhs=NULL;
1901 
1902  weights_buffer=NULL;
1903  mkl_stepsize=1;
1904  degree=1;
1905  length=0;
1906 
1907  max_shift=0;
1908  max_mismatch=0;
1909  seq_length=0;
1910  shift=NULL;
1911  shift_len=0;
1912 
1913  block_weights=NULL;
1914  block_computation=true;
1915  type=E_EXTERNAL;
1916  which_degree=-1;
1917  tries=CTrie<DNATrie>(1);
1919 
1920  tree_initialized=false;
1921  use_poim_tries=false;
1922  m_poim_distrib=NULL;
1923 
1924  m_poim=NULL;
1925  m_poim_num_sym=0;
1926  m_poim_num_feat=0;
1928 
1929  alphabet=NULL;
1930 
1932 
1934 
1936  "weights", "WD Kernel weights.");
1938  "position_weights",
1939  "Weights per position.");
1941  "position_weights_lhs",
1942  "Weights per position left hand side.");
1944  "position_weights_rhs",
1945  "Weights per position right hand side.");
1947  "shift",
1948  "Shift Vector.");
1949  SG_ADD(&max_shift, "max_shift", "Maximal shift.", MS_AVAILABLE);
1950  SG_ADD(&mkl_stepsize, "mkl_stepsize", "MKL step size.", MS_AVAILABLE);
1951  SG_ADD(&degree, "degree", "Order of WD kernel.", MS_AVAILABLE);
1952  SG_ADD(&max_mismatch, "max_mismatch",
1953  "Number of allowed mismatches.", MS_AVAILABLE);
1954  SG_ADD(&block_computation, "block_computation",
1955  "If block computation shall be used.", MS_NOT_AVAILABLE);
1956  SG_ADD((machine_int_t*) &type, "type",
1957  "WeightedDegree kernel type.", MS_AVAILABLE);
1958  SG_ADD(&which_degree, "which_degree",
1959  "The selected degree. All degrees are used by default (for value -1).",
1960  MS_AVAILABLE);
1961  SG_ADD((CSGObject**) &alphabet, "alphabet",
1962  "Alphabet of Features.", MS_NOT_AVAILABLE);
1963 }
RNA - letters A,C,G,U.
Definition: Alphabet.h:32
virtual void load_serializable_post()
Definition: Kernel.cpp:928
virtual int32_t get_max_vector_length()
SGVector< ST > get_feature_vector(int32_t num)
virtual float64_t compute(int32_t idx_a, int32_t idx_b)
void delete_trees(bool p_use_compact_terminal_nodes=true)
Definition: Trie.h:1171
float64_t * compute_scoring(int32_t max_degree, int32_t &num_feat, int32_t &num_sym, float64_t *target, int32_t num_suppvec, int32_t *IDX, float64_t *weights)
#define SG_INFO(...)
Definition: SGIO.h:118
virtual void cleanup()
Definition: Kernel.cpp:173
T get_element(int32_t index) const
Definition: DynArray.h:142
virtual void compute_batch(int32_t num_vec, int32_t *vec_idx, float64_t *target, int32_t num_suppvec, int32_t *IDX, float64_t *alphas, float64_t factor=1.0)
#define SG_DONE()
Definition: SGIO.h:157
void add_example_to_single_tree(int32_t idx, float64_t weight, int32_t tree_num)
void POIMs_add_SLR(float64_t *const *const poims, const int32_t K, const int32_t debug)
DNA - letters A,C,G,T.
Definition: Alphabet.h:26
int32_t get_num_threads() const
Definition: Parallel.cpp:64
#define SG_PROGRESS(...)
Definition: SGIO.h:142
Template class Trie implements a suffix trie, i.e. a tree in which all suffixes up to a certain lengt...
Definition: Trie.h:136
void POIMs_extract_W(float64_t *const *const W, const int32_t K)
float64_t compute_with_mismatch(char *avec, int32_t alen, char *bvec, int32_t blen)
void traverse(int32_t tree, const int32_t p, struct TreeParseInfo info, const int32_t depth, int32_t *const x, const int32_t k)
Definition: Trie.h:1392
Class ShogunException defines an exception which is thrown whenever an error inside of shogun occurs...
void destroy()
Definition: Trie.h:1135
virtual bool set_normalizer(CKernelNormalizer *normalizer)
Definition: Kernel.cpp:150
EAlphabet get_alphabet() const
Definition: Alphabet.h:130
virtual float64_t normalize_rhs(float64_t value, int32_t idx_rhs)=0
int32_t NUM_SYMS
Definition: Trie.h:633
float64_t * compute_POIM(int32_t max_degree, int32_t &num_feat, int32_t &num_sym, float64_t *poim_result, int32_t num_suppvec, int32_t *IDX, float64_t *alphas, float64_t *distrib)
CFeatures * get_rhs()
Definition: Kernel.h:510
#define SG_ERROR(...)
Definition: SGIO.h:129
void set_is_initialized(bool p_init)
Definition: Kernel.h:899
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:91
Parameter * m_parameters
Definition: SGObject.h:378
index_t num_cols
Definition: SGMatrix.h:378
int32_t get_num_elements() const
Definition: DynArray.h:130
uint64_t properties
Definition: Kernel.h:1082
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
Parallel * parallel
Definition: SGObject.h:372
uint8_t remap_to_bin(uint8_t c)
Definition: Alphabet.h:159
float64_t compute_by_tree_helper(int32_t *vec, int32_t len, int32_t seq_pos, int32_t tree_pos, int32_t weight_pos, float64_t *weights, bool degree_times_position_weights)
Definition: Trie.h:1722
void POIMs_precalc_SLR(const float64_t *const distrib)
index_t num_rows
Definition: SGMatrix.h:376
bool get_is_initialized()
Definition: Kernel.h:753
void fill_backtracking_table(int32_t pos, DynArray< ConsensusEntry > *prev, DynArray< ConsensusEntry > *cur, bool cumulative, float64_t *weights)
Definition: Trie.h:2087
void set_position_weights(float64_t *p_position_weights)
Definition: Trie.h:485
index_t vlen
Definition: SGVector.h:494
virtual bool init_optimization(int32_t p_count, int32_t *IDX, float64_t *alphas)
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:112
void set_use_compact_terminal_nodes(bool p_use_compact_terminal_nodes)
Definition: Trie.h:466
static void clear_cancel()
Definition: Signal.cpp:129
virtual void add_example_to_tree(int32_t idx, float64_t weight)
Template Dynamic array class that creates an array that can be used like a list or an array...
Definition: DynArray.h:32
double float64_t
Definition: common.h:50
void create(int32_t len, bool p_use_compact_terminal_nodes=true)
Definition: Trie.h:1156
EOptimizationType get_optimization_type()
Definition: Kernel.h:741
float64_t compute_without_mismatch_matrix(char *avec, int32_t alen, char *bvec, int32_t blen)
virtual void set_position_weights(SGVector< float64_t > pws)
float64_t get_alpha(int32_t idx)
static T max(T a, T b)
Definition: Math.h:168
int32_t get_support_vector(int32_t idx)
static bool cancel_computations()
Definition: Signal.h:86
virtual bool init_normalizer()
Definition: Kernel.cpp:168
float float32_t
Definition: common.h:49
bool have_same_length(int32_t len=-1)
EOptimizationType opt_type
Definition: Kernel.h:1079
CFeatures * rhs
feature vectors to occur on right hand side
Definition: Kernel.h:1061
static int64_t nchoosek(int32_t n, int32_t k)
Definition: Math.h:1191
#define SG_UNREF(x)
Definition: SGObject.h:52
bool set_position_weights_lhs(float64_t *pws, int32_t len, int32_t num)
void add_vector(bool **param, index_t *length, const char *name, const char *description="")
Definition: Parameter.cpp:334
#define SG_DEBUG(...)
Definition: SGIO.h:107
char * compute_consensus(int32_t &num_feat, int32_t num_suppvec, int32_t *IDX, float64_t *alphas)
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
float64_t * compute_abs_weights(int32_t &len)
Definition: Trie.h:1214
void add_to_trie(int32_t i, int32_t seq_offset, int32_t *vec, float32_t alpha, float64_t *weights, bool degree_times_position_weights)
Definition: Trie.h:1496
int machine_int_t
Definition: common.h:59
CFeatures * lhs
feature vectors to occur on left hand side
Definition: Kernel.h:1059
The class Features is the base class of all feature objects.
Definition: Features.h:68
static T min(T a, T b)
Definition: Math.h:157
static float64_t log(float64_t v)
Definition: Math.h:922
virtual void remove_lhs()
Definition: Kernel.cpp:679
A generic Support Vector Machine Interface.
Definition: SVM.h:49
bool get_use_compact_terminal_nodes()
Definition: Trie.h:456
CKernelNormalizer * normalizer
Definition: Kernel.h:1086
bool set_position_weights_rhs(float64_t *pws, int32_t len, int32_t num)
#define SG_WARNING(...)
Definition: SGIO.h:128
The Weighted Degree Position String kernel (Weighted Degree kernel with shifts).
#define SG_ADD(...)
Definition: SGObject.h:81
friend class CSqrtDiagKernelNormalizer
Definition: Kernel.h:161
float64_t * extract_w(int32_t max_degree, int32_t &num_feat, int32_t &num_sym, float64_t *w_result, int32_t num_suppvec, int32_t *IDX, float64_t *alphas)
virtual float64_t normalize_lhs(float64_t value, int32_t idx_lhs)=0
void add_matrix(bool **param, index_t *length_y, index_t *length_x, const char *name, const char *description="")
Definition: Parameter.cpp:943
Template class StringKernel, is the base class of all String Kernels.
Definition: StringKernel.h:26
static int32_t pow(bool x, int32_t n)
Definition: Math.h:535
float64_t compute_without_mismatch(char *avec, int32_t alen, char *bvec, int32_t blen)
float64_t compute_without_mismatch_position_weights(char *avec, float64_t *posweights_lhs, int32_t alen, char *bvec, float64_t *posweights_rhs, int32_t blen)
virtual int32_t get_vector_length(int32_t vec_num)

SHOGUN Machine Learning Toolbox - Documentation