SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Trie.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2009 Gunnar Raetsch
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #ifndef _TRIE_H___
13 #define _TRIE_H___
14 
15 #include <string.h>
16 #include <shogun/lib/common.h>
17 #include <shogun/io/SGIO.h>
18 #include <shogun/base/DynArray.h>
20 #include <shogun/base/SGObject.h>
21 
22 namespace shogun
23 {
24 #ifndef DOXYGEN_SHOULD_SKIP_THIS
25 
26 // sentinel is 0xFFFFFFFC or float -2
27 #define NO_CHILD ((int32_t)-1073741824)
28 
29 #define WEIGHTS_IN_TRIE
30 //#define TRIE_CHECK_EVERYTHING
31 
32 #ifdef TRIE_CHECK_EVERYTHING
33 #define TRIE_ASSERT_EVERYTHING(x) ASSERT(x)
34 #else
35 #define TRIE_ASSERT_EVERYTHING(x)
36 #endif
37 
38 //#define TRIE_ASSERT(x) ASSERT(x)
39 #define TRIE_ASSERT(x)
40 
41 #define TRIE_TERMINAL_CHARACTER 7
42 
44 struct ConsensusEntry
45 {
47  uint64_t string;
49  float32_t score;
51  int32_t bt;
52 };
53 
55 struct POIMTrie
56 {
58  float64_t weight;
59 #ifdef TRIE_CHECK_EVERYTHING
60 
61  bool has_seq;
63  bool has_floats;
64 #endif
65  union
66  {
68  float32_t child_weights[4];
70  int32_t children[4];
72  uint8_t seq[16] ;
73  };
74 
76  float64_t S;
78  float64_t L;
80  float64_t R;
81 };
82 
84 struct DNATrie
85 {
87  float64_t weight;
88 #ifdef TRIE_CHECK_EVERYTHING
89 
90  bool has_seq;
92  bool has_floats;
93 #endif
94  union
95  {
97  float32_t child_weights[4];
99  int32_t children[4];
101  uint8_t seq[16] ;
102  };
103 };
104 
106 struct TreeParseInfo {
108  int32_t num_sym;
110  int32_t num_feat;
112  int32_t p;
114  int32_t k;
116  int32_t* nofsKmers;
118  float64_t* margFactors;
120  int32_t* x;
122  int32_t* substrs;
124  int32_t y0;
126  float64_t* C_k;
128  float64_t* L_k;
130  float64_t* R_k;
131 };
132 
133 #endif // DOXYGEN_SHOULD_SKIP_THIS
134 
135 template <class Trie> class CTrie;
136 
137 #define IGNORE_IN_CLASSLIST
138 
156 IGNORE_IN_CLASSLIST template <class Trie> class CTrie : public CSGObject
157 {
158  public:
160  CTrie();
161 
168  CTrie(int32_t d, bool p_use_compact_terminal_nodes=true);
169 
171  CTrie(const CTrie & to_copy);
172  virtual ~CTrie();
173 
175  const CTrie & operator=(const CTrie & to_copy);
176 
184  bool compare_traverse(
185  int32_t node, const CTrie & other, int32_t other_node);
186 
192  bool compare(const CTrie & other);
193 
200  bool find_node(int32_t node, int32_t * trace, int32_t &trace_len) const;
201 
208  int32_t find_deepest_node(
209  int32_t start_node, int32_t &deepest_node) const;
210 
215  void display_node(int32_t node) const;
216 
218  void destroy();
219 
224  void set_degree(int32_t d);
225 
232  void create(int32_t len, bool p_use_compact_terminal_nodes=true);
233 
239  void delete_trees(bool p_use_compact_terminal_nodes=true);
240 
251  void add_to_trie(
252  int32_t i, int32_t seq_offset, int32_t* vec, float32_t alpha,
253  float64_t *weights, bool degree_times_position_weights);
254 
261  float64_t compute_abs_weights_tree(int32_t tree, int32_t depth);
262 
268  float64_t* compute_abs_weights(int32_t &len);
269 
283  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
284  int32_t weight_pos, float64_t * weights,
285  bool degree_times_position_weights) ;
286 
302  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
303  int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
304  int32_t mkl_stepsize, float64_t * weights,
305  bool degree_times_position_weights);
306 
322  int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
323  int32_t max_degree, int32_t num_feat, int32_t num_sym,
324  int32_t sym_offset, int32_t offs, float64_t* result);
325 
339  int32_t tree, int32_t i, float64_t alpha, int32_t *vec,
340  int32_t len_rem, int32_t degree_rec, int32_t mismatch_rec,
341  int32_t max_mismatch, float64_t * weights);
342 
352  void traverse(
353  int32_t tree, const int32_t p, struct TreeParseInfo info,
354  const int32_t depth, int32_t* const x, const int32_t k);
355 
365  void count(
366  const float64_t w, const int32_t depth,
367  const struct TreeParseInfo info, const int32_t p, int32_t* x,
368  const int32_t k);
369 
376  int32_t compact_nodes(int32_t start_node, int32_t depth, float64_t * weights);
377 
387  int32_t pos, uint64_t seq, int32_t deg, float64_t* weights);
388 
399  Trie* tree, int32_t depth, uint64_t seq, float64_t value,
400  DynArray<ConsensusEntry>* table, float64_t* weights);
401 
411  int32_t pos, DynArray<ConsensusEntry>* prev,
412  DynArray<ConsensusEntry>* cur, bool cumulative,
413  float64_t* weights);
414 
420  void POIMs_extract_W(float64_t* const* const W, const int32_t K);
421 
426  void POIMs_precalc_SLR(const float64_t* const distrib);
427 
437  void POIMs_get_SLR(
438  const int32_t parentIdx, const int32_t sym, const int32_t depth,
439  float64_t* S, float64_t* L, float64_t* R);
440 
447  void POIMs_add_SLR(
448  float64_t* const* const poims, const int32_t K,
449  const int32_t debug);
450 
456  {
458  }
459 
466  bool p_use_compact_terminal_nodes)
467  {
468  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
469  }
470 
475  inline int32_t get_num_used_nodes()
476  {
477  return TreeMemPtr;
478  }
479 
484  inline void set_position_weights(float64_t* p_position_weights)
485  {
486  position_weights=p_position_weights;
487  }
488 
493  inline int32_t get_node(bool last_node=false)
494  {
495  int32_t ret = TreeMemPtr++;
496  check_treemem() ;
497 
498  if (last_node)
499  {
500  for (int32_t q=0; q<4; q++)
501  TreeMem[ret].child_weights[q]=0.0;
502  }
503  else
504  {
505  for (int32_t q=0; q<4; q++)
506  TreeMem[ret].children[q]=NO_CHILD;
507  }
508 #ifdef TRIE_CHECK_EVERYTHING
509  TreeMem[ret].has_seq=false ;
510  TreeMem[ret].has_floats=false ;
511 #endif
512  TreeMem[ret].weight=0.0;
513  return ret ;
514  }
515 
517  inline void check_treemem()
518  {
519  if (TreeMemPtr+10 < TreeMemPtrMax)
520  return;
521  SG_DEBUG( "Extending TreeMem from %i to %i elements\n",
522  TreeMemPtrMax, (int32_t) ((float64_t)TreeMemPtrMax*1.2));
523  TreeMemPtrMax = (int32_t) ((float64_t)TreeMemPtrMax*1.2);
524  TreeMem = SG_REALLOC(Trie, TreeMem, TreeMemPtrMax);
525  }
526 
531  inline void set_weights_in_tree(bool weights_in_tree_)
532  {
533  weights_in_tree = weights_in_tree_;
534  }
535 
540  inline bool get_weights_in_tree()
541  {
542  return weights_in_tree;
543  }
544 
555  const int32_t nodeIdx, const int32_t depth, const int32_t offset,
556  const int32_t y0, float64_t* const* const W, const int32_t K);
557 
571  const float64_t* const distrib, const int32_t i,
572  const int32_t nodeIdx, int32_t left_tries_idx[4],
573  const int32_t depth, int32_t const lastSym, float64_t* S,
574  float64_t* L, float64_t* R);
575 
576 
588  const float64_t* const distrib, const int32_t i,
589  const int32_t nodeIdx, int32_t left_tries_idx[4],
590  const int32_t depth, float64_t* S, float64_t* L, float64_t* R);
591 
603  const int32_t nodeIdx, const int32_t depth,const int32_t i,
604  const int32_t y0, float64_t* const* const poims, const int32_t K,
605  const int32_t debug);
606 
621  float64_t* const* const poims, const int32_t K, const int32_t k,
622  const int32_t i, const int32_t y, const float64_t valW,
623  const float64_t valS, const float64_t valL, const float64_t valR,
624  const int32_t debug);
625 
627  inline virtual const char* get_name() const { return "Trie"; }
628 
629  public:
631  int32_t NUM_SYMS;
632 
633  protected:
635  int32_t length;
637  int32_t * trees;
638 
640  int32_t degree;
643 
645  Trie* TreeMem;
647  int32_t TreeMemPtr;
649  int32_t TreeMemPtrMax;
652 
655 
657  int32_t* nofsKmers;
658 };
659  template <class Trie>
661  : CSGObject(), degree(0), position_weights(NULL),
662  use_compact_terminal_nodes(false),
663  weights_in_tree(true)
664  {
665 
666  TreeMemPtrMax=0;
667  TreeMemPtr=0;
668  TreeMem=NULL;
669 
670  length=0;
671  trees=NULL;
672 
673  NUM_SYMS=4;
674  }
675 
676  template <class Trie>
677  CTrie<Trie>::CTrie(int32_t d, bool p_use_compact_terminal_nodes)
678  : CSGObject(), degree(d), position_weights(NULL),
679  use_compact_terminal_nodes(p_use_compact_terminal_nodes),
680  weights_in_tree(true)
681  {
682  TreeMemPtrMax=1024*1024/sizeof(Trie);
683  TreeMemPtr=0;
685 
686  length=0;
687  trees=NULL;
688 
689  NUM_SYMS=4;
690  }
691 
692  template <class Trie>
693  CTrie<Trie>::CTrie(const CTrie & to_copy)
694  : CSGObject(to_copy), degree(to_copy.degree), position_weights(NULL),
695  use_compact_terminal_nodes(to_copy.use_compact_terminal_nodes)
696  {
697  if (to_copy.position_weights!=NULL)
698  {
700  /*SG_MALLOC(float64_t, to_copy.length);
701  for (int32_t i=0; i<to_copy.length; i++)
702  position_weights[i]=to_copy.position_weights[i]; */
703  }
704  else
705  position_weights=NULL;
706 
708  TreeMemPtr=to_copy.TreeMemPtr;
710  memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie));
711 
712  length=to_copy.length;
713  trees=SG_MALLOC(int32_t, length);
714  for (int32_t i=0; i<length; i++)
715  trees[i]=to_copy.trees[i];
716 
717  NUM_SYMS=4;
718  }
719 
720  template <class Trie>
722 {
723  degree=to_copy.degree ;
724  use_compact_terminal_nodes=to_copy.use_compact_terminal_nodes ;
725 
726  SG_FREE(position_weights);
727  position_weights=NULL ;
728  if (to_copy.position_weights!=NULL)
729  {
730  position_weights=to_copy.position_weights ;
731  /*position_weights = SG_MALLOC(float64_t, to_copy.length);
732  for (int32_t i=0; i<to_copy.length; i++)
733  position_weights[i]=to_copy.position_weights[i] ;*/
734  }
735  else
736  position_weights=NULL ;
737 
738  TreeMemPtrMax=to_copy.TreeMemPtrMax ;
739  TreeMemPtr=to_copy.TreeMemPtr ;
740  SG_FREE(TreeMem) ;
741  TreeMem = SG_MALLOC(Trie, TreeMemPtrMax);
742  memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie)) ;
743 
744  length = to_copy.length ;
745  if (trees)
746  SG_FREE(trees);
747  trees=SG_MALLOC(int32_t, length);
748  for (int32_t i=0; i<length; i++)
749  trees[i]=to_copy.trees[i] ;
750 
751  return *this ;
752 }
753 
754 template <class Trie>
756  int32_t start_node, int32_t& deepest_node) const
757 {
758 #ifdef TRIE_CHECK_EVERYTHING
759  int32_t ret=0 ;
760  SG_DEBUG("start_node=%i\n", start_node) ;
761 
762  if (start_node==NO_CHILD)
763  {
764  for (int32_t i=0; i<length; i++)
765  {
766  int32_t my_deepest_node ;
767  int32_t depth=find_deepest_node(i, my_deepest_node) ;
768  SG_DEBUG("start_node %i depth=%i\n", i, depth) ;
769  if (depth>ret)
770  {
771  deepest_node=my_deepest_node ;
772  ret=depth ;
773  }
774  }
775  return ret ;
776  }
777 
778  if (TreeMem[start_node].has_seq)
779  {
780  for (int32_t q=0; q<16; q++)
781  if (TreeMem[start_node].seq[q]!=TRIE_TERMINAL_CHARACTER)
782  ret++ ;
783  deepest_node=start_node ;
784  return ret ;
785  }
786  if (TreeMem[start_node].has_floats)
787  {
788  deepest_node=start_node ;
789  return 1 ;
790  }
791 
792  for (int32_t q=0; q<4; q++)
793  {
794  int32_t my_deepest_node ;
795  if (TreeMem[start_node].children[q]==NO_CHILD)
796  continue ;
797  int32_t depth=find_deepest_node(abs(TreeMem[start_node].children[q]), my_deepest_node) ;
798  if (depth>ret)
799  {
800  deepest_node=my_deepest_node ;
801  ret=depth ;
802  }
803  }
804  return ret ;
805 #else
806  SG_ERROR( "not implemented\n") ;
807  return 0 ;
808 #endif
809 }
810 
811  template <class Trie>
813  int32_t start_node, int32_t depth, float64_t * weights)
814 {
815  SG_ERROR( "code buggy\n") ;
816 
817  int32_t ret=0 ;
818 
819  if (start_node==NO_CHILD)
820  {
821  for (int32_t i=0; i<length; i++)
822  compact_nodes(i,1, weights) ;
823  return 0 ;
824  }
825  if (start_node<0)
826  return -1 ;
827 
828  if (depth==degree-1)
829  {
830  TRIE_ASSERT_EVERYTHING(TreeMem[start_node].has_floats) ;
831  int32_t num_used=0 ;
832  for (int32_t q=0; q<4; q++)
833  if (TreeMem[start_node].child_weights[q]!=0.0)
834  num_used++ ;
835  if (num_used>1)
836  return -1 ;
837  return 1 ;
838  }
839  TRIE_ASSERT_EVERYTHING(!TreeMem[start_node].has_floats) ;
840 
841  int32_t num_used = 0 ;
842  int32_t q_used=-1 ;
843 
844  for (int32_t q=0; q<4; q++)
845  {
846  if (TreeMem[start_node].children[q]==NO_CHILD)
847  continue ;
848  num_used++ ;
849  q_used=q ;
850  }
851  if (num_used>1)
852  {
853  if (depth>=degree-2)
854  return -1 ;
855  for (int32_t q=0; q<4; q++)
856  {
857  if (TreeMem[start_node].children[q]==NO_CHILD)
858  continue ;
859  int32_t num=compact_nodes(abs(TreeMem[start_node].children[q]), depth+1, weights) ;
860  if (num<=2)
861  continue ;
862  int32_t node=get_node() ;
863 
864  int32_t last_node=TreeMem[start_node].children[q] ;
865  if (weights_in_tree)
866  {
867  ASSERT(weights[depth]!=0.0) ;
868  TreeMem[node].weight=TreeMem[last_node].weight/weights[depth] ;
869  }
870  else
871  TreeMem[node].weight=TreeMem[last_node].weight ;
872 
873 #ifdef TRIE_CHECK_EVERYTHING
874  TreeMem[node].has_seq=true ;
875 #endif
876  memset(TreeMem[node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
877  for (int32_t n=0; n<num; n++)
878  {
879  ASSERT(depth+n+1<=degree-1) ;
880  ASSERT(last_node!=NO_CHILD) ;
881  if (depth+n+1==degree-1)
882  {
883  TRIE_ASSERT_EVERYTHING(TreeMem[last_node].has_floats) ;
884  int32_t k ;
885  for (k=0; k<4; k++)
886  if (TreeMem[last_node].child_weights[k]!=0.0)
887  break ;
888  if (k==4)
889  break ;
890  TreeMem[node].seq[n]=k ;
891  break ;
892  }
893  else
894  {
895  TRIE_ASSERT_EVERYTHING(!TreeMem[last_node].has_floats) ;
896  int32_t k ;
897  for (k=0; k<4; k++)
898  if (TreeMem[last_node].children[k]!=NO_CHILD)
899  break ;
900  if (k==4)
901  break ;
902  TreeMem[node].seq[n]=k ;
903  last_node=TreeMem[last_node].children[k] ;
904  }
905  }
906  TreeMem[start_node].children[q]=-node ;
907  }
908  return -1 ;
909  }
910  if (num_used==0)
911  return 0 ;
912 
913  ret=compact_nodes(abs(TreeMem[start_node].children[q_used]), depth+1, weights) ;
914  if (ret<0)
915  return ret ;
916  return ret+1 ;
917 }
918 
919 
920  template <class Trie>
922  int32_t node, const CTrie<Trie> & other, int32_t other_node)
923 {
924  SG_DEBUG("checking nodes %i and %i\n", node, other_node) ;
925  if (fabs(TreeMem[node].weight-other.TreeMem[other_node].weight)>=1e-5)
926  {
927  SG_DEBUG( "CTrie::compare: TreeMem[%i].weight=%f!=other.TreeMem[%i].weight=%f\n", node, TreeMem[node].weight, other_node,other.TreeMem[other_node].weight) ;
928  SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
929  display_node(node) ;
930  SG_DEBUG( "============================================================\n") ;
931  other.display_node(other_node) ;
932  SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
933  return false ;
934  }
935 
936 #ifdef TRIE_CHECK_EVERYTHING
937  if (TreeMem[node].has_seq!=other.TreeMem[other_node].has_seq)
938  {
939  SG_DEBUG( "CTrie::compare: TreeMem[%i].has_seq=%i!=other.TreeMem[%i].has_seq=%i\n", node, TreeMem[node].has_seq, other_node,other.TreeMem[other_node].has_seq) ;
940  SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
941  display_node(node) ;
942  SG_DEBUG( "============================================================\n") ;
943  other.display_node(other_node) ;
944  SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
945  return false ;
946  }
947  if (TreeMem[node].has_floats!=other.TreeMem[other_node].has_floats)
948  {
949  SG_DEBUG( "CTrie::compare: TreeMem[%i].has_floats=%i!=other.TreeMem[%i].has_floats=%i\n", node, TreeMem[node].has_floats, other_node, other.TreeMem[other_node].has_floats) ;
950  return false ;
951  }
952  if (other.TreeMem[other_node].has_floats)
953  {
954  for (int32_t q=0; q<4; q++)
955  if (fabs(TreeMem[node].child_weights[q]-other.TreeMem[other_node].child_weights[q])>1e-5)
956  {
957  SG_DEBUG( "CTrie::compare: TreeMem[%i].child_weights[%i]=%e!=other.TreeMem[%i].child_weights[%i]=%e\n", node, q,TreeMem[node].child_weights[q], other_node,q,other.TreeMem[other_node].child_weights[q]) ;
958  SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
959  display_node(node) ;
960  SG_DEBUG( "============================================================\n") ;
961  other.display_node(other_node) ;
962  SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
963  return false ;
964  }
965  }
966  if (other.TreeMem[other_node].has_seq)
967  {
968  for (int32_t q=0; q<16; q++)
969  if ((TreeMem[node].seq[q]!=other.TreeMem[other_node].seq[q]) && ((TreeMem[node].seq[q]<4)||(other.TreeMem[other_node].seq[q]<4)))
970  {
971  SG_DEBUG( "CTrie::compare: TreeMem[%i].seq[%i]=%i!=other.TreeMem[%i].seq[%i]=%i\n", node,q,TreeMem[node].seq[q], other_node,q,other.TreeMem[other_node].seq[q]) ;
972  SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
973  display_node(node) ;
974  SG_DEBUG( "============================================================\n") ;
975  other.display_node(other_node) ;
976  SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
977  return false ;
978  }
979  }
980  if (!other.TreeMem[other_node].has_seq && !other.TreeMem[other_node].has_floats)
981  {
982  for (int32_t q=0; q<4; q++)
983  {
984  if ((TreeMem[node].children[q]==NO_CHILD) && (other.TreeMem[other_node].children[q]==NO_CHILD))
985  continue ;
986  if ((TreeMem[node].children[q]==NO_CHILD)!=(other.TreeMem[other_node].children[q]==NO_CHILD))
987  {
988  SG_DEBUG( "CTrie::compare: TreeMem[%i].children[%i]=%i!=other.TreeMem[%i].children[%i]=%i\n", node,q,TreeMem[node].children[q], other_node,q,other.TreeMem[other_node].children[q]) ;
989  SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
990  display_node(node) ;
991  SG_DEBUG( "============================================================\n") ;
992  other.display_node(other_node) ;
993  SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
994  return false ;
995  }
996  if (!compare_traverse(abs(TreeMem[node].children[q]), other, abs(other.TreeMem[other_node].children[q])))
997  return false ;
998  }
999  }
1000 #else
1001  SG_ERROR( "not implemented\n") ;
1002 #endif
1003 
1004  return true ;
1005 }
1006 
1007  template <class Trie>
1009 {
1010  bool ret=true ;
1011  for (int32_t i=0; i<length; i++)
1012  if (!compare_traverse(trees[i], other, other.trees[i]))
1013  return false ;
1014  else
1015  SG_DEBUG("two tries at %i identical\n", i) ;
1016 
1017  return ret ;
1018 }
1019 
1020 template <class Trie>
1022  int32_t node, int32_t * trace, int32_t& trace_len) const
1023 {
1024 #ifdef TRIE_CHECK_EVERYTHING
1025  ASSERT(trace_len-1>=0) ;
1026  ASSERT((trace[trace_len-1]>=0) && (trace[trace_len-1]<TreeMemPtrMax))
1027  if (TreeMem[trace[trace_len-1]].has_seq)
1028  return false ;
1029  if (TreeMem[trace[trace_len-1]].has_floats)
1030  return false ;
1031 
1032  for (int32_t q=0; q<4; q++)
1033  {
1034  if (TreeMem[trace[trace_len-1]].children[q]==NO_CHILD)
1035  continue ;
1036  int32_t tl=trace_len+1 ;
1037  if (TreeMem[trace[trace_len-1]].children[q]>=0)
1038  trace[trace_len]=TreeMem[trace[trace_len-1]].children[q] ;
1039  else
1040  trace[trace_len]=-TreeMem[trace[trace_len-1]].children[q] ;
1041 
1042  if (trace[trace_len]==node)
1043  {
1044  trace_len=tl ;
1045  return true ;
1046  }
1047  if (find_node(node, trace, tl))
1048  {
1049  trace_len=tl ;
1050  return true ;
1051  }
1052  }
1053  trace_len=0 ;
1054  return false ;
1055 #else
1056  SG_ERROR( "not implemented\n") ;
1057  return false ;
1058 #endif
1059 }
1060 
1061 template <class Trie>
1062 void CTrie<Trie>::display_node(int32_t node) const
1063 {
1064 #ifdef TRIE_CHECK_EVERYTHING
1065  int32_t * trace=SG_MALLOC(int32_t, 2*degree);
1066  int32_t trace_len=-1 ;
1067  bool found = false ;
1068  int32_t tree=-1 ;
1069  for (tree=0; tree<length; tree++)
1070  {
1071  trace[0]=trees[tree] ;
1072  trace_len=1 ;
1073  found=find_node(node, trace, trace_len) ;
1074  if (found)
1075  break ;
1076  }
1077  ASSERT(found) ;
1078  SG_PRINT( "position %i trace: ", tree) ;
1079 
1080  for (int32_t i=0; i<trace_len-1; i++)
1081  {
1082  int32_t branch=-1 ;
1083  for (int32_t q=0; q<4; q++)
1084  if (abs(TreeMem[trace[i]].children[q])==trace[i+1])
1085  {
1086  branch=q;
1087  break ;
1088  }
1089  ASSERT(branch!=-1) ;
1090  char acgt[5]="ACGT" ;
1091  SG_PRINT( "%c", acgt[branch]) ;
1092  }
1093  SG_PRINT( "\nnode=%i\nweight=%f\nhas_seq=%i\nhas_floats=%i\n", node, TreeMem[node].weight, TreeMem[node].has_seq, TreeMem[node].has_floats) ;
1094  if (TreeMem[node].has_floats)
1095  {
1096  for (int32_t q=0; q<4; q++)
1097  SG_PRINT( "child_weighs[%i] = %f\n", q, TreeMem[node].child_weights[q]) ;
1098  }
1099  if (TreeMem[node].has_seq)
1100  {
1101  for (int32_t q=0; q<16; q++)
1102  SG_PRINT( "seq[%i] = %i\n", q, TreeMem[node].seq[q]) ;
1103  }
1104  if (!TreeMem[node].has_seq && !TreeMem[node].has_floats)
1105  {
1106  for (int32_t q=0; q<4; q++)
1107  {
1108  if (TreeMem[node].children[q]!=NO_CHILD)
1109  {
1110  SG_PRINT( "children[%i] = %i -> \n", q, TreeMem[node].children[q]) ;
1111  display_node(abs(TreeMem[node].children[q])) ;
1112  }
1113  else
1114  SG_PRINT( "children[%i] = NO_CHILD -| \n", q, TreeMem[node].children[q]) ;
1115  }
1116 
1117  }
1118 
1119  SG_FREE(trace);
1120 #else
1121  SG_ERROR( "not implemented\n") ;
1122 #endif
1123 }
1124 
1125 
1126 template <class Trie> CTrie<Trie>::~CTrie()
1127 {
1128  destroy() ;
1129 
1130  SG_FREE(TreeMem) ;
1131 }
1132 
1133 template <class Trie> void CTrie<Trie>::destroy()
1134 {
1135  if (trees!=NULL)
1136  {
1137  delete_trees();
1138  for (int32_t i=0; i<length; i++)
1139  trees[i] = NO_CHILD;
1140  SG_FREE(trees);
1141 
1142  TreeMemPtr=0;
1143  length=0;
1144  trees=NULL;
1145  }
1146 }
1147 
1148 template <class Trie> void CTrie<Trie>::set_degree(int32_t d)
1149 {
1150  delete_trees(get_use_compact_terminal_nodes());
1151  degree=d;
1152 }
1153 
1154 template <class Trie> void CTrie<Trie>::create(
1155  int32_t len, bool p_use_compact_terminal_nodes)
1156 {
1157  destroy();
1158 
1159  trees=SG_MALLOC(int32_t, len);
1160  TreeMemPtr=0 ;
1161  for (int32_t i=0; i<len; i++)
1162  trees[i]=get_node(degree==1);
1163  length = len ;
1164 
1165  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
1166 }
1167 
1168 
1169 template <class Trie> void CTrie<Trie>::delete_trees(
1170  bool p_use_compact_terminal_nodes)
1171 {
1172  if (trees==NULL)
1173  return;
1174 
1175  TreeMemPtr=0 ;
1176  for (int32_t i=0; i<length; i++)
1177  trees[i]=get_node(degree==1);
1178 
1179  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
1180 }
1181 
1182  template <class Trie>
1184 {
1185  float64_t ret=0 ;
1186 
1187  if (tree==NO_CHILD)
1188  return 0 ;
1189  TRIE_ASSERT(tree>=0) ;
1190 
1191  if (depth==degree-2)
1192  {
1193  ret+=(TreeMem[tree].weight) ;
1194 
1195  for (int32_t k=0; k<4; k++)
1196  ret+=(TreeMem[tree].child_weights[k]) ;
1197 
1198  return ret ;
1199  }
1200 
1201  ret+=(TreeMem[tree].weight) ;
1202 
1203  for (int32_t i=0; i<4; i++)
1204  if (TreeMem[tree].children[i]!=NO_CHILD)
1205  ret += compute_abs_weights_tree(TreeMem[tree].children[i], depth+1) ;
1206 
1207  return ret ;
1208 }
1209 
1210 
1211  template <class Trie>
1213 {
1214  float64_t * sum=SG_MALLOC(float64_t, length*4);
1215  for (int32_t i=0; i<length*4; i++)
1216  sum[i]=0 ;
1217  len=length ;
1218 
1219  for (int32_t i=0; i<length; i++)
1220  {
1221  TRIE_ASSERT(trees[i]!=NO_CHILD) ;
1222  for (int32_t k=0; k<4; k++)
1223  {
1224  sum[i*4+k]=compute_abs_weights_tree(TreeMem[trees[i]].children[k], 0) ;
1225  }
1226  }
1227 
1228  return sum ;
1229 }
1230 
1231  template <class Trie>
1233  int32_t tree, int32_t i, float64_t alpha,
1234  int32_t *vec, int32_t len_rem,
1235  int32_t degree_rec, int32_t mismatch_rec,
1236  int32_t max_mismatch, float64_t * weights)
1237 {
1238  if (tree==NO_CHILD)
1239  tree=trees[i] ;
1240  TRIE_ASSERT(tree!=NO_CHILD) ;
1241 
1242  if ((len_rem<=0) || (mismatch_rec>max_mismatch) || (degree_rec>degree))
1243  return ;
1244  const int32_t other[4][3] = { {1,2,3},{0,2,3},{0,1,3},{0,1,2} } ;
1245 
1246  int32_t subtree = NO_CHILD ;
1247 
1248  if (degree_rec==degree-1)
1249  {
1250  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats) ;
1251  if (weights_in_tree)
1252  TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec];
1253  else
1254  if (weights[degree_rec]!=0.0)
1255  TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
1256  if (mismatch_rec+1<=max_mismatch)
1257  for (int32_t o=0; o<3; o++)
1258  {
1259  if (weights_in_tree)
1260  TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
1261  else
1262  if (weights[degree_rec]!=0.0)
1263  TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
1264  }
1265  return ;
1266  }
1267  else
1268  {
1269  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
1270  if (TreeMem[tree].children[vec[0]]!=NO_CHILD)
1271  {
1272  subtree=TreeMem[tree].children[vec[0]] ;
1273  if (weights_in_tree)
1274  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec];
1275  else
1276  if (weights[degree_rec]!=0.0)
1277  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
1278  }
1279  else
1280  {
1281  int32_t tmp = get_node(degree_rec==degree-2);
1282  ASSERT(tmp>=0) ;
1283  TreeMem[tree].children[vec[0]]=tmp ;
1284  subtree=tmp ;
1285 #ifdef TRIE_CHECK_EVERYTHING
1286  if (degree_rec==degree-2)
1287  TreeMem[subtree].has_floats=true ;
1288 #endif
1289  if (weights_in_tree)
1290  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec] ;
1291  else
1292  if (weights[degree_rec]!=0.0)
1293  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec] ;
1294  else
1295  TreeMem[subtree].weight = 0.0 ;
1296  }
1297  add_example_to_tree_mismatch_recursion(subtree, i, alpha,
1298  &vec[1], len_rem-1,
1299  degree_rec+1, mismatch_rec, max_mismatch, weights) ;
1300 
1301  if (mismatch_rec+1<=max_mismatch)
1302  {
1303  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
1304  for (int32_t o=0; o<3; o++)
1305  {
1306  int32_t ot = other[vec[0]][o] ;
1307  if (TreeMem[tree].children[ot]!=NO_CHILD)
1308  {
1309  subtree=TreeMem[tree].children[ot] ;
1310  if (weights_in_tree)
1311  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
1312  else
1313  if (weights[degree_rec]!=0.0)
1314  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
1315  }
1316  else
1317  {
1318  int32_t tmp = get_node(degree_rec==degree-2);
1319  ASSERT(tmp>=0) ;
1320  TreeMem[tree].children[ot]=tmp ;
1321  subtree=tmp ;
1322 #ifdef TRIE_CHECK_EVERYTHING
1323  if (degree_rec==degree-2)
1324  TreeMem[subtree].has_floats=true ;
1325 #endif
1326 
1327  if (weights_in_tree)
1328  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)] ;
1329  else
1330  if (weights[degree_rec]!=0.0)
1331  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec] ;
1332  else
1333  TreeMem[subtree].weight = 0.0 ;
1334  }
1335 
1336  add_example_to_tree_mismatch_recursion(subtree, i, alpha,
1337  &vec[1], len_rem-1,
1338  degree_rec+1, mismatch_rec+1, max_mismatch, weights) ;
1339  }
1340  }
1341  }
1342 }
1343 
1344  template <class Trie>
1346  int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
1347  int32_t max_degree, int32_t num_feat, int32_t num_sym, int32_t sym_offset,
1348  int32_t offs, float64_t* result)
1349 {
1350  if (i+j<num_feat)
1351  {
1352  float64_t decay=1.0; //no decay by default
1353  //if (j>d)
1354  // decay=pow(0.5,j); //marginalize out lower order matches
1355 
1356  if (j<degree-1)
1357  {
1358  for (int32_t k=0; k<num_sym; k++)
1359  {
1360  if (TreeMem[tree].children[k]!=NO_CHILD)
1361  {
1362  int32_t child=TreeMem[tree].children[k];
1363  //continue recursion if not yet at max_degree, else add to result
1364  if (d<max_degree-1)
1365  compute_scoring_helper(child, i, j+1, weight+decay*TreeMem[child].weight, d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
1366  else
1367  result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[child].weight;
1368 
1370  if (d==0)
1371  compute_scoring_helper(child, i, j+1, 0.0, 0, max_degree, num_feat, num_sym, sym_offset, offs, result);
1372  }
1373  }
1374  }
1375  else if (j==degree-1)
1376  {
1377  for (int32_t k=0; k<num_sym; k++)
1378  {
1379  //continue recursion if not yet at max_degree, else add to result
1380  if (d<max_degree-1 && i<num_feat-1)
1381  compute_scoring_helper(trees[i+1], i+1, 0, weight+decay*TreeMem[tree].child_weights[k], d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
1382  else
1383  result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[tree].child_weights[k];
1384  }
1385  }
1386  }
1387 }
1388 
1389  template <class Trie>
1391  int32_t tree, const int32_t p, struct TreeParseInfo info,
1392  const int32_t depth, int32_t* const x, const int32_t k)
1393 {
1394  const int32_t num_sym = info.num_sym;
1395  const int32_t y0 = info.y0;
1396  const int32_t y1 = (k==0) ? 0 : y0 - ( (depth<k) ? 0 : info.nofsKmers[k-1] * x[depth-k] );
1397  //const int32_t temp = info.substrs[depth]*num_sym - ( (depth<=k) ? 0 : info.nofsKmers[k] * x[depth-k-1] );
1398  //if( !( info.y0 == temp ) ) {
1399  // printf( "\n temp=%d y0=%d k=%d depth=%d \n", temp, info.y0, k, depth );
1400  //}
1401  //ASSERT( info.y0 == temp );
1402  int32_t sym;
1403  ASSERT( depth < degree );
1404  //ASSERT( 0 <= info.substrs[depth] && info.substrs[depth] < info.nofsKmers[k] );
1405  if (depth<degree-1)
1406  {
1407  for( sym=0; sym<num_sym; ++sym ) {
1408  const int32_t childNum = TreeMem[tree].children[ sym ];
1409  if( childNum != NO_CHILD ) {
1410  int32_t child = childNum ;
1411  x[depth] = sym;
1412  info.substrs[depth+1] = y0 + sym;
1413  info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
1414  //ASSERT( info.y0 == ( info.substrs[depth+1]*num_sym - ( (depth<k) ? 0 : info.nofsKmers[k] * x[depth-k] ) ) );
1415  count( TreeMem[child].weight, depth, info, p, x, k );
1416  traverse( child, p, info, depth+1, x, k );
1417  x[depth] = -1;
1418  }
1419  }
1420  }
1421  else if( depth == degree-1 )
1422  {
1423  for( sym=0; sym<num_sym; ++sym ) {
1424  const float64_t w = TreeMem[tree].child_weights[ sym ];
1425  if( w != 0.0 ) {
1426  x[depth] = sym;
1427  info.substrs[depth+1] = y0 + sym;
1428  info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
1429  //ASSERT( info.y0 == ( info.substrs[depth+1]*num_sym - ( (depth<k) ? 0 : info.nofsKmers[k] * x[depth-k] ) ) );
1430  count( w, depth, info, p, x, k );
1431  x[depth] = -1;
1432  }
1433  }
1434  }
1435  //info.substrs[depth+1] = -1;
1436  //info.y0 = temp;
1437 }
1438 
1439  template <class Trie>
1441  const float64_t w, const int32_t depth, const struct TreeParseInfo info,
1442  const int32_t p, int32_t* x, const int32_t k)
1443 {
1444  ASSERT( fabs(w) < 1e10 );
1445  ASSERT( x[depth] >= 0 );
1446  ASSERT( x[depth+1] < 0 );
1447  if ( depth < k ) {
1448  return;
1449  }
1450  //ASSERT( info.margFactors[ depth-k ] == pow( 0.25, depth-k ) );
1451  const int32_t nofKmers = info.nofsKmers[k];
1452  const float64_t margWeight = w * info.margFactors[ depth-k ];
1453  const int32_t m_a = depth - k + 1;
1454  const int32_t m_b = info.num_feat - p;
1455  const int32_t m = ( m_a < m_b ) ? m_a : m_b;
1456  // all proper k-substrings
1457  const int32_t offset0 = nofKmers * p;
1458  register int32_t i;
1459  register int32_t offset;
1460  offset = offset0;
1461  for( i = 0; i < m; ++i ) {
1462  const int32_t y = info.substrs[i+k+1];
1463  info.C_k[ y + offset ] += margWeight;
1464  offset += nofKmers;
1465  }
1466  if( depth > k ) {
1467  // k-prefix
1468  const int32_t offsR = info.substrs[k+1] + offset0;
1469  info.R_k[offsR] += margWeight;
1470  // k-suffix
1471  if( p+depth-k < info.num_feat ) {
1472  const int32_t offsL = info.substrs[depth+1] + nofKmers * (p+depth-k);
1473  info.L_k[offsL] += margWeight;
1474  }
1475  }
1476  // # N.x = substring represented by N
1477  // # N.d = length of N.x
1478  // # N.s = starting position of N.x
1479  // # N.w = weight for feature represented by N
1480  // if( N.d >= k )
1481  // margContrib = w / 4^(N.d-k)
1482  // for i = 1 to (N.d-k+1)
1483  // y = N.x[i:(i+k-1)] # overlapped k-mer
1484  // C_k[ N.s+i-1, y ] += margContrib
1485  // end;
1486  // if( N.d > k )
1487  // L_k[ N.s+N.d-k, N.x[N.d-k+(1:k)] ] += margContrib # j-suffix of N.x
1488  // R_k[ N.s, N.x[1:k] ] += margContrib # j-prefix of N.x
1489  // end;
1490  // end;
1491 }
1492 
1493  template <class Trie>
1495  int32_t i, int32_t seq_offset, int32_t * vec, float32_t alpha,
1496  float64_t *weights, bool degree_times_position_weights)
1497 {
1498  int32_t tree = trees[i] ;
1499  //ASSERT(seq_offset==0) ;
1500 
1501  int32_t max_depth = 0 ;
1502  float64_t* weights_column ;
1503  if (degree_times_position_weights)
1504  weights_column = &weights[(i+seq_offset)*degree] ;
1505  else
1506  weights_column = weights ;
1507 
1508  if (weights_in_tree)
1509  {
1510  for (int32_t j=0; (j<degree) && (i+j<length); j++)
1511  if (CMath::abs(weights_column[j]*alpha)>0)
1512  max_depth = j+1 ;
1513  }
1514  else
1515  // don't use the weights
1516  max_depth=degree ;
1517 
1518  for (int32_t j=0; (j<max_depth) && (i+j+seq_offset<length); j++)
1519  {
1520  TRIE_ASSERT((vec[i+j+seq_offset]>=0) && (vec[i+j+seq_offset]<4)) ;
1521  if ((j<degree-1) && (TreeMem[tree].children[vec[i+j+seq_offset]]!=NO_CHILD))
1522  {
1523  if (TreeMem[tree].children[vec[i+j+seq_offset]]<0)
1524  {
1525  // special treatment of the next nodes
1526  TRIE_ASSERT(j >= degree-16) ;
1527  // get the right element
1528  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1529  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
1530  int32_t node = - TreeMem[tree].children[vec[i+j+seq_offset]] ;
1531 
1532  TRIE_ASSERT((node>=0) && (node<=TreeMemPtrMax)) ;
1533  TRIE_ASSERT_EVERYTHING(TreeMem[node].has_seq) ;
1534  TRIE_ASSERT_EVERYTHING(!TreeMem[node].has_floats) ;
1535 
1536  // check whether the same string is stored
1537  int32_t mismatch_pos = -1 ;
1538  {
1539  int32_t k ;
1540  for (k=0; (j+k<max_depth) && (i+j+seq_offset+k<length); k++)
1541  {
1542  TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4)) ;
1543  // ###
1544  if ((TreeMem[node].seq[k]>=4) && (TreeMem[node].seq[k]!=TRIE_TERMINAL_CHARACTER))
1545  fprintf(stderr, "+++i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[k]) ;
1546  TRIE_ASSERT((TreeMem[node].seq[k]<4) || (TreeMem[node].seq[k]==TRIE_TERMINAL_CHARACTER)) ;
1547  TRIE_ASSERT(k<16) ;
1548  if (TreeMem[node].seq[k]!=vec[i+j+seq_offset+k])
1549  {
1550  mismatch_pos=k ;
1551  break ;
1552  }
1553  }
1554  }
1555 
1556  // what happens when the .seq sequence is longer than vec? should we branch???
1557 
1558  if (mismatch_pos==-1)
1559  // if so, then just increase the weight by alpha and stop
1560  TreeMem[node].weight+=alpha ;
1561  else
1562  // otherwise
1563  // 1. replace current node with new node
1564  // 2. create new nodes until mismatching positon
1565  // 2. add a branch with old string (old node) and the new string (new node)
1566  {
1567  // replace old node
1568  int32_t last_node=tree ;
1569 
1570  // create new nodes until mismatch
1571  int32_t k ;
1572  for (k=0; k<mismatch_pos; k++)
1573  {
1574  TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4)) ;
1575  TRIE_ASSERT(vec[i+j+seq_offset+k]==TreeMem[node].seq[k]) ;
1576 
1577  int32_t tmp=get_node();
1578  TreeMem[last_node].children[vec[i+j+seq_offset+k]]=tmp ;
1579  last_node=tmp ;
1580  if (weights_in_tree)
1581  TreeMem[last_node].weight = (TreeMem[node].weight+alpha)*weights_column[j+k] ;
1582  else
1583  TreeMem[last_node].weight = (TreeMem[node].weight+alpha) ;
1584  TRIE_ASSERT(j+k!=degree-1) ;
1585  }
1586  if ((TreeMem[node].seq[mismatch_pos]>=4) && (TreeMem[node].seq[mismatch_pos]!=TRIE_TERMINAL_CHARACTER))
1587  fprintf(stderr, "**i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[mismatch_pos]) ;
1588  ASSERT((TreeMem[node].seq[mismatch_pos]<4) || (TreeMem[node].seq[mismatch_pos]==TRIE_TERMINAL_CHARACTER)) ;
1589  TRIE_ASSERT(vec[i+j+seq_offset+mismatch_pos]!=TreeMem[node].seq[mismatch_pos]) ;
1590 
1591  if (j+k==degree-1)
1592  {
1593  // init child weights with zero if after dropping out
1594  // of the k<mismatch_pos loop we are one level below degree
1595  // (keep this even after get_node() change!)
1596  for (int32_t q=0; q<4; q++)
1597  TreeMem[last_node].child_weights[q]=0.0 ;
1598  if (weights_in_tree)
1599  {
1600  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1601  TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]+=TreeMem[node].weight*weights_column[degree-1] ;
1602  TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] += alpha*weights_column[degree-1] ;
1603  }
1604  else
1605  {
1606  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1607  TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]=TreeMem[node].weight ;
1608  TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] = alpha ;
1609  }
1610 
1611 #ifdef TRIE_CHECK_EVERYTHING
1612  TreeMem[last_node].has_floats=true ;
1613 #endif
1614  }
1615  else
1616  {
1617  // the branch for the existing string
1618  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1619  {
1620  TreeMem[last_node].children[TreeMem[node].seq[mismatch_pos]] = -node ;
1621 
1622  // move string by mismatch_pos positions
1623  for (int32_t q=0; q<16; q++)
1624  {
1625  if ((j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length))
1626  TreeMem[node].seq[q] = TreeMem[node].seq[q+mismatch_pos] ;
1627  else
1628  TreeMem[node].seq[q] = TRIE_TERMINAL_CHARACTER ;
1629  }
1630 #ifdef TRIE_CHECK_EVERYTHING
1631  TreeMem[node].has_seq=true ;
1632 #endif
1633  }
1634 
1635  // the new branch
1636  TRIE_ASSERT((vec[i+j+seq_offset+mismatch_pos]>=0) && (vec[i+j+seq_offset+mismatch_pos]<4)) ;
1637  int32_t tmp = get_node() ;
1638  TreeMem[last_node].children[vec[i+j+seq_offset+mismatch_pos]] = -tmp ;
1639  last_node=tmp ;
1640 
1641  TreeMem[last_node].weight = alpha ;
1642 #ifdef TRIE_CHECK_EVERYTHING
1643  TreeMem[last_node].has_seq = true ;
1644 #endif
1645  memset(TreeMem[last_node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
1646  for (int32_t q=0; (j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length); q++)
1647  TreeMem[last_node].seq[q] = vec[i+j+seq_offset+mismatch_pos+q] ;
1648  }
1649  }
1650  break ;
1651  }
1652  else
1653  {
1654  tree=TreeMem[tree].children[vec[i+j+seq_offset]] ;
1655  TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax)) ;
1656  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1657  if (weights_in_tree)
1658  TreeMem[tree].weight += alpha*weights_column[j];
1659  else
1660  TreeMem[tree].weight += alpha ;
1661  }
1662  }
1663  else if (j==degree-1)
1664  {
1665  // special treatment of the last node
1666  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1667  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats) ;
1668  if (weights_in_tree)
1669  TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha*weights_column[j] ;
1670  else
1671  TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha;
1672 
1673  break;
1674  }
1675  else
1676  {
1677  bool use_seq = use_compact_terminal_nodes && (j>degree-16) ;
1678  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1679  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
1680 
1681  int32_t tmp = get_node((j==degree-2) && (!use_seq));
1682  if (use_seq)
1683  TreeMem[tree].children[vec[i+j+seq_offset]] = -tmp ;
1684  else
1685  TreeMem[tree].children[vec[i+j+seq_offset]] = tmp ;
1686  tree=tmp ;
1687 
1688  TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax)) ;
1689 #ifdef TRIE_CHECK_EVERYTHING
1690  TreeMem[tree].has_seq = use_seq ;
1691 #endif
1692  if (use_seq)
1693  {
1694  TreeMem[tree].weight = alpha ;
1695  // important to have the terminal characters (see ###)
1696  memset(TreeMem[tree].seq, TRIE_TERMINAL_CHARACTER, 16) ;
1697  for (int32_t q=0; (j+q<degree) && (i+j+seq_offset+q<length); q++)
1698  {
1699  TRIE_ASSERT(q<16) ;
1700  TreeMem[tree].seq[q]=vec[i+j+seq_offset+q] ;
1701  }
1702  break ;
1703  }
1704  else
1705  {
1706  if (weights_in_tree)
1707  TreeMem[tree].weight = alpha*weights_column[j] ;
1708  else
1709  TreeMem[tree].weight = alpha ;
1710 #ifdef TRIE_CHECK_EVERYTHING
1711  if (j==degree-2)
1712  TreeMem[tree].has_floats = true ;
1713 #endif
1714  }
1715  }
1716  }
1717 }
1718 
1719  template <class Trie>
1721  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
1722  int32_t weight_pos, float64_t* weights,
1723  bool degree_times_position_weights)
1724 {
1725  int32_t tree = trees[tree_pos] ;
1726 
1727  if ((position_weights!=NULL) && (position_weights[weight_pos]==0))
1728  return 0.0;
1729 
1730  float64_t *weights_column=NULL ;
1731  if (degree_times_position_weights)
1732  weights_column=&weights[weight_pos*degree] ;
1733  else // weights is a vector (1 x degree)
1734  weights_column=weights ;
1735 
1736  float64_t sum=0 ;
1737  for (int32_t j=0; seq_pos+j < len; j++)
1738  {
1739  TRIE_ASSERT((vec[seq_pos+j]<4) && (vec[seq_pos+j]>=0)) ;
1740 
1741  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1742  {
1743  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
1744  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1745  {
1746  tree = - TreeMem[tree].children[vec[seq_pos+j]];
1747  TRIE_ASSERT(tree>=0) ;
1748  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
1749  float64_t this_weight=0.0 ;
1750  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1751  {
1752  TRIE_ASSERT((vec[seq_pos+j+k]<4) && (vec[seq_pos+j+k]>=0)) ;
1753  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1754  break ;
1755  this_weight += weights_column[j+k] ;
1756  }
1757  sum += TreeMem[tree].weight * this_weight ;
1758  break ;
1759  }
1760  else
1761  {
1762  tree=TreeMem[tree].children[vec[seq_pos+j]];
1763  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1764  if (weights_in_tree)
1765  sum += TreeMem[tree].weight ;
1766  else
1767  sum += TreeMem[tree].weight * weights_column[j] ;
1768  } ;
1769  }
1770  else
1771  {
1772  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1773  if (j==degree-1)
1774  {
1775  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats) ;
1776  if (weights_in_tree)
1777  sum += TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1778  else
1779  sum += TreeMem[tree].child_weights[vec[seq_pos+j]] * weights_column[j] ;
1780  }
1781  else
1782  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
1783 
1784  break;
1785  }
1786  }
1787 
1788  if (position_weights!=NULL)
1789  return sum*position_weights[weight_pos] ;
1790  else
1791  return sum ;
1792 }
1793 
1794  template <class Trie>
1796  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
1797  int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
1798  int32_t mkl_stepsize, float64_t * weights,
1799  bool degree_times_position_weights)
1800 {
1801  int32_t tree = trees[tree_pos] ;
1802  if (factor==0)
1803  return ;
1804 
1805  if (position_weights!=NULL)
1806  {
1807  factor *= position_weights[weight_pos] ;
1808  if (factor==0)
1809  return ;
1810  if (!degree_times_position_weights) // with position_weigths, weights is a vector (1 x degree)
1811  {
1812  for (int32_t j=0; seq_pos+j<len; j++)
1813  {
1814  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1815  {
1816  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1817  {
1818  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1819  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
1820  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1821  {
1822  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1823  break ;
1824  if (weights_in_tree)
1825  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1826  else
1827  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
1828  }
1829  break ;
1830  }
1831  else
1832  {
1833  tree=TreeMem[tree].children[vec[seq_pos+j]];
1834  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1835  if (weights_in_tree)
1836  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1837  else
1838  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
1839  }
1840  }
1841  else
1842  {
1843  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1844  if (j==degree-1)
1845  {
1846  if (weights_in_tree)
1847  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1848  else
1849  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
1850  }
1851  }
1852  }
1853  }
1854  else // with position_weigths, weights is a matrix (len x degree)
1855  {
1856  for (int32_t j=0; seq_pos+j<len; j++)
1857  {
1858  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1859  {
1860  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1861  {
1862  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1863  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
1864  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1865  {
1866  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1867  break ;
1868  if (weights_in_tree)
1869  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1870  else
1871  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
1872  }
1873  break ;
1874  }
1875  else
1876  {
1877  tree=TreeMem[tree].children[vec[seq_pos+j]];
1878  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1879  if (weights_in_tree)
1880  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1881  else
1882  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+weight_pos*degree] ;
1883  }
1884  }
1885  else
1886  {
1887  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1888  if (j==degree-1)
1889  {
1890  if (weights_in_tree)
1891  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1892  else
1893  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j+weight_pos*degree] ;
1894  }
1895  break ;
1896  }
1897  }
1898  }
1899  }
1900  else if (!degree_times_position_weights) // no position_weigths, weights is a vector (1 x degree)
1901  {
1902  for (int32_t j=0; seq_pos+j<len; j++)
1903  {
1904  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1905  {
1906  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1907  {
1908  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1909  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
1910  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1911  {
1912  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1913  break ;
1914  if (weights_in_tree)
1915  LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight ;
1916  else
1917  LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
1918  }
1919  break ;
1920  }
1921  else
1922  {
1923  tree=TreeMem[tree].children[vec[seq_pos+j]];
1924  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1925  if (weights_in_tree)
1926  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight ;
1927  else
1928  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
1929  }
1930  }
1931  else
1932  {
1933  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1934  if (j==degree-1)
1935  {
1936  if (weights_in_tree)
1937  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1938  else
1939  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
1940  }
1941  break ;
1942  }
1943  }
1944  }
1945  else // no position_weigths, weights is a matrix (len x degree)
1946  {
1947  /*if (!position_mask)
1948  {
1949  position_mask = SG_MALLOC(bool, len);
1950  for (int32_t i=0; i<len; i++)
1951  {
1952  position_mask[i]=false ;
1953 
1954  for (int32_t j=0; j<degree; j++)
1955  if (weights[i*degree+j]!=0.0)
1956  {
1957  position_mask[i]=true ;
1958  break ;
1959  }
1960  }
1961  }
1962  if (position_mask[weight_pos]==0)
1963  return ;*/
1964 
1965  for (int32_t j=0; seq_pos+j<len; j++)
1966  {
1967  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1968  {
1969  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1970  {
1971  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1972  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
1973  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1974  {
1975  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1976  break ;
1977  if (weights_in_tree)
1978  LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight ;
1979  else
1980  LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
1981  }
1982  break ;
1983  }
1984  else
1985  {
1986  tree=TreeMem[tree].children[vec[seq_pos+j]];
1987  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1988  if (weights_in_tree)
1989  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight ;
1990  else
1991  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight * weights[j+weight_pos*degree] ;
1992  }
1993  }
1994  else
1995  {
1996  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
1997  if (j==degree-1)
1998  {
1999  if (weights_in_tree)
2000  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] ;
2001  else
2002  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] * weights[j+weight_pos*degree] ;
2003  }
2004  break ;
2005  }
2006  }
2007  }
2008 }
2009 
2010  template <class Trie>
2012  Trie* tree, int32_t depth, uint64_t seq, float64_t value,
2013  DynArray<ConsensusEntry>* table, float64_t* weights)
2014 {
2015  float64_t w=1.0;
2016 
2017  if (weights_in_tree || depth==0)
2018  value+=tree->weight;
2019  else
2020  {
2021  w=weights[degree-1];
2022  value+=weights[depth-1]*tree->weight;
2023  }
2024 
2025  if (degree-1==depth)
2026  {
2027  for (int32_t sym=0; sym<4; sym++)
2028  {
2029  float64_t v=w*tree->child_weights[sym];
2030  if (v!=0.0)
2031  {
2032  ConsensusEntry entry;
2033  entry.bt=-1;
2034  entry.score=value+v;
2035  entry.string=seq | ((uint64_t) sym) << (2*(degree-depth-1));
2036 
2037  table->append_element(entry);
2038  }
2039  }
2040  }
2041  else
2042  {
2043  for (int32_t sym=0; sym<4; sym++)
2044  {
2045  uint64_t str=seq | ((uint64_t) sym) << (2*(degree-depth-1));
2046  if (tree->children[sym] != NO_CHILD)
2047  fill_backtracking_table_recursion(&TreeMem[tree->children[sym]], depth+1, str, value, table, weights);
2048  }
2049  }
2050 }
2051 
2052  template <class Trie>
2054  int32_t pos, uint64_t seq, int32_t deg, float64_t* weights)
2055 {
2056  float64_t result=0.0;
2057 
2058  //SG_PRINT("pos:%i length:%i deg:%i seq:0x%0llx...\n", pos, length, deg, seq);
2059 
2060  for (int32_t i=pos; i<pos+deg && i<length; i++)
2061  {
2062  //SG_PRINT("loop %d\n", i);
2063  Trie* tree = &TreeMem[trees[i]];
2064 
2065  for (int32_t d=0; d<deg-i+pos; d++)
2066  {
2067  //SG_PRINT("loop degree %d shit: %d\n", d, (2*(deg-1-d-i+pos)));
2068  ASSERT(d-1<degree);
2069  int32_t sym = (int32_t) (seq >> (2*(deg-1-d-i+pos)) & 3);
2070 
2071  float64_t w=1.0;
2072  if (!weights_in_tree)
2073  w=weights[d];
2074 
2075  ASSERT(tree->children[sym] != NO_CHILD);
2076  tree=&TreeMem[tree->children[sym]];
2077  result+=w*tree->weight;
2078  }
2079  }
2080  //SG_PRINT("cum: %f\n", result);
2081  return result;
2082 }
2083 
2084  template <class Trie>
2086  int32_t pos, DynArray<ConsensusEntry>* prev,
2087  DynArray<ConsensusEntry>* cur, bool cumulative, float64_t* weights)
2088 {
2089  ASSERT(pos>=0 && pos<length);
2090  ASSERT(!use_compact_terminal_nodes);
2091 
2092  Trie* t = &TreeMem[trees[pos]];
2093 
2094  fill_backtracking_table_recursion(t, 0, (uint64_t) 0, 0.0, cur, weights);
2095 
2096 
2097  if (cumulative)
2098  {
2099  int32_t num_cur=cur->get_num_elements();
2100  for (int32_t i=0; i<num_cur; i++)
2101  {
2102  ConsensusEntry entry=cur->get_element(i);
2103  entry.score+=get_cumulative_score(pos+1, entry.string, degree-1, weights);
2104  cur->set_element(entry,i);
2105  //SG_PRINT("cum: str:0%0llx sc:%f bt:%d\n",entry.string,entry.score,entry.bt);
2106  }
2107  }
2108 
2109  //if previous tree exists find maximum scoring path
2110  //for each element in cur and update bt table
2111  if (prev)
2112  {
2113  int32_t num_cur=cur->get_num_elements();
2114  int32_t num_prev=prev->get_num_elements();
2115 
2116  for (int32_t i=0; i<num_cur; i++)
2117  {
2118  //uint64_t str_cur_old= cur->get_element(i).string;
2119  uint64_t str_cur= cur->get_element(i).string >> 2;
2120  //SG_PRINT("...cur:0x%0llx cur_noprfx:0x%0llx...\n", str_cur_old, str_cur);
2121 
2122  int32_t bt=-1;
2123  float64_t max_score=0.0;
2124 
2125  for (int32_t j=0; j<num_prev; j++)
2126  {
2127  //uint64_t str_prev_old= prev->get_element(j).string;
2128  uint64_t mask=
2129  ((((uint64_t)0)-1) ^ (((uint64_t) 3) << (2*(degree-1))));
2130  uint64_t str_prev= mask & prev->get_element(j).string;
2131  //SG_PRINT("...prev:0x%0llx prev_nosfx:0x%0llx mask:%0llx...\n", str_prev_old, str_prev,mask);
2132 
2133  if (str_cur == str_prev)
2134  {
2135  float64_t sc=prev->get_element(j).score+cur->get_element(i).score;
2136  if (bt==-1 || sc>max_score)
2137  {
2138  bt=j;
2139  max_score=sc;
2140 
2141  //SG_PRINT("new_max[%i,%i] = %f\n", j,i, max_score);
2142  }
2143  }
2144  }
2145 
2146  ASSERT(bt!=-1);
2147  ConsensusEntry entry;
2148  entry.bt=bt;
2149  entry.score=max_score;
2150  entry.string=cur->get_element(i).string;
2151  cur->set_element(entry, i);
2152  //SG_PRINT("entry[%d]: str:0%0llx sc:%f bt:%d\n",i, entry.string,entry.score,entry.bt);
2153  }
2154  }
2155 }
2156 }
2157 #endif // _TRIE_H___

SHOGUN Machine Learning Toolbox - Documentation