SHOGUN  4.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Trie.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2009 Gunnar Raetsch
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #ifndef _TRIE_H___
13 #define _TRIE_H___
14 
15 #include <shogun/lib/config.h>
16 
17 #include <shogun/lib/common.h>
18 #include <shogun/io/SGIO.h>
19 #include <shogun/base/DynArray.h>
21 #include <shogun/base/SGObject.h>
22 
23 namespace shogun
24 {
25 #ifndef DOXYGEN_SHOULD_SKIP_THIS
26 
27 // sentinel is 0xFFFFFFFC or float -2
28 #define NO_CHILD ((int32_t)-1073741824)
29 
30 #define WEIGHTS_IN_TRIE
31 //#define TRIE_CHECK_EVERYTHING
32 
33 #ifdef TRIE_CHECK_EVERYTHING
34 #define TRIE_ASSERT_EVERYTHING(x) ASSERT(x)
35 #else
36 #define TRIE_ASSERT_EVERYTHING(x)
37 #endif
38 
39 //#define TRIE_ASSERT(x) ASSERT(x)
40 #define TRIE_ASSERT(x)
41 
42 #define TRIE_TERMINAL_CHARACTER 7
43 
45 struct ConsensusEntry
46 {
48  uint64_t string;
50  float32_t score;
52  int32_t bt;
53 };
54 
56 struct POIMTrie
57 {
59  float64_t weight;
60 #ifdef TRIE_CHECK_EVERYTHING
61 
62  bool has_seq;
64  bool has_floats;
65 #endif
66  union
67  {
69  float32_t child_weights[4];
71  int32_t children[4];
73  uint8_t seq[16] ;
74  };
75 
77  float64_t S;
79  float64_t L;
81  float64_t R;
82 };
83 
85 struct DNATrie
86 {
88  float64_t weight;
89 #ifdef TRIE_CHECK_EVERYTHING
90 
91  bool has_seq;
93  bool has_floats;
94 #endif
95  union
96  {
98  float32_t child_weights[4];
100  int32_t children[4];
102  uint8_t seq[16] ;
103  };
104 };
105 
107 struct TreeParseInfo {
109  int32_t num_sym;
111  int32_t num_feat;
113  int32_t p;
115  int32_t k;
117  int32_t* nofsKmers;
119  float64_t* margFactors;
121  int32_t* x;
123  int32_t* substrs;
125  int32_t y0;
127  float64_t* C_k;
129  float64_t* L_k;
131  float64_t* R_k;
132 };
133 
134 #endif // DOXYGEN_SHOULD_SKIP_THIS
135 
136 template <class Trie> class CTrie;
137 
138 #define IGNORE_IN_CLASSLIST
139 
157 IGNORE_IN_CLASSLIST template <class Trie> class CTrie : public CSGObject
158 {
159  public:
161  CTrie();
162 
169  CTrie(int32_t d, bool p_use_compact_terminal_nodes=true);
170 
172  CTrie(const CTrie & to_copy);
173  virtual ~CTrie();
174 
176  const CTrie & operator=(const CTrie & to_copy);
177 
185  bool compare_traverse(
186  int32_t node, const CTrie & other, int32_t other_node);
187 
193  bool compare(const CTrie & other);
194 
201  bool find_node(int32_t node, int32_t * trace, int32_t &trace_len) const;
202 
209  int32_t find_deepest_node(
210  int32_t start_node, int32_t &deepest_node) const;
211 
216  void display_node(int32_t node) const;
217 
219  void destroy();
220 
225  void set_degree(int32_t d);
226 
233  void create(int32_t len, bool p_use_compact_terminal_nodes=true);
234 
240  void delete_trees(bool p_use_compact_terminal_nodes=true);
241 
252  void add_to_trie(
253  int32_t i, int32_t seq_offset, int32_t* vec, float32_t alpha,
254  float64_t *weights, bool degree_times_position_weights);
255 
262  float64_t compute_abs_weights_tree(int32_t tree, int32_t depth);
263 
269  float64_t* compute_abs_weights(int32_t &len);
270 
284  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
285  int32_t weight_pos, float64_t * weights,
286  bool degree_times_position_weights) ;
287 
303  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
304  int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
305  int32_t mkl_stepsize, float64_t * weights,
306  bool degree_times_position_weights);
307 
323  int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
324  int32_t max_degree, int32_t num_feat, int32_t num_sym,
325  int32_t sym_offset, int32_t offs, float64_t* result);
326 
340  int32_t tree, int32_t i, float64_t alpha, int32_t *vec,
341  int32_t len_rem, int32_t degree_rec, int32_t mismatch_rec,
342  int32_t max_mismatch, float64_t * weights);
343 
353  void traverse(
354  int32_t tree, const int32_t p, struct TreeParseInfo info,
355  const int32_t depth, int32_t* const x, const int32_t k);
356 
366  void count(
367  const float64_t w, const int32_t depth,
368  const struct TreeParseInfo info, const int32_t p, int32_t* x,
369  const int32_t k);
370 
377  int32_t compact_nodes(int32_t start_node, int32_t depth, float64_t * weights);
378 
388  int32_t pos, uint64_t seq, int32_t deg, float64_t* weights);
389 
400  Trie* tree, int32_t depth, uint64_t seq, float64_t value,
401  DynArray<ConsensusEntry>* table, float64_t* weights);
402 
412  int32_t pos, DynArray<ConsensusEntry>* prev,
413  DynArray<ConsensusEntry>* cur, bool cumulative,
414  float64_t* weights);
415 
421  void POIMs_extract_W(float64_t* const* const W, const int32_t K);
422 
427  void POIMs_precalc_SLR(const float64_t* const distrib);
428 
438  void POIMs_get_SLR(
439  const int32_t parentIdx, const int32_t sym, const int32_t depth,
440  float64_t* S, float64_t* L, float64_t* R);
441 
448  void POIMs_add_SLR(
449  float64_t* const* const poims, const int32_t K,
450  const int32_t debug);
451 
457  {
459  }
460 
467  bool p_use_compact_terminal_nodes)
468  {
469  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
470  }
471 
476  inline int32_t get_num_used_nodes()
477  {
478  return TreeMemPtr;
479  }
480 
485  inline void set_position_weights(float64_t* p_position_weights)
486  {
487  position_weights=p_position_weights;
488  }
489 
494  inline int32_t get_node(bool last_node=false)
495  {
496  int32_t ret = TreeMemPtr++;
497  check_treemem() ;
498 
499  if (last_node)
500  {
501  for (int32_t q=0; q<4; q++)
502  TreeMem[ret].child_weights[q]=0.0;
503  }
504  else
505  {
506  for (int32_t q=0; q<4; q++)
507  TreeMem[ret].children[q]=NO_CHILD;
508  }
509 #ifdef TRIE_CHECK_EVERYTHING
510  TreeMem[ret].has_seq=false ;
511  TreeMem[ret].has_floats=false ;
512 #endif
513  TreeMem[ret].weight=0.0;
514  return ret ;
515  }
516 
518  inline void check_treemem()
519  {
520  if (TreeMemPtr+10 < TreeMemPtrMax)
521  return;
522  SG_DEBUG("Extending TreeMem from %i to %i elements\n",
523  TreeMemPtrMax, (int32_t) ((float64_t)TreeMemPtrMax*1.2));
524  int32_t old_sz=TreeMemPtrMax;
525  TreeMemPtrMax = (int32_t) ((float64_t)TreeMemPtrMax*1.2);
526  TreeMem = SG_REALLOC(Trie, TreeMem, old_sz, TreeMemPtrMax);
527  }
528 
533  inline void set_weights_in_tree(bool weights_in_tree_)
534  {
535  weights_in_tree = weights_in_tree_;
536  }
537 
542  inline bool get_weights_in_tree()
543  {
544  return weights_in_tree;
545  }
546 
557  const int32_t nodeIdx, const int32_t depth, const int32_t offset,
558  const int32_t y0, float64_t* const* const W, const int32_t K);
559 
573  const float64_t* const distrib, const int32_t i,
574  const int32_t nodeIdx, int32_t left_tries_idx[4],
575  const int32_t depth, int32_t const lastSym, float64_t* S,
576  float64_t* L, float64_t* R);
577 
578 
590  const float64_t* const distrib, const int32_t i,
591  const int32_t nodeIdx, int32_t left_tries_idx[4],
592  const int32_t depth, float64_t* S, float64_t* L, float64_t* R);
593 
605  const int32_t nodeIdx, const int32_t depth,const int32_t i,
606  const int32_t y0, float64_t* const* const poims, const int32_t K,
607  const int32_t debug);
608 
623  float64_t* const* const poims, const int32_t K, const int32_t k,
624  const int32_t i, const int32_t y, const float64_t valW,
625  const float64_t valS, const float64_t valL, const float64_t valR,
626  const int32_t debug);
627 
629  virtual const char* get_name() const { return "Trie"; }
630 
631  public:
633  int32_t NUM_SYMS;
634 
635  protected:
637  int32_t length;
639  int32_t * trees;
640 
642  int32_t degree;
645 
647  Trie* TreeMem;
649  int32_t TreeMemPtr;
651  int32_t TreeMemPtrMax;
654 
657 
659  int32_t* nofsKmers;
660 };
661  template <class Trie>
663  : CSGObject(), degree(0), position_weights(NULL),
664  use_compact_terminal_nodes(false),
665  weights_in_tree(true)
666  {
667 
668  TreeMemPtrMax=0;
669  TreeMemPtr=0;
670  TreeMem=NULL;
671 
672  length=0;
673  trees=NULL;
674 
675  NUM_SYMS=4;
676  }
677 
678  template <class Trie>
679  CTrie<Trie>::CTrie(int32_t d, bool p_use_compact_terminal_nodes)
680  : CSGObject(), degree(d), position_weights(NULL),
681  use_compact_terminal_nodes(p_use_compact_terminal_nodes),
682  weights_in_tree(true)
683  {
684  TreeMemPtrMax=1024*1024/sizeof(Trie);
685  TreeMemPtr=0;
686  TreeMem=SG_MALLOC(Trie, TreeMemPtrMax);
687 
688  length=0;
689  trees=NULL;
690 
691  NUM_SYMS=4;
692  }
693 
694  template <class Trie>
695  CTrie<Trie>::CTrie(const CTrie & to_copy)
696  : CSGObject(to_copy), degree(to_copy.degree), position_weights(NULL),
697  use_compact_terminal_nodes(to_copy.use_compact_terminal_nodes)
698  {
699  if (to_copy.position_weights!=NULL)
700  {
702  /*SG_MALLOC(float64_t, to_copy.length);
703  for (int32_t i=0; i<to_copy.length; i++)
704  position_weights[i]=to_copy.position_weights[i]; */
705  }
706  else
707  position_weights=NULL;
708 
710  TreeMemPtr=to_copy.TreeMemPtr;
711  TreeMem=SG_MALLOC(Trie, TreeMemPtrMax);
712  memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie));
713 
714  length=to_copy.length;
715  trees=SG_MALLOC(int32_t, length);
716  for (int32_t i=0; i<length; i++)
717  trees[i]=to_copy.trees[i];
718 
719  NUM_SYMS=4;
720  }
721 
722  template <class Trie>
724 {
725  degree=to_copy.degree ;
726  use_compact_terminal_nodes=to_copy.use_compact_terminal_nodes ;
727 
728  SG_FREE(position_weights);
729  position_weights=NULL ;
730  if (to_copy.position_weights!=NULL)
731  {
732  position_weights=to_copy.position_weights ;
733  /*position_weights = SG_MALLOC(float64_t, to_copy.length);
734  for (int32_t i=0; i<to_copy.length; i++)
735  position_weights[i]=to_copy.position_weights[i] ;*/
736  }
737  else
738  position_weights=NULL ;
739 
740  TreeMemPtrMax=to_copy.TreeMemPtrMax ;
741  TreeMemPtr=to_copy.TreeMemPtr ;
742  SG_FREE(TreeMem) ;
743  TreeMem = SG_MALLOC(Trie, TreeMemPtrMax);
744  memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie)) ;
745 
746  length = to_copy.length ;
747  if (trees)
748  SG_FREE(trees);
749  trees=SG_MALLOC(int32_t, length);
750  for (int32_t i=0; i<length; i++)
751  trees[i]=to_copy.trees[i] ;
752 
753  return *this ;
754 }
755 
756 template <class Trie>
758  int32_t start_node, int32_t& deepest_node) const
759 {
760 #ifdef TRIE_CHECK_EVERYTHING
761  int32_t ret=0 ;
762  SG_DEBUG("start_node=%i\n", start_node)
763 
764  if (start_node==NO_CHILD)
765  {
766  for (int32_t i=0; i<length; i++)
767  {
768  int32_t my_deepest_node ;
769  int32_t depth=find_deepest_node(i, my_deepest_node) ;
770  SG_DEBUG("start_node %i depth=%i\n", i, depth)
771  if (depth>ret)
772  {
773  deepest_node=my_deepest_node ;
774  ret=depth ;
775  }
776  }
777  return ret ;
778  }
779 
780  if (TreeMem[start_node].has_seq)
781  {
782  for (int32_t q=0; q<16; q++)
783  if (TreeMem[start_node].seq[q]!=TRIE_TERMINAL_CHARACTER)
784  ret++ ;
785  deepest_node=start_node ;
786  return ret ;
787  }
788  if (TreeMem[start_node].has_floats)
789  {
790  deepest_node=start_node ;
791  return 1 ;
792  }
793 
794  for (int32_t q=0; q<4; q++)
795  {
796  int32_t my_deepest_node ;
797  if (TreeMem[start_node].children[q]==NO_CHILD)
798  continue ;
799  int32_t depth=find_deepest_node(abs(TreeMem[start_node].children[q]), my_deepest_node) ;
800  if (depth>ret)
801  {
802  deepest_node=my_deepest_node ;
803  ret=depth ;
804  }
805  }
806  return ret ;
807 #else
808  SG_ERROR("not implemented\n")
809  return 0 ;
810 #endif
811 }
812 
813  template <class Trie>
815  int32_t start_node, int32_t depth, float64_t * weights)
816 {
817  SG_ERROR("code buggy\n")
818 
819  int32_t ret=0 ;
820 
821  if (start_node==NO_CHILD)
822  {
823  for (int32_t i=0; i<length; i++)
824  compact_nodes(i,1, weights) ;
825  return 0 ;
826  }
827  if (start_node<0)
828  return -1 ;
829 
830  if (depth==degree-1)
831  {
832  TRIE_ASSERT_EVERYTHING(TreeMem[start_node].has_floats)
833  int32_t num_used=0 ;
834  for (int32_t q=0; q<4; q++)
835  if (TreeMem[start_node].child_weights[q]!=0.0)
836  num_used++ ;
837  if (num_used>1)
838  return -1 ;
839  return 1 ;
840  }
841  TRIE_ASSERT_EVERYTHING(!TreeMem[start_node].has_floats)
842 
843  int32_t num_used = 0 ;
844  int32_t q_used=-1 ;
845 
846  for (int32_t q=0; q<4; q++)
847  {
848  if (TreeMem[start_node].children[q]==NO_CHILD)
849  continue ;
850  num_used++ ;
851  q_used=q ;
852  }
853  if (num_used>1)
854  {
855  if (depth>=degree-2)
856  return -1 ;
857  for (int32_t q=0; q<4; q++)
858  {
859  if (TreeMem[start_node].children[q]==NO_CHILD)
860  continue ;
861  int32_t num=compact_nodes(abs(TreeMem[start_node].children[q]), depth+1, weights) ;
862  if (num<=2)
863  continue ;
864  int32_t node=get_node() ;
865 
866  int32_t last_node=TreeMem[start_node].children[q] ;
867  if (weights_in_tree)
868  {
869  ASSERT(weights[depth]!=0.0)
870  TreeMem[node].weight=TreeMem[last_node].weight/weights[depth] ;
871  }
872  else
873  TreeMem[node].weight=TreeMem[last_node].weight ;
874 
875 #ifdef TRIE_CHECK_EVERYTHING
876  TreeMem[node].has_seq=true ;
877 #endif
878  memset(TreeMem[node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
879  for (int32_t n=0; n<num; n++)
880  {
881  ASSERT(depth+n+1<=degree-1)
882  ASSERT(last_node!=NO_CHILD)
883  if (depth+n+1==degree-1)
884  {
885  TRIE_ASSERT_EVERYTHING(TreeMem[last_node].has_floats)
886  int32_t k ;
887  for (k=0; k<4; k++)
888  if (TreeMem[last_node].child_weights[k]!=0.0)
889  break ;
890  if (k==4)
891  break ;
892  TreeMem[node].seq[n]=k ;
893  break ;
894  }
895  else
896  {
897  TRIE_ASSERT_EVERYTHING(!TreeMem[last_node].has_floats)
898  int32_t k ;
899  for (k=0; k<4; k++)
900  if (TreeMem[last_node].children[k]!=NO_CHILD)
901  break ;
902  if (k==4)
903  break ;
904  TreeMem[node].seq[n]=k ;
905  last_node=TreeMem[last_node].children[k] ;
906  }
907  }
908  TreeMem[start_node].children[q]=-node ;
909  }
910  return -1 ;
911  }
912  if (num_used==0)
913  return 0 ;
914 
915  ret=compact_nodes(abs(TreeMem[start_node].children[q_used]), depth+1, weights) ;
916  if (ret<0)
917  return ret ;
918  return ret+1 ;
919 }
920 
921 
922  template <class Trie>
924  int32_t node, const CTrie<Trie> & other, int32_t other_node)
925 {
926  SG_DEBUG("checking nodes %i and %i\n", node, other_node)
927  if (fabs(TreeMem[node].weight-other.TreeMem[other_node].weight)>=1e-5)
928  {
929  SG_DEBUG("CTrie::compare: TreeMem[%i].weight=%f!=other.TreeMem[%i].weight=%f\n", node, TreeMem[node].weight, other_node,other.TreeMem[other_node].weight)
930  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
931  display_node(node) ;
932  SG_DEBUG("============================================================\n")
933  other.display_node(other_node) ;
934  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
935  return false ;
936  }
937 
938 #ifdef TRIE_CHECK_EVERYTHING
939  if (TreeMem[node].has_seq!=other.TreeMem[other_node].has_seq)
940  {
941  SG_DEBUG("CTrie::compare: TreeMem[%i].has_seq=%i!=other.TreeMem[%i].has_seq=%i\n", node, TreeMem[node].has_seq, other_node,other.TreeMem[other_node].has_seq)
942  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
943  display_node(node) ;
944  SG_DEBUG("============================================================\n")
945  other.display_node(other_node) ;
946  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
947  return false ;
948  }
949  if (TreeMem[node].has_floats!=other.TreeMem[other_node].has_floats)
950  {
951  SG_DEBUG("CTrie::compare: TreeMem[%i].has_floats=%i!=other.TreeMem[%i].has_floats=%i\n", node, TreeMem[node].has_floats, other_node, other.TreeMem[other_node].has_floats)
952  return false ;
953  }
954  if (other.TreeMem[other_node].has_floats)
955  {
956  for (int32_t q=0; q<4; q++)
957  if (fabs(TreeMem[node].child_weights[q]-other.TreeMem[other_node].child_weights[q])>1e-5)
958  {
959  SG_DEBUG("CTrie::compare: TreeMem[%i].child_weights[%i]=%e!=other.TreeMem[%i].child_weights[%i]=%e\n", node, q,TreeMem[node].child_weights[q], other_node,q,other.TreeMem[other_node].child_weights[q])
960  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
961  display_node(node) ;
962  SG_DEBUG("============================================================\n")
963  other.display_node(other_node) ;
964  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
965  return false ;
966  }
967  }
968  if (other.TreeMem[other_node].has_seq)
969  {
970  for (int32_t q=0; q<16; q++)
971  if ((TreeMem[node].seq[q]!=other.TreeMem[other_node].seq[q]) && ((TreeMem[node].seq[q]<4)||(other.TreeMem[other_node].seq[q]<4)))
972  {
973  SG_DEBUG("CTrie::compare: TreeMem[%i].seq[%i]=%i!=other.TreeMem[%i].seq[%i]=%i\n", node,q,TreeMem[node].seq[q], other_node,q,other.TreeMem[other_node].seq[q])
974  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
975  display_node(node) ;
976  SG_DEBUG("============================================================\n")
977  other.display_node(other_node) ;
978  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
979  return false ;
980  }
981  }
982  if (!other.TreeMem[other_node].has_seq && !other.TreeMem[other_node].has_floats)
983  {
984  for (int32_t q=0; q<4; q++)
985  {
986  if ((TreeMem[node].children[q]==NO_CHILD) && (other.TreeMem[other_node].children[q]==NO_CHILD))
987  continue ;
988  if ((TreeMem[node].children[q]==NO_CHILD)!=(other.TreeMem[other_node].children[q]==NO_CHILD))
989  {
990  SG_DEBUG("CTrie::compare: TreeMem[%i].children[%i]=%i!=other.TreeMem[%i].children[%i]=%i\n", node,q,TreeMem[node].children[q], other_node,q,other.TreeMem[other_node].children[q])
991  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
992  display_node(node) ;
993  SG_DEBUG("============================================================\n")
994  other.display_node(other_node) ;
995  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
996  return false ;
997  }
998  if (!compare_traverse(abs(TreeMem[node].children[q]), other, abs(other.TreeMem[other_node].children[q])))
999  return false ;
1000  }
1001  }
1002 #else
1003  SG_ERROR("not implemented\n")
1004 #endif
1005 
1006  return true ;
1007 }
1008 
1009  template <class Trie>
1011 {
1012  bool ret=true ;
1013  for (int32_t i=0; i<length; i++)
1014  if (!compare_traverse(trees[i], other, other.trees[i]))
1015  return false ;
1016  else
1017  SG_DEBUG("two tries at %i identical\n", i)
1018 
1019  return ret ;
1020 }
1021 
1022 template <class Trie>
1024  int32_t node, int32_t * trace, int32_t& trace_len) const
1025 {
1026 #ifdef TRIE_CHECK_EVERYTHING
1027  ASSERT(trace_len-1>=0)
1028  ASSERT((trace[trace_len-1]>=0) && (trace[trace_len-1]<TreeMemPtrMax))
1029  if (TreeMem[trace[trace_len-1]].has_seq)
1030  return false ;
1031  if (TreeMem[trace[trace_len-1]].has_floats)
1032  return false ;
1033 
1034  for (int32_t q=0; q<4; q++)
1035  {
1036  if (TreeMem[trace[trace_len-1]].children[q]==NO_CHILD)
1037  continue ;
1038  int32_t tl=trace_len+1 ;
1039  if (TreeMem[trace[trace_len-1]].children[q]>=0)
1040  trace[trace_len]=TreeMem[trace[trace_len-1]].children[q] ;
1041  else
1042  trace[trace_len]=-TreeMem[trace[trace_len-1]].children[q] ;
1043 
1044  if (trace[trace_len]==node)
1045  {
1046  trace_len=tl ;
1047  return true ;
1048  }
1049  if (find_node(node, trace, tl))
1050  {
1051  trace_len=tl ;
1052  return true ;
1053  }
1054  }
1055  trace_len=0 ;
1056  return false ;
1057 #else
1058  SG_ERROR("not implemented\n")
1059  return false ;
1060 #endif
1061 }
1062 
1063 template <class Trie>
1064 void CTrie<Trie>::display_node(int32_t node) const
1065 {
1066 #ifdef TRIE_CHECK_EVERYTHING
1067  int32_t * trace=SG_MALLOC(int32_t, 2*degree);
1068  int32_t trace_len=-1 ;
1069  bool found = false ;
1070  int32_t tree=-1 ;
1071  for (tree=0; tree<length; tree++)
1072  {
1073  trace[0]=trees[tree] ;
1074  trace_len=1 ;
1075  found=find_node(node, trace, trace_len) ;
1076  if (found)
1077  break ;
1078  }
1079  ASSERT(found)
1080  SG_PRINT("position %i trace: ", tree)
1081 
1082  for (int32_t i=0; i<trace_len-1; i++)
1083  {
1084  int32_t branch=-1 ;
1085  for (int32_t q=0; q<4; q++)
1086  if (abs(TreeMem[trace[i]].children[q])==trace[i+1])
1087  {
1088  branch=q;
1089  break ;
1090  }
1091  ASSERT(branch!=-1)
1092  char acgt[5]="ACGT" ;
1093  SG_PRINT("%c", acgt[branch])
1094  }
1095  SG_PRINT("\nnode=%i\nweight=%f\nhas_seq=%i\nhas_floats=%i\n", node, TreeMem[node].weight, TreeMem[node].has_seq, TreeMem[node].has_floats)
1096  if (TreeMem[node].has_floats)
1097  {
1098  for (int32_t q=0; q<4; q++)
1099  SG_PRINT("child_weighs[%i] = %f\n", q, TreeMem[node].child_weights[q])
1100  }
1101  if (TreeMem[node].has_seq)
1102  {
1103  for (int32_t q=0; q<16; q++)
1104  SG_PRINT("seq[%i] = %i\n", q, TreeMem[node].seq[q])
1105  }
1106  if (!TreeMem[node].has_seq && !TreeMem[node].has_floats)
1107  {
1108  for (int32_t q=0; q<4; q++)
1109  {
1110  if (TreeMem[node].children[q]!=NO_CHILD)
1111  {
1112  SG_PRINT("children[%i] = %i -> \n", q, TreeMem[node].children[q])
1113  display_node(abs(TreeMem[node].children[q])) ;
1114  }
1115  else
1116  SG_PRINT("children[%i] = NO_CHILD -| \n", q, TreeMem[node].children[q])
1117  }
1118 
1119  }
1120 
1121  SG_FREE(trace);
1122 #else
1123  SG_ERROR("not implemented\n")
1124 #endif
1125 }
1126 
1127 
1128 template <class Trie> CTrie<Trie>::~CTrie()
1129 {
1130  destroy() ;
1131 
1132  SG_FREE(TreeMem) ;
1133 }
1134 
1135 template <class Trie> void CTrie<Trie>::destroy()
1136 {
1137  if (trees!=NULL)
1138  {
1139  delete_trees();
1140  for (int32_t i=0; i<length; i++)
1141  trees[i] = NO_CHILD;
1142  SG_FREE(trees);
1143 
1144  TreeMemPtr=0;
1145  length=0;
1146  trees=NULL;
1147  }
1148 }
1149 
1150 template <class Trie> void CTrie<Trie>::set_degree(int32_t d)
1151 {
1152  delete_trees(get_use_compact_terminal_nodes());
1153  degree=d;
1154 }
1155 
1156 template <class Trie> void CTrie<Trie>::create(
1157  int32_t len, bool p_use_compact_terminal_nodes)
1158 {
1159  destroy();
1160 
1161  trees=SG_MALLOC(int32_t, len);
1162  TreeMemPtr=0 ;
1163  for (int32_t i=0; i<len; i++)
1164  trees[i]=get_node(degree==1);
1165  length = len ;
1166 
1167  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
1168 }
1169 
1170 
1171 template <class Trie> void CTrie<Trie>::delete_trees(
1172  bool p_use_compact_terminal_nodes)
1173 {
1174  if (trees==NULL)
1175  return;
1176 
1177  TreeMemPtr=0 ;
1178  for (int32_t i=0; i<length; i++)
1179  trees[i]=get_node(degree==1);
1180 
1181  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
1182 }
1183 
1184  template <class Trie>
1186 {
1187  float64_t ret=0 ;
1188 
1189  if (tree==NO_CHILD)
1190  return 0 ;
1191  TRIE_ASSERT(tree>=0)
1192 
1193  if (depth==degree-2)
1194  {
1195  ret+=(TreeMem[tree].weight) ;
1196 
1197  for (int32_t k=0; k<4; k++)
1198  ret+=(TreeMem[tree].child_weights[k]) ;
1199 
1200  return ret ;
1201  }
1202 
1203  ret+=(TreeMem[tree].weight) ;
1204 
1205  for (int32_t i=0; i<4; i++)
1206  if (TreeMem[tree].children[i]!=NO_CHILD)
1207  ret += compute_abs_weights_tree(TreeMem[tree].children[i], depth+1) ;
1208 
1209  return ret ;
1210 }
1211 
1212 
1213  template <class Trie>
1215 {
1216  float64_t * sum=SG_MALLOC(float64_t, length*4);
1217  for (int32_t i=0; i<length*4; i++)
1218  sum[i]=0 ;
1219  len=length ;
1220 
1221  for (int32_t i=0; i<length; i++)
1222  {
1223  TRIE_ASSERT(trees[i]!=NO_CHILD)
1224  for (int32_t k=0; k<4; k++)
1225  {
1226  sum[i*4+k]=compute_abs_weights_tree(TreeMem[trees[i]].children[k], 0) ;
1227  }
1228  }
1229 
1230  return sum ;
1231 }
1232 
1233  template <class Trie>
1235  int32_t tree, int32_t i, float64_t alpha,
1236  int32_t *vec, int32_t len_rem,
1237  int32_t degree_rec, int32_t mismatch_rec,
1238  int32_t max_mismatch, float64_t * weights)
1239 {
1240  if (tree==NO_CHILD)
1241  tree=trees[i] ;
1242  TRIE_ASSERT(tree!=NO_CHILD)
1243 
1244  if ((len_rem<=0) || (mismatch_rec>max_mismatch) || (degree_rec>degree))
1245  return ;
1246  const int32_t other[4][3] = { {1,2,3},{0,2,3},{0,1,3},{0,1,2} } ;
1247 
1248  int32_t subtree = NO_CHILD ;
1249 
1250  if (degree_rec==degree-1)
1251  {
1252  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats)
1253  if (weights_in_tree)
1254  TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec];
1255  else
1256  if (weights[degree_rec]!=0.0)
1257  TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
1258  if (mismatch_rec+1<=max_mismatch)
1259  for (int32_t o=0; o<3; o++)
1260  {
1261  if (weights_in_tree)
1262  TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
1263  else
1264  if (weights[degree_rec]!=0.0)
1265  TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
1266  }
1267  return ;
1268  }
1269  else
1270  {
1271  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1272  if (TreeMem[tree].children[vec[0]]!=NO_CHILD)
1273  {
1274  subtree=TreeMem[tree].children[vec[0]] ;
1275  if (weights_in_tree)
1276  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec];
1277  else
1278  if (weights[degree_rec]!=0.0)
1279  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
1280  }
1281  else
1282  {
1283  int32_t tmp = get_node(degree_rec==degree-2);
1284  ASSERT(tmp>=0)
1285  TreeMem[tree].children[vec[0]]=tmp ;
1286  subtree=tmp ;
1287 #ifdef TRIE_CHECK_EVERYTHING
1288  if (degree_rec==degree-2)
1289  TreeMem[subtree].has_floats=true ;
1290 #endif
1291  if (weights_in_tree)
1292  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec] ;
1293  else
1294  if (weights[degree_rec]!=0.0)
1295  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec] ;
1296  else
1297  TreeMem[subtree].weight = 0.0 ;
1298  }
1299  add_example_to_tree_mismatch_recursion(subtree, i, alpha,
1300  &vec[1], len_rem-1,
1301  degree_rec+1, mismatch_rec, max_mismatch, weights) ;
1302 
1303  if (mismatch_rec+1<=max_mismatch)
1304  {
1305  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1306  for (int32_t o=0; o<3; o++)
1307  {
1308  int32_t ot = other[vec[0]][o] ;
1309  if (TreeMem[tree].children[ot]!=NO_CHILD)
1310  {
1311  subtree=TreeMem[tree].children[ot] ;
1312  if (weights_in_tree)
1313  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
1314  else
1315  if (weights[degree_rec]!=0.0)
1316  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
1317  }
1318  else
1319  {
1320  int32_t tmp = get_node(degree_rec==degree-2);
1321  ASSERT(tmp>=0)
1322  TreeMem[tree].children[ot]=tmp ;
1323  subtree=tmp ;
1324 #ifdef TRIE_CHECK_EVERYTHING
1325  if (degree_rec==degree-2)
1326  TreeMem[subtree].has_floats=true ;
1327 #endif
1328 
1329  if (weights_in_tree)
1330  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)] ;
1331  else
1332  if (weights[degree_rec]!=0.0)
1333  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec] ;
1334  else
1335  TreeMem[subtree].weight = 0.0 ;
1336  }
1337 
1338  add_example_to_tree_mismatch_recursion(subtree, i, alpha,
1339  &vec[1], len_rem-1,
1340  degree_rec+1, mismatch_rec+1, max_mismatch, weights) ;
1341  }
1342  }
1343  }
1344 }
1345 
1346  template <class Trie>
1348  int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
1349  int32_t max_degree, int32_t num_feat, int32_t num_sym, int32_t sym_offset,
1350  int32_t offs, float64_t* result)
1351 {
1352  if (i+j<num_feat)
1353  {
1354  float64_t decay=1.0; //no decay by default
1355  //if (j>d)
1356  // decay=pow(0.5,j); //marginalize out lower order matches
1357 
1358  if (j<degree-1)
1359  {
1360  for (int32_t k=0; k<num_sym; k++)
1361  {
1362  if (TreeMem[tree].children[k]!=NO_CHILD)
1363  {
1364  int32_t child=TreeMem[tree].children[k];
1365  //continue recursion if not yet at max_degree, else add to result
1366  if (d<max_degree-1)
1367  compute_scoring_helper(child, i, j+1, weight+decay*TreeMem[child].weight, d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
1368  else
1369  result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[child].weight;
1370 
1372  if (d==0)
1373  compute_scoring_helper(child, i, j+1, 0.0, 0, max_degree, num_feat, num_sym, sym_offset, offs, result);
1374  }
1375  }
1376  }
1377  else if (j==degree-1)
1378  {
1379  for (int32_t k=0; k<num_sym; k++)
1380  {
1381  //continue recursion if not yet at max_degree, else add to result
1382  if (d<max_degree-1 && i<num_feat-1)
1383  compute_scoring_helper(trees[i+1], i+1, 0, weight+decay*TreeMem[tree].child_weights[k], d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
1384  else
1385  result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[tree].child_weights[k];
1386  }
1387  }
1388  }
1389 }
1390 
1391  template <class Trie>
1393  int32_t tree, const int32_t p, struct TreeParseInfo info,
1394  const int32_t depth, int32_t* const x, const int32_t k)
1395 {
1396  const int32_t num_sym = info.num_sym;
1397  const int32_t y0 = info.y0;
1398  const int32_t y1 = (k==0) ? 0 : y0 - ( (depth<k) ? 0 : info.nofsKmers[k-1] * x[depth-k] );
1399  //const int32_t temp = info.substrs[depth]*num_sym - ( (depth<=k) ? 0 : info.nofsKmers[k] * x[depth-k-1] );
1400  //if( !( info.y0 == temp ) ) {
1401  // printf( "\n temp=%d y0=%d k=%d depth=%d \n", temp, info.y0, k, depth );
1402  //}
1403  //ASSERT( info.y0 == temp )
1404  int32_t sym;
1405  ASSERT( depth < degree )
1406  //ASSERT( 0 <= info.substrs[depth] && info.substrs[depth] < info.nofsKmers[k] )
1407  if (depth<degree-1)
1408  {
1409  for( sym=0; sym<num_sym; ++sym ) {
1410  const int32_t childNum = TreeMem[tree].children[ sym ];
1411  if( childNum != NO_CHILD ) {
1412  int32_t child = childNum ;
1413  x[depth] = sym;
1414  info.substrs[depth+1] = y0 + sym;
1415  info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
1416  //ASSERT( info.y0 == ( info.substrs[depth+1]*num_sym - ( (depth<k) ? 0 : info.nofsKmers[k] * x[depth-k] ) ) )
1417  count( TreeMem[child].weight, depth, info, p, x, k );
1418  traverse( child, p, info, depth+1, x, k );
1419  x[depth] = -1;
1420  }
1421  }
1422  }
1423  else if( depth == degree-1 )
1424  {
1425  for( sym=0; sym<num_sym; ++sym ) {
1426  const float64_t w = TreeMem[tree].child_weights[ sym ];
1427  if( w != 0.0 ) {
1428  x[depth] = sym;
1429  info.substrs[depth+1] = y0 + sym;
1430  info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
1431  //ASSERT( info.y0 == ( info.substrs[depth+1]*num_sym - ( (depth<k) ? 0 : info.nofsKmers[k] * x[depth-k] ) ) )
1432  count( w, depth, info, p, x, k );
1433  x[depth] = -1;
1434  }
1435  }
1436  }
1437  //info.substrs[depth+1] = -1;
1438  //info.y0 = temp;
1439 }
1440 
1441  template <class Trie>
1443  const float64_t w, const int32_t depth, const struct TreeParseInfo info,
1444  const int32_t p, int32_t* x, const int32_t k)
1445 {
1446  ASSERT( fabs(w) < 1e10 )
1447  ASSERT( x[depth] >= 0 )
1448  ASSERT( x[depth+1] < 0 )
1449  if ( depth < k ) {
1450  return;
1451  }
1452  //ASSERT( info.margFactors[ depth-k ] == pow( 0.25, depth-k ) )
1453  const int32_t nofKmers = info.nofsKmers[k];
1454  const float64_t margWeight = w * info.margFactors[ depth-k ];
1455  const int32_t m_a = depth - k + 1;
1456  const int32_t m_b = info.num_feat - p;
1457  const int32_t m = ( m_a < m_b ) ? m_a : m_b;
1458  // all proper k-substrings
1459  const int32_t offset0 = nofKmers * p;
1460  int32_t i;
1461  int32_t offset;
1462  offset = offset0;
1463  for( i = 0; i < m; ++i ) {
1464  const int32_t y = info.substrs[i+k+1];
1465  info.C_k[ y + offset ] += margWeight;
1466  offset += nofKmers;
1467  }
1468  if( depth > k ) {
1469  // k-prefix
1470  const int32_t offsR = info.substrs[k+1] + offset0;
1471  info.R_k[offsR] += margWeight;
1472  // k-suffix
1473  if( p+depth-k < info.num_feat ) {
1474  const int32_t offsL = info.substrs[depth+1] + nofKmers * (p+depth-k);
1475  info.L_k[offsL] += margWeight;
1476  }
1477  }
1478  // # N.x = substring represented by N
1479  // # N.d = length of N.x
1480  // # N.s = starting position of N.x
1481  // # N.w = weight for feature represented by N
1482  // if( N.d >= k )
1483  // margContrib = w / 4^(N.d-k)
1484  // for i = 1 to (N.d-k+1)
1485  // y = N.x[i:(i+k-1)] # overlapped k-mer
1486  // C_k[ N.s+i-1, y ] += margContrib
1487  // end;
1488  // if( N.d > k )
1489  // L_k[ N.s+N.d-k, N.x[N.d-k+(1:k)] ] += margContrib # j-suffix of N.x
1490  // R_k[ N.s, N.x[1:k] ] += margContrib # j-prefix of N.x
1491  // end;
1492  // end;
1493 }
1494 
1495  template <class Trie>
1497  int32_t i, int32_t seq_offset, int32_t * vec, float32_t alpha,
1498  float64_t *weights, bool degree_times_position_weights)
1499 {
1500  int32_t tree = trees[i] ;
1501  //ASSERT(seq_offset==0)
1502 
1503  int32_t max_depth = 0 ;
1504  float64_t* weights_column ;
1505  if (degree_times_position_weights)
1506  weights_column = &weights[(i+seq_offset)*degree] ;
1507  else
1508  weights_column = weights ;
1509 
1510  if (weights_in_tree)
1511  {
1512  for (int32_t j=0; (j<degree) && (i+j<length); j++)
1513  if (CMath::abs(weights_column[j]*alpha)>0)
1514  max_depth = j+1 ;
1515  }
1516  else
1517  // don't use the weights
1518  max_depth=degree ;
1519 
1520  for (int32_t j=0; (j<max_depth) && (i+j+seq_offset<length); j++)
1521  {
1522  TRIE_ASSERT((vec[i+j+seq_offset]>=0) && (vec[i+j+seq_offset]<4))
1523  if ((j<degree-1) && (TreeMem[tree].children[vec[i+j+seq_offset]]!=NO_CHILD))
1524  {
1525  if (TreeMem[tree].children[vec[i+j+seq_offset]]<0)
1526  {
1527  // special treatment of the next nodes
1528  TRIE_ASSERT(j >= degree-16)
1529  // get the right element
1530  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1531  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1532  int32_t node = - TreeMem[tree].children[vec[i+j+seq_offset]] ;
1533 
1534  TRIE_ASSERT((node>=0) && (node<=TreeMemPtrMax))
1535  TRIE_ASSERT_EVERYTHING(TreeMem[node].has_seq)
1536  TRIE_ASSERT_EVERYTHING(!TreeMem[node].has_floats)
1537 
1538  // check whether the same string is stored
1539  int32_t mismatch_pos = -1 ;
1540  {
1541  int32_t k ;
1542  for (k=0; (j+k<max_depth) && (i+j+seq_offset+k<length); k++)
1543  {
1544  TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4))
1545  // ###
1546  if ((TreeMem[node].seq[k]>=4) && (TreeMem[node].seq[k]!=TRIE_TERMINAL_CHARACTER))
1547  fprintf(stderr, "+++i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[k]) ;
1548  TRIE_ASSERT((TreeMem[node].seq[k]<4) || (TreeMem[node].seq[k]==TRIE_TERMINAL_CHARACTER))
1549  TRIE_ASSERT(k<16)
1550  if (TreeMem[node].seq[k]!=vec[i+j+seq_offset+k])
1551  {
1552  mismatch_pos=k ;
1553  break ;
1554  }
1555  }
1556  }
1557 
1558  // what happens when the .seq sequence is longer than vec? should we branch???
1559 
1560  if (mismatch_pos==-1)
1561  // if so, then just increase the weight by alpha and stop
1562  TreeMem[node].weight+=alpha ;
1563  else
1564  // otherwise
1565  // 1. replace current node with new node
1566  // 2. create new nodes until mismatching positon
1567  // 2. add a branch with old string (old node) and the new string (new node)
1568  {
1569  // replace old node
1570  int32_t last_node=tree ;
1571 
1572  // create new nodes until mismatch
1573  int32_t k ;
1574  for (k=0; k<mismatch_pos; k++)
1575  {
1576  TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4))
1577  TRIE_ASSERT(vec[i+j+seq_offset+k]==TreeMem[node].seq[k])
1578 
1579  int32_t tmp=get_node();
1580  TreeMem[last_node].children[vec[i+j+seq_offset+k]]=tmp ;
1581  last_node=tmp ;
1582  if (weights_in_tree)
1583  TreeMem[last_node].weight = (TreeMem[node].weight+alpha)*weights_column[j+k] ;
1584  else
1585  TreeMem[last_node].weight = (TreeMem[node].weight+alpha) ;
1586  TRIE_ASSERT(j+k!=degree-1)
1587  }
1588  if ((TreeMem[node].seq[mismatch_pos]>=4) && (TreeMem[node].seq[mismatch_pos]!=TRIE_TERMINAL_CHARACTER))
1589  fprintf(stderr, "**i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[mismatch_pos]) ;
1590  ASSERT((TreeMem[node].seq[mismatch_pos]<4) || (TreeMem[node].seq[mismatch_pos]==TRIE_TERMINAL_CHARACTER))
1591  TRIE_ASSERT(vec[i+j+seq_offset+mismatch_pos]!=TreeMem[node].seq[mismatch_pos])
1592 
1593  if (j+k==degree-1)
1594  {
1595  // init child weights with zero if after dropping out
1596  // of the k<mismatch_pos loop we are one level below degree
1597  // (keep this even after get_node() change!)
1598  for (int32_t q=0; q<4; q++)
1599  TreeMem[last_node].child_weights[q]=0.0 ;
1600  if (weights_in_tree)
1601  {
1602  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1603  TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]+=TreeMem[node].weight*weights_column[degree-1] ;
1604  TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] += alpha*weights_column[degree-1] ;
1605  }
1606  else
1607  {
1608  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1609  TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]=TreeMem[node].weight ;
1610  TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] = alpha ;
1611  }
1612 
1613 #ifdef TRIE_CHECK_EVERYTHING
1614  TreeMem[last_node].has_floats=true ;
1615 #endif
1616  }
1617  else
1618  {
1619  // the branch for the existing string
1620  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1621  {
1622  TreeMem[last_node].children[TreeMem[node].seq[mismatch_pos]] = -node ;
1623 
1624  // move string by mismatch_pos positions
1625  for (int32_t q=0; q<16; q++)
1626  {
1627  if ((j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length))
1628  TreeMem[node].seq[q] = TreeMem[node].seq[q+mismatch_pos] ;
1629  else
1630  TreeMem[node].seq[q] = TRIE_TERMINAL_CHARACTER ;
1631  }
1632 #ifdef TRIE_CHECK_EVERYTHING
1633  TreeMem[node].has_seq=true ;
1634 #endif
1635  }
1636 
1637  // the new branch
1638  TRIE_ASSERT((vec[i+j+seq_offset+mismatch_pos]>=0) && (vec[i+j+seq_offset+mismatch_pos]<4))
1639  int32_t tmp = get_node() ;
1640  TreeMem[last_node].children[vec[i+j+seq_offset+mismatch_pos]] = -tmp ;
1641  last_node=tmp ;
1642 
1643  TreeMem[last_node].weight = alpha ;
1644 #ifdef TRIE_CHECK_EVERYTHING
1645  TreeMem[last_node].has_seq = true ;
1646 #endif
1647  memset(TreeMem[last_node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
1648  for (int32_t q=0; (j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length); q++)
1649  TreeMem[last_node].seq[q] = vec[i+j+seq_offset+mismatch_pos+q] ;
1650  }
1651  }
1652  break ;
1653  }
1654  else
1655  {
1656  tree=TreeMem[tree].children[vec[i+j+seq_offset]] ;
1657  TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax))
1658  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1659  if (weights_in_tree)
1660  TreeMem[tree].weight += alpha*weights_column[j];
1661  else
1662  TreeMem[tree].weight += alpha ;
1663  }
1664  }
1665  else if (j==degree-1)
1666  {
1667  // special treatment of the last node
1668  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1669  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats)
1670  if (weights_in_tree)
1671  TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha*weights_column[j] ;
1672  else
1673  TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha;
1674 
1675  break;
1676  }
1677  else
1678  {
1679  bool use_seq = use_compact_terminal_nodes && (j>degree-16) ;
1680  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1681  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1682 
1683  int32_t tmp = get_node((j==degree-2) && (!use_seq));
1684  if (use_seq)
1685  TreeMem[tree].children[vec[i+j+seq_offset]] = -tmp ;
1686  else
1687  TreeMem[tree].children[vec[i+j+seq_offset]] = tmp ;
1688  tree=tmp ;
1689 
1690  TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax))
1691 #ifdef TRIE_CHECK_EVERYTHING
1692  TreeMem[tree].has_seq = use_seq ;
1693 #endif
1694  if (use_seq)
1695  {
1696  TreeMem[tree].weight = alpha ;
1697  // important to have the terminal characters (see ###)
1698  memset(TreeMem[tree].seq, TRIE_TERMINAL_CHARACTER, 16) ;
1699  for (int32_t q=0; (j+q<degree) && (i+j+seq_offset+q<length); q++)
1700  {
1701  TRIE_ASSERT(q<16)
1702  TreeMem[tree].seq[q]=vec[i+j+seq_offset+q] ;
1703  }
1704  break ;
1705  }
1706  else
1707  {
1708  if (weights_in_tree)
1709  TreeMem[tree].weight = alpha*weights_column[j] ;
1710  else
1711  TreeMem[tree].weight = alpha ;
1712 #ifdef TRIE_CHECK_EVERYTHING
1713  if (j==degree-2)
1714  TreeMem[tree].has_floats = true ;
1715 #endif
1716  }
1717  }
1718  }
1719 }
1720 
1721  template <class Trie>
1723  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
1724  int32_t weight_pos, float64_t* weights,
1725  bool degree_times_position_weights)
1726 {
1727  int32_t tree = trees[tree_pos] ;
1728 
1729  if ((position_weights!=NULL) && (position_weights[weight_pos]==0))
1730  return 0.0;
1731 
1732  float64_t *weights_column=NULL ;
1733  if (degree_times_position_weights)
1734  weights_column=&weights[weight_pos*degree] ;
1735  else // weights is a vector (1 x degree)
1736  weights_column=weights ;
1737 
1738  float64_t sum=0 ;
1739  for (int32_t j=0; seq_pos+j < len; j++)
1740  {
1741  TRIE_ASSERT((vec[seq_pos+j]<4) && (vec[seq_pos+j]>=0))
1742 
1743  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1744  {
1745  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1746  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1747  {
1748  tree = - TreeMem[tree].children[vec[seq_pos+j]];
1749  TRIE_ASSERT(tree>=0)
1750  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1751  float64_t this_weight=0.0 ;
1752  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1753  {
1754  TRIE_ASSERT((vec[seq_pos+j+k]<4) && (vec[seq_pos+j+k]>=0))
1755  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1756  break ;
1757  this_weight += weights_column[j+k] ;
1758  }
1759  sum += TreeMem[tree].weight * this_weight ;
1760  break ;
1761  }
1762  else
1763  {
1764  tree=TreeMem[tree].children[vec[seq_pos+j]];
1765  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1766  if (weights_in_tree)
1767  sum += TreeMem[tree].weight ;
1768  else
1769  sum += TreeMem[tree].weight * weights_column[j] ;
1770  } ;
1771  }
1772  else
1773  {
1774  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1775  if (j==degree-1)
1776  {
1777  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats)
1778  if (weights_in_tree)
1779  sum += TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1780  else
1781  sum += TreeMem[tree].child_weights[vec[seq_pos+j]] * weights_column[j] ;
1782  }
1783  else
1784  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1785 
1786  break;
1787  }
1788  }
1789 
1790  if (position_weights!=NULL)
1791  return sum*position_weights[weight_pos] ;
1792  else
1793  return sum ;
1794 }
1795 
1796  template <class Trie>
1798  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
1799  int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
1800  int32_t mkl_stepsize, float64_t * weights,
1801  bool degree_times_position_weights)
1802 {
1803  int32_t tree = trees[tree_pos] ;
1804  if (factor==0)
1805  return ;
1806 
1807  if (position_weights!=NULL)
1808  {
1809  factor *= position_weights[weight_pos] ;
1810  if (factor==0)
1811  return ;
1812  if (!degree_times_position_weights) // with position_weigths, weights is a vector (1 x degree)
1813  {
1814  for (int32_t j=0; seq_pos+j<len; j++)
1815  {
1816  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1817  {
1818  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1819  {
1820  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1821  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1822  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1823  {
1824  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1825  break ;
1826  if (weights_in_tree)
1827  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1828  else
1829  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
1830  }
1831  break ;
1832  }
1833  else
1834  {
1835  tree=TreeMem[tree].children[vec[seq_pos+j]];
1836  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1837  if (weights_in_tree)
1838  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1839  else
1840  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
1841  }
1842  }
1843  else
1844  {
1845  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1846  if (j==degree-1)
1847  {
1848  if (weights_in_tree)
1849  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1850  else
1851  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
1852  }
1853  }
1854  }
1855  }
1856  else // with position_weigths, weights is a matrix (len x degree)
1857  {
1858  for (int32_t j=0; seq_pos+j<len; j++)
1859  {
1860  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1861  {
1862  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1863  {
1864  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1865  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1866  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1867  {
1868  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1869  break ;
1870  if (weights_in_tree)
1871  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1872  else
1873  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
1874  }
1875  break ;
1876  }
1877  else
1878  {
1879  tree=TreeMem[tree].children[vec[seq_pos+j]];
1880  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1881  if (weights_in_tree)
1882  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1883  else
1884  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+weight_pos*degree] ;
1885  }
1886  }
1887  else
1888  {
1889  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1890  if (j==degree-1)
1891  {
1892  if (weights_in_tree)
1893  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1894  else
1895  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j+weight_pos*degree] ;
1896  }
1897  break ;
1898  }
1899  }
1900  }
1901  }
1902  else if (!degree_times_position_weights) // no position_weigths, weights is a vector (1 x degree)
1903  {
1904  for (int32_t j=0; seq_pos+j<len; j++)
1905  {
1906  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1907  {
1908  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1909  {
1910  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1911  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1912  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1913  {
1914  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1915  break ;
1916  if (weights_in_tree)
1917  LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight ;
1918  else
1919  LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
1920  }
1921  break ;
1922  }
1923  else
1924  {
1925  tree=TreeMem[tree].children[vec[seq_pos+j]];
1926  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1927  if (weights_in_tree)
1928  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight ;
1929  else
1930  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
1931  }
1932  }
1933  else
1934  {
1935  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1936  if (j==degree-1)
1937  {
1938  if (weights_in_tree)
1939  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1940  else
1941  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
1942  }
1943  break ;
1944  }
1945  }
1946  }
1947  else // no position_weigths, weights is a matrix (len x degree)
1948  {
1949  /*if (!position_mask)
1950  {
1951  position_mask = SG_MALLOC(bool, len);
1952  for (int32_t i=0; i<len; i++)
1953  {
1954  position_mask[i]=false ;
1955 
1956  for (int32_t j=0; j<degree; j++)
1957  if (weights[i*degree+j]!=0.0)
1958  {
1959  position_mask[i]=true ;
1960  break ;
1961  }
1962  }
1963  }
1964  if (position_mask[weight_pos]==0)
1965  return ;*/
1966 
1967  for (int32_t j=0; seq_pos+j<len; j++)
1968  {
1969  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1970  {
1971  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1972  {
1973  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1974  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1975  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1976  {
1977  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1978  break ;
1979  if (weights_in_tree)
1980  LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight ;
1981  else
1982  LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
1983  }
1984  break ;
1985  }
1986  else
1987  {
1988  tree=TreeMem[tree].children[vec[seq_pos+j]];
1989  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1990  if (weights_in_tree)
1991  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight ;
1992  else
1993  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight * weights[j+weight_pos*degree] ;
1994  }
1995  }
1996  else
1997  {
1998  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1999  if (j==degree-1)
2000  {
2001  if (weights_in_tree)
2002  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] ;
2003  else
2004  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] * weights[j+weight_pos*degree] ;
2005  }
2006  break ;
2007  }
2008  }
2009  }
2010 }
2011 
2012  template <class Trie>
2014  Trie* tree, int32_t depth, uint64_t seq, float64_t value,
2015  DynArray<ConsensusEntry>* table, float64_t* weights)
2016 {
2017  float64_t w=1.0;
2018 
2019  if (weights_in_tree || depth==0)
2020  value+=tree->weight;
2021  else
2022  {
2023  w=weights[degree-1];
2024  value+=weights[depth-1]*tree->weight;
2025  }
2026 
2027  if (degree-1==depth)
2028  {
2029  for (int32_t sym=0; sym<4; sym++)
2030  {
2031  float64_t v=w*tree->child_weights[sym];
2032  if (v!=0.0)
2033  {
2034  ConsensusEntry entry;
2035  entry.bt=-1;
2036  entry.score=value+v;
2037  entry.string=seq | ((uint64_t) sym) << (2*(degree-depth-1));
2038 
2039  table->append_element(entry);
2040  }
2041  }
2042  }
2043  else
2044  {
2045  for (int32_t sym=0; sym<4; sym++)
2046  {
2047  uint64_t str=seq | ((uint64_t) sym) << (2*(degree-depth-1));
2048  if (tree->children[sym] != NO_CHILD)
2049  fill_backtracking_table_recursion(&TreeMem[tree->children[sym]], depth+1, str, value, table, weights);
2050  }
2051  }
2052 }
2053 
2054  template <class Trie>
2056  int32_t pos, uint64_t seq, int32_t deg, float64_t* weights)
2057 {
2058  float64_t result=0.0;
2059 
2060  //SG_PRINT("pos:%i length:%i deg:%i seq:0x%0llx...\n", pos, length, deg, seq)
2061 
2062  for (int32_t i=pos; i<pos+deg && i<length; i++)
2063  {
2064  //SG_PRINT("loop %d\n", i)
2065  Trie* tree = &TreeMem[trees[i]];
2066 
2067  for (int32_t d=0; d<deg-i+pos; d++)
2068  {
2069  //SG_PRINT("loop degree %d shit: %d\n", d, (2*(deg-1-d-i+pos)))
2070  ASSERT(d-1<degree)
2071  int32_t sym = (int32_t) (seq >> (2*(deg-1-d-i+pos)) & 3);
2072 
2073  float64_t w=1.0;
2074  if (!weights_in_tree)
2075  w=weights[d];
2076 
2077  ASSERT(tree->children[sym] != NO_CHILD)
2078  tree=&TreeMem[tree->children[sym]];
2079  result+=w*tree->weight;
2080  }
2081  }
2082  //SG_PRINT("cum: %f\n", result)
2083  return result;
2084 }
2085 
2086  template <class Trie>
2088  int32_t pos, DynArray<ConsensusEntry>* prev,
2089  DynArray<ConsensusEntry>* cur, bool cumulative, float64_t* weights)
2090 {
2091  ASSERT(pos>=0 && pos<length)
2092  ASSERT(!use_compact_terminal_nodes)
2093 
2094  Trie* t = &TreeMem[trees[pos]];
2095 
2096  fill_backtracking_table_recursion(t, 0, (uint64_t) 0, 0.0, cur, weights);
2097 
2098 
2099  if (cumulative)
2100  {
2101  int32_t num_cur=cur->get_num_elements();
2102  for (int32_t i=0; i<num_cur; i++)
2103  {
2104  ConsensusEntry entry=cur->get_element(i);
2105  entry.score+=get_cumulative_score(pos+1, entry.string, degree-1, weights);
2106  cur->set_element(entry,i);
2107  //SG_PRINT("cum: str:0%0llx sc:%f bt:%d\n",entry.string,entry.score,entry.bt)
2108  }
2109  }
2110 
2111  //if previous tree exists find maximum scoring path
2112  //for each element in cur and update bt table
2113  if (prev)
2114  {
2115  int32_t num_cur=cur->get_num_elements();
2116  int32_t num_prev=prev->get_num_elements();
2117 
2118  for (int32_t i=0; i<num_cur; i++)
2119  {
2120  //uint64_t str_cur_old= cur->get_element(i).string;
2121  uint64_t str_cur= cur->get_element(i).string >> 2;
2122  //SG_PRINT("...cur:0x%0llx cur_noprfx:0x%0llx...\n", str_cur_old, str_cur)
2123 
2124  int32_t bt=-1;
2125  float64_t max_score=0.0;
2126 
2127  for (int32_t j=0; j<num_prev; j++)
2128  {
2129  //uint64_t str_prev_old= prev->get_element(j).string;
2130  uint64_t mask=
2131  ((((uint64_t)0)-1) ^ (((uint64_t) 3) << (2*(degree-1))));
2132  uint64_t str_prev= mask & prev->get_element(j).string;
2133  //SG_PRINT("...prev:0x%0llx prev_nosfx:0x%0llx mask:%0llx...\n", str_prev_old, str_prev,mask)
2134 
2135  if (str_cur == str_prev)
2136  {
2137  float64_t sc=prev->get_element(j).score+cur->get_element(i).score;
2138  if (bt==-1 || sc>max_score)
2139  {
2140  bt=j;
2141  max_score=sc;
2142 
2143  //SG_PRINT("new_max[%i,%i] = %f\n", j,i, max_score)
2144  }
2145  }
2146  }
2147 
2148  ASSERT(bt!=-1)
2149  ConsensusEntry entry;
2150  entry.bt=bt;
2151  entry.score=max_score;
2152  entry.string=cur->get_element(i).string;
2153  cur->set_element(entry, i);
2154  //SG_PRINT("entry[%d]: str:0%0llx sc:%f bt:%d\n",i, entry.string,entry.score,entry.bt)
2155  }
2156  }
2157 }
2158 }
2159 #endif // _TRIE_H___

SHOGUN Machine Learning Toolbox - Documentation