SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Trie.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2009 Gunnar Raetsch
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #ifndef _TRIE_H___
13 #define _TRIE_H___
14 
15 #include <string.h>
16 #include <shogun/lib/common.h>
17 #include <shogun/io/SGIO.h>
18 #include <shogun/base/DynArray.h>
20 #include <shogun/base/SGObject.h>
21 
22 namespace shogun
23 {
24 #ifndef DOXYGEN_SHOULD_SKIP_THIS
25 
26 // sentinel is 0xFFFFFFFC or float -2
27 #define NO_CHILD ((int32_t)-1073741824)
28 
29 #define WEIGHTS_IN_TRIE
30 //#define TRIE_CHECK_EVERYTHING
31 
32 #ifdef TRIE_CHECK_EVERYTHING
33 #define TRIE_ASSERT_EVERYTHING(x) ASSERT(x)
34 #else
35 #define TRIE_ASSERT_EVERYTHING(x)
36 #endif
37 
38 //#define TRIE_ASSERT(x) ASSERT(x)
39 #define TRIE_ASSERT(x)
40 
41 #define TRIE_TERMINAL_CHARACTER 7
42 
44 struct ConsensusEntry
45 {
47  uint64_t string;
49  float32_t score;
51  int32_t bt;
52 };
53 
55 struct POIMTrie
56 {
58  float64_t weight;
59 #ifdef TRIE_CHECK_EVERYTHING
60 
61  bool has_seq;
63  bool has_floats;
64 #endif
65  union
66  {
68  float32_t child_weights[4];
70  int32_t children[4];
72  uint8_t seq[16] ;
73  };
74 
76  float64_t S;
78  float64_t L;
80  float64_t R;
81 };
82 
84 struct DNATrie
85 {
87  float64_t weight;
88 #ifdef TRIE_CHECK_EVERYTHING
89 
90  bool has_seq;
92  bool has_floats;
93 #endif
94  union
95  {
97  float32_t child_weights[4];
99  int32_t children[4];
101  uint8_t seq[16] ;
102  };
103 };
104 
106 struct TreeParseInfo {
108  int32_t num_sym;
110  int32_t num_feat;
112  int32_t p;
114  int32_t k;
116  int32_t* nofsKmers;
118  float64_t* margFactors;
120  int32_t* x;
122  int32_t* substrs;
124  int32_t y0;
126  float64_t* C_k;
128  float64_t* L_k;
130  float64_t* R_k;
131 };
132 
133 #endif // DOXYGEN_SHOULD_SKIP_THIS
134 
135 template <class Trie> class CTrie;
136 
137 #define IGNORE_IN_CLASSLIST
138 
156 IGNORE_IN_CLASSLIST template <class Trie> class CTrie : public CSGObject
157 {
158  public:
160  CTrie();
161 
168  CTrie(int32_t d, bool p_use_compact_terminal_nodes=true);
169 
171  CTrie(const CTrie & to_copy);
172  virtual ~CTrie();
173 
175  const CTrie & operator=(const CTrie & to_copy);
176 
184  bool compare_traverse(
185  int32_t node, const CTrie & other, int32_t other_node);
186 
192  bool compare(const CTrie & other);
193 
200  bool find_node(int32_t node, int32_t * trace, int32_t &trace_len) const;
201 
208  int32_t find_deepest_node(
209  int32_t start_node, int32_t &deepest_node) const;
210 
215  void display_node(int32_t node) const;
216 
218  void destroy();
219 
224  void set_degree(int32_t d);
225 
232  void create(int32_t len, bool p_use_compact_terminal_nodes=true);
233 
239  void delete_trees(bool p_use_compact_terminal_nodes=true);
240 
251  void add_to_trie(
252  int32_t i, int32_t seq_offset, int32_t* vec, float32_t alpha,
253  float64_t *weights, bool degree_times_position_weights);
254 
261  float64_t compute_abs_weights_tree(int32_t tree, int32_t depth);
262 
268  float64_t* compute_abs_weights(int32_t &len);
269 
283  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
284  int32_t weight_pos, float64_t * weights,
285  bool degree_times_position_weights) ;
286 
302  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
303  int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
304  int32_t mkl_stepsize, float64_t * weights,
305  bool degree_times_position_weights);
306 
322  int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
323  int32_t max_degree, int32_t num_feat, int32_t num_sym,
324  int32_t sym_offset, int32_t offs, float64_t* result);
325 
339  int32_t tree, int32_t i, float64_t alpha, int32_t *vec,
340  int32_t len_rem, int32_t degree_rec, int32_t mismatch_rec,
341  int32_t max_mismatch, float64_t * weights);
342 
352  void traverse(
353  int32_t tree, const int32_t p, struct TreeParseInfo info,
354  const int32_t depth, int32_t* const x, const int32_t k);
355 
365  void count(
366  const float64_t w, const int32_t depth,
367  const struct TreeParseInfo info, const int32_t p, int32_t* x,
368  const int32_t k);
369 
376  int32_t compact_nodes(int32_t start_node, int32_t depth, float64_t * weights);
377 
387  int32_t pos, uint64_t seq, int32_t deg, float64_t* weights);
388 
399  Trie* tree, int32_t depth, uint64_t seq, float64_t value,
400  DynArray<ConsensusEntry>* table, float64_t* weights);
401 
411  int32_t pos, DynArray<ConsensusEntry>* prev,
412  DynArray<ConsensusEntry>* cur, bool cumulative,
413  float64_t* weights);
414 
420  void POIMs_extract_W(float64_t* const* const W, const int32_t K);
421 
426  void POIMs_precalc_SLR(const float64_t* const distrib);
427 
437  void POIMs_get_SLR(
438  const int32_t parentIdx, const int32_t sym, const int32_t depth,
439  float64_t* S, float64_t* L, float64_t* R);
440 
447  void POIMs_add_SLR(
448  float64_t* const* const poims, const int32_t K,
449  const int32_t debug);
450 
456  {
458  }
459 
466  bool p_use_compact_terminal_nodes)
467  {
468  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
469  }
470 
475  inline int32_t get_num_used_nodes()
476  {
477  return TreeMemPtr;
478  }
479 
484  inline void set_position_weights(float64_t* p_position_weights)
485  {
486  position_weights=p_position_weights;
487  }
488 
493  inline int32_t get_node(bool last_node=false)
494  {
495  int32_t ret = TreeMemPtr++;
496  check_treemem() ;
497 
498  if (last_node)
499  {
500  for (int32_t q=0; q<4; q++)
501  TreeMem[ret].child_weights[q]=0.0;
502  }
503  else
504  {
505  for (int32_t q=0; q<4; q++)
506  TreeMem[ret].children[q]=NO_CHILD;
507  }
508 #ifdef TRIE_CHECK_EVERYTHING
509  TreeMem[ret].has_seq=false ;
510  TreeMem[ret].has_floats=false ;
511 #endif
512  TreeMem[ret].weight=0.0;
513  return ret ;
514  }
515 
517  inline void check_treemem()
518  {
519  if (TreeMemPtr+10 < TreeMemPtrMax)
520  return;
521  SG_DEBUG("Extending TreeMem from %i to %i elements\n",
522  TreeMemPtrMax, (int32_t) ((float64_t)TreeMemPtrMax*1.2));
523  int32_t old_sz=TreeMemPtrMax;
524  TreeMemPtrMax = (int32_t) ((float64_t)TreeMemPtrMax*1.2);
525  TreeMem = SG_REALLOC(Trie, TreeMem, old_sz, TreeMemPtrMax);
526  }
527 
532  inline void set_weights_in_tree(bool weights_in_tree_)
533  {
534  weights_in_tree = weights_in_tree_;
535  }
536 
541  inline bool get_weights_in_tree()
542  {
543  return weights_in_tree;
544  }
545 
556  const int32_t nodeIdx, const int32_t depth, const int32_t offset,
557  const int32_t y0, float64_t* const* const W, const int32_t K);
558 
572  const float64_t* const distrib, const int32_t i,
573  const int32_t nodeIdx, int32_t left_tries_idx[4],
574  const int32_t depth, int32_t const lastSym, float64_t* S,
575  float64_t* L, float64_t* R);
576 
577 
589  const float64_t* const distrib, const int32_t i,
590  const int32_t nodeIdx, int32_t left_tries_idx[4],
591  const int32_t depth, float64_t* S, float64_t* L, float64_t* R);
592 
604  const int32_t nodeIdx, const int32_t depth,const int32_t i,
605  const int32_t y0, float64_t* const* const poims, const int32_t K,
606  const int32_t debug);
607 
622  float64_t* const* const poims, const int32_t K, const int32_t k,
623  const int32_t i, const int32_t y, const float64_t valW,
624  const float64_t valS, const float64_t valL, const float64_t valR,
625  const int32_t debug);
626 
628  virtual const char* get_name() const { return "Trie"; }
629 
630  public:
632  int32_t NUM_SYMS;
633 
634  protected:
636  int32_t length;
638  int32_t * trees;
639 
641  int32_t degree;
644 
646  Trie* TreeMem;
648  int32_t TreeMemPtr;
650  int32_t TreeMemPtrMax;
653 
656 
658  int32_t* nofsKmers;
659 };
660  template <class Trie>
662  : CSGObject(), degree(0), position_weights(NULL),
663  use_compact_terminal_nodes(false),
664  weights_in_tree(true)
665  {
666 
667  TreeMemPtrMax=0;
668  TreeMemPtr=0;
669  TreeMem=NULL;
670 
671  length=0;
672  trees=NULL;
673 
674  NUM_SYMS=4;
675  }
676 
677  template <class Trie>
678  CTrie<Trie>::CTrie(int32_t d, bool p_use_compact_terminal_nodes)
679  : CSGObject(), degree(d), position_weights(NULL),
680  use_compact_terminal_nodes(p_use_compact_terminal_nodes),
681  weights_in_tree(true)
682  {
683  TreeMemPtrMax=1024*1024/sizeof(Trie);
684  TreeMemPtr=0;
685  TreeMem=SG_MALLOC(Trie, TreeMemPtrMax);
686 
687  length=0;
688  trees=NULL;
689 
690  NUM_SYMS=4;
691  }
692 
693  template <class Trie>
694  CTrie<Trie>::CTrie(const CTrie & to_copy)
695  : CSGObject(to_copy), degree(to_copy.degree), position_weights(NULL),
696  use_compact_terminal_nodes(to_copy.use_compact_terminal_nodes)
697  {
698  if (to_copy.position_weights!=NULL)
699  {
701  /*SG_MALLOC(float64_t, to_copy.length);
702  for (int32_t i=0; i<to_copy.length; i++)
703  position_weights[i]=to_copy.position_weights[i]; */
704  }
705  else
706  position_weights=NULL;
707 
709  TreeMemPtr=to_copy.TreeMemPtr;
710  TreeMem=SG_MALLOC(Trie, TreeMemPtrMax);
711  memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie));
712 
713  length=to_copy.length;
714  trees=SG_MALLOC(int32_t, length);
715  for (int32_t i=0; i<length; i++)
716  trees[i]=to_copy.trees[i];
717 
718  NUM_SYMS=4;
719  }
720 
721  template <class Trie>
723 {
724  degree=to_copy.degree ;
725  use_compact_terminal_nodes=to_copy.use_compact_terminal_nodes ;
726 
727  SG_FREE(position_weights);
728  position_weights=NULL ;
729  if (to_copy.position_weights!=NULL)
730  {
731  position_weights=to_copy.position_weights ;
732  /*position_weights = SG_MALLOC(float64_t, to_copy.length);
733  for (int32_t i=0; i<to_copy.length; i++)
734  position_weights[i]=to_copy.position_weights[i] ;*/
735  }
736  else
737  position_weights=NULL ;
738 
739  TreeMemPtrMax=to_copy.TreeMemPtrMax ;
740  TreeMemPtr=to_copy.TreeMemPtr ;
741  SG_FREE(TreeMem) ;
742  TreeMem = SG_MALLOC(Trie, TreeMemPtrMax);
743  memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie)) ;
744 
745  length = to_copy.length ;
746  if (trees)
747  SG_FREE(trees);
748  trees=SG_MALLOC(int32_t, length);
749  for (int32_t i=0; i<length; i++)
750  trees[i]=to_copy.trees[i] ;
751 
752  return *this ;
753 }
754 
755 template <class Trie>
757  int32_t start_node, int32_t& deepest_node) const
758 {
759 #ifdef TRIE_CHECK_EVERYTHING
760  int32_t ret=0 ;
761  SG_DEBUG("start_node=%i\n", start_node)
762 
763  if (start_node==NO_CHILD)
764  {
765  for (int32_t i=0; i<length; i++)
766  {
767  int32_t my_deepest_node ;
768  int32_t depth=find_deepest_node(i, my_deepest_node) ;
769  SG_DEBUG("start_node %i depth=%i\n", i, depth)
770  if (depth>ret)
771  {
772  deepest_node=my_deepest_node ;
773  ret=depth ;
774  }
775  }
776  return ret ;
777  }
778 
779  if (TreeMem[start_node].has_seq)
780  {
781  for (int32_t q=0; q<16; q++)
782  if (TreeMem[start_node].seq[q]!=TRIE_TERMINAL_CHARACTER)
783  ret++ ;
784  deepest_node=start_node ;
785  return ret ;
786  }
787  if (TreeMem[start_node].has_floats)
788  {
789  deepest_node=start_node ;
790  return 1 ;
791  }
792 
793  for (int32_t q=0; q<4; q++)
794  {
795  int32_t my_deepest_node ;
796  if (TreeMem[start_node].children[q]==NO_CHILD)
797  continue ;
798  int32_t depth=find_deepest_node(abs(TreeMem[start_node].children[q]), my_deepest_node) ;
799  if (depth>ret)
800  {
801  deepest_node=my_deepest_node ;
802  ret=depth ;
803  }
804  }
805  return ret ;
806 #else
807  SG_ERROR("not implemented\n")
808  return 0 ;
809 #endif
810 }
811 
812  template <class Trie>
814  int32_t start_node, int32_t depth, float64_t * weights)
815 {
816  SG_ERROR("code buggy\n")
817 
818  int32_t ret=0 ;
819 
820  if (start_node==NO_CHILD)
821  {
822  for (int32_t i=0; i<length; i++)
823  compact_nodes(i,1, weights) ;
824  return 0 ;
825  }
826  if (start_node<0)
827  return -1 ;
828 
829  if (depth==degree-1)
830  {
831  TRIE_ASSERT_EVERYTHING(TreeMem[start_node].has_floats)
832  int32_t num_used=0 ;
833  for (int32_t q=0; q<4; q++)
834  if (TreeMem[start_node].child_weights[q]!=0.0)
835  num_used++ ;
836  if (num_used>1)
837  return -1 ;
838  return 1 ;
839  }
840  TRIE_ASSERT_EVERYTHING(!TreeMem[start_node].has_floats)
841 
842  int32_t num_used = 0 ;
843  int32_t q_used=-1 ;
844 
845  for (int32_t q=0; q<4; q++)
846  {
847  if (TreeMem[start_node].children[q]==NO_CHILD)
848  continue ;
849  num_used++ ;
850  q_used=q ;
851  }
852  if (num_used>1)
853  {
854  if (depth>=degree-2)
855  return -1 ;
856  for (int32_t q=0; q<4; q++)
857  {
858  if (TreeMem[start_node].children[q]==NO_CHILD)
859  continue ;
860  int32_t num=compact_nodes(abs(TreeMem[start_node].children[q]), depth+1, weights) ;
861  if (num<=2)
862  continue ;
863  int32_t node=get_node() ;
864 
865  int32_t last_node=TreeMem[start_node].children[q] ;
866  if (weights_in_tree)
867  {
868  ASSERT(weights[depth]!=0.0)
869  TreeMem[node].weight=TreeMem[last_node].weight/weights[depth] ;
870  }
871  else
872  TreeMem[node].weight=TreeMem[last_node].weight ;
873 
874 #ifdef TRIE_CHECK_EVERYTHING
875  TreeMem[node].has_seq=true ;
876 #endif
877  memset(TreeMem[node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
878  for (int32_t n=0; n<num; n++)
879  {
880  ASSERT(depth+n+1<=degree-1)
881  ASSERT(last_node!=NO_CHILD)
882  if (depth+n+1==degree-1)
883  {
884  TRIE_ASSERT_EVERYTHING(TreeMem[last_node].has_floats)
885  int32_t k ;
886  for (k=0; k<4; k++)
887  if (TreeMem[last_node].child_weights[k]!=0.0)
888  break ;
889  if (k==4)
890  break ;
891  TreeMem[node].seq[n]=k ;
892  break ;
893  }
894  else
895  {
896  TRIE_ASSERT_EVERYTHING(!TreeMem[last_node].has_floats)
897  int32_t k ;
898  for (k=0; k<4; k++)
899  if (TreeMem[last_node].children[k]!=NO_CHILD)
900  break ;
901  if (k==4)
902  break ;
903  TreeMem[node].seq[n]=k ;
904  last_node=TreeMem[last_node].children[k] ;
905  }
906  }
907  TreeMem[start_node].children[q]=-node ;
908  }
909  return -1 ;
910  }
911  if (num_used==0)
912  return 0 ;
913 
914  ret=compact_nodes(abs(TreeMem[start_node].children[q_used]), depth+1, weights) ;
915  if (ret<0)
916  return ret ;
917  return ret+1 ;
918 }
919 
920 
921  template <class Trie>
923  int32_t node, const CTrie<Trie> & other, int32_t other_node)
924 {
925  SG_DEBUG("checking nodes %i and %i\n", node, other_node)
926  if (fabs(TreeMem[node].weight-other.TreeMem[other_node].weight)>=1e-5)
927  {
928  SG_DEBUG("CTrie::compare: TreeMem[%i].weight=%f!=other.TreeMem[%i].weight=%f\n", node, TreeMem[node].weight, other_node,other.TreeMem[other_node].weight)
929  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
930  display_node(node) ;
931  SG_DEBUG("============================================================\n")
932  other.display_node(other_node) ;
933  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
934  return false ;
935  }
936 
937 #ifdef TRIE_CHECK_EVERYTHING
938  if (TreeMem[node].has_seq!=other.TreeMem[other_node].has_seq)
939  {
940  SG_DEBUG("CTrie::compare: TreeMem[%i].has_seq=%i!=other.TreeMem[%i].has_seq=%i\n", node, TreeMem[node].has_seq, other_node,other.TreeMem[other_node].has_seq)
941  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
942  display_node(node) ;
943  SG_DEBUG("============================================================\n")
944  other.display_node(other_node) ;
945  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
946  return false ;
947  }
948  if (TreeMem[node].has_floats!=other.TreeMem[other_node].has_floats)
949  {
950  SG_DEBUG("CTrie::compare: TreeMem[%i].has_floats=%i!=other.TreeMem[%i].has_floats=%i\n", node, TreeMem[node].has_floats, other_node, other.TreeMem[other_node].has_floats)
951  return false ;
952  }
953  if (other.TreeMem[other_node].has_floats)
954  {
955  for (int32_t q=0; q<4; q++)
956  if (fabs(TreeMem[node].child_weights[q]-other.TreeMem[other_node].child_weights[q])>1e-5)
957  {
958  SG_DEBUG("CTrie::compare: TreeMem[%i].child_weights[%i]=%e!=other.TreeMem[%i].child_weights[%i]=%e\n", node, q,TreeMem[node].child_weights[q], other_node,q,other.TreeMem[other_node].child_weights[q])
959  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
960  display_node(node) ;
961  SG_DEBUG("============================================================\n")
962  other.display_node(other_node) ;
963  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
964  return false ;
965  }
966  }
967  if (other.TreeMem[other_node].has_seq)
968  {
969  for (int32_t q=0; q<16; q++)
970  if ((TreeMem[node].seq[q]!=other.TreeMem[other_node].seq[q]) && ((TreeMem[node].seq[q]<4)||(other.TreeMem[other_node].seq[q]<4)))
971  {
972  SG_DEBUG("CTrie::compare: TreeMem[%i].seq[%i]=%i!=other.TreeMem[%i].seq[%i]=%i\n", node,q,TreeMem[node].seq[q], other_node,q,other.TreeMem[other_node].seq[q])
973  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
974  display_node(node) ;
975  SG_DEBUG("============================================================\n")
976  other.display_node(other_node) ;
977  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
978  return false ;
979  }
980  }
981  if (!other.TreeMem[other_node].has_seq && !other.TreeMem[other_node].has_floats)
982  {
983  for (int32_t q=0; q<4; q++)
984  {
985  if ((TreeMem[node].children[q]==NO_CHILD) && (other.TreeMem[other_node].children[q]==NO_CHILD))
986  continue ;
987  if ((TreeMem[node].children[q]==NO_CHILD)!=(other.TreeMem[other_node].children[q]==NO_CHILD))
988  {
989  SG_DEBUG("CTrie::compare: TreeMem[%i].children[%i]=%i!=other.TreeMem[%i].children[%i]=%i\n", node,q,TreeMem[node].children[q], other_node,q,other.TreeMem[other_node].children[q])
990  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
991  display_node(node) ;
992  SG_DEBUG("============================================================\n")
993  other.display_node(other_node) ;
994  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
995  return false ;
996  }
997  if (!compare_traverse(abs(TreeMem[node].children[q]), other, abs(other.TreeMem[other_node].children[q])))
998  return false ;
999  }
1000  }
1001 #else
1002  SG_ERROR("not implemented\n")
1003 #endif
1004 
1005  return true ;
1006 }
1007 
1008  template <class Trie>
1010 {
1011  bool ret=true ;
1012  for (int32_t i=0; i<length; i++)
1013  if (!compare_traverse(trees[i], other, other.trees[i]))
1014  return false ;
1015  else
1016  SG_DEBUG("two tries at %i identical\n", i)
1017 
1018  return ret ;
1019 }
1020 
1021 template <class Trie>
1023  int32_t node, int32_t * trace, int32_t& trace_len) const
1024 {
1025 #ifdef TRIE_CHECK_EVERYTHING
1026  ASSERT(trace_len-1>=0)
1027  ASSERT((trace[trace_len-1]>=0) && (trace[trace_len-1]<TreeMemPtrMax))
1028  if (TreeMem[trace[trace_len-1]].has_seq)
1029  return false ;
1030  if (TreeMem[trace[trace_len-1]].has_floats)
1031  return false ;
1032 
1033  for (int32_t q=0; q<4; q++)
1034  {
1035  if (TreeMem[trace[trace_len-1]].children[q]==NO_CHILD)
1036  continue ;
1037  int32_t tl=trace_len+1 ;
1038  if (TreeMem[trace[trace_len-1]].children[q]>=0)
1039  trace[trace_len]=TreeMem[trace[trace_len-1]].children[q] ;
1040  else
1041  trace[trace_len]=-TreeMem[trace[trace_len-1]].children[q] ;
1042 
1043  if (trace[trace_len]==node)
1044  {
1045  trace_len=tl ;
1046  return true ;
1047  }
1048  if (find_node(node, trace, tl))
1049  {
1050  trace_len=tl ;
1051  return true ;
1052  }
1053  }
1054  trace_len=0 ;
1055  return false ;
1056 #else
1057  SG_ERROR("not implemented\n")
1058  return false ;
1059 #endif
1060 }
1061 
1062 template <class Trie>
1063 void CTrie<Trie>::display_node(int32_t node) const
1064 {
1065 #ifdef TRIE_CHECK_EVERYTHING
1066  int32_t * trace=SG_MALLOC(int32_t, 2*degree);
1067  int32_t trace_len=-1 ;
1068  bool found = false ;
1069  int32_t tree=-1 ;
1070  for (tree=0; tree<length; tree++)
1071  {
1072  trace[0]=trees[tree] ;
1073  trace_len=1 ;
1074  found=find_node(node, trace, trace_len) ;
1075  if (found)
1076  break ;
1077  }
1078  ASSERT(found)
1079  SG_PRINT("position %i trace: ", tree)
1080 
1081  for (int32_t i=0; i<trace_len-1; i++)
1082  {
1083  int32_t branch=-1 ;
1084  for (int32_t q=0; q<4; q++)
1085  if (abs(TreeMem[trace[i]].children[q])==trace[i+1])
1086  {
1087  branch=q;
1088  break ;
1089  }
1090  ASSERT(branch!=-1)
1091  char acgt[5]="ACGT" ;
1092  SG_PRINT("%c", acgt[branch])
1093  }
1094  SG_PRINT("\nnode=%i\nweight=%f\nhas_seq=%i\nhas_floats=%i\n", node, TreeMem[node].weight, TreeMem[node].has_seq, TreeMem[node].has_floats)
1095  if (TreeMem[node].has_floats)
1096  {
1097  for (int32_t q=0; q<4; q++)
1098  SG_PRINT("child_weighs[%i] = %f\n", q, TreeMem[node].child_weights[q])
1099  }
1100  if (TreeMem[node].has_seq)
1101  {
1102  for (int32_t q=0; q<16; q++)
1103  SG_PRINT("seq[%i] = %i\n", q, TreeMem[node].seq[q])
1104  }
1105  if (!TreeMem[node].has_seq && !TreeMem[node].has_floats)
1106  {
1107  for (int32_t q=0; q<4; q++)
1108  {
1109  if (TreeMem[node].children[q]!=NO_CHILD)
1110  {
1111  SG_PRINT("children[%i] = %i -> \n", q, TreeMem[node].children[q])
1112  display_node(abs(TreeMem[node].children[q])) ;
1113  }
1114  else
1115  SG_PRINT("children[%i] = NO_CHILD -| \n", q, TreeMem[node].children[q])
1116  }
1117 
1118  }
1119 
1120  SG_FREE(trace);
1121 #else
1122  SG_ERROR("not implemented\n")
1123 #endif
1124 }
1125 
1126 
1127 template <class Trie> CTrie<Trie>::~CTrie()
1128 {
1129  destroy() ;
1130 
1131  SG_FREE(TreeMem) ;
1132 }
1133 
1134 template <class Trie> void CTrie<Trie>::destroy()
1135 {
1136  if (trees!=NULL)
1137  {
1138  delete_trees();
1139  for (int32_t i=0; i<length; i++)
1140  trees[i] = NO_CHILD;
1141  SG_FREE(trees);
1142 
1143  TreeMemPtr=0;
1144  length=0;
1145  trees=NULL;
1146  }
1147 }
1148 
1149 template <class Trie> void CTrie<Trie>::set_degree(int32_t d)
1150 {
1151  delete_trees(get_use_compact_terminal_nodes());
1152  degree=d;
1153 }
1154 
1155 template <class Trie> void CTrie<Trie>::create(
1156  int32_t len, bool p_use_compact_terminal_nodes)
1157 {
1158  destroy();
1159 
1160  trees=SG_MALLOC(int32_t, len);
1161  TreeMemPtr=0 ;
1162  for (int32_t i=0; i<len; i++)
1163  trees[i]=get_node(degree==1);
1164  length = len ;
1165 
1166  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
1167 }
1168 
1169 
1170 template <class Trie> void CTrie<Trie>::delete_trees(
1171  bool p_use_compact_terminal_nodes)
1172 {
1173  if (trees==NULL)
1174  return;
1175 
1176  TreeMemPtr=0 ;
1177  for (int32_t i=0; i<length; i++)
1178  trees[i]=get_node(degree==1);
1179 
1180  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
1181 }
1182 
1183  template <class Trie>
1185 {
1186  float64_t ret=0 ;
1187 
1188  if (tree==NO_CHILD)
1189  return 0 ;
1190  TRIE_ASSERT(tree>=0)
1191 
1192  if (depth==degree-2)
1193  {
1194  ret+=(TreeMem[tree].weight) ;
1195 
1196  for (int32_t k=0; k<4; k++)
1197  ret+=(TreeMem[tree].child_weights[k]) ;
1198 
1199  return ret ;
1200  }
1201 
1202  ret+=(TreeMem[tree].weight) ;
1203 
1204  for (int32_t i=0; i<4; i++)
1205  if (TreeMem[tree].children[i]!=NO_CHILD)
1206  ret += compute_abs_weights_tree(TreeMem[tree].children[i], depth+1) ;
1207 
1208  return ret ;
1209 }
1210 
1211 
1212  template <class Trie>
1214 {
1215  float64_t * sum=SG_MALLOC(float64_t, length*4);
1216  for (int32_t i=0; i<length*4; i++)
1217  sum[i]=0 ;
1218  len=length ;
1219 
1220  for (int32_t i=0; i<length; i++)
1221  {
1222  TRIE_ASSERT(trees[i]!=NO_CHILD)
1223  for (int32_t k=0; k<4; k++)
1224  {
1225  sum[i*4+k]=compute_abs_weights_tree(TreeMem[trees[i]].children[k], 0) ;
1226  }
1227  }
1228 
1229  return sum ;
1230 }
1231 
1232  template <class Trie>
1234  int32_t tree, int32_t i, float64_t alpha,
1235  int32_t *vec, int32_t len_rem,
1236  int32_t degree_rec, int32_t mismatch_rec,
1237  int32_t max_mismatch, float64_t * weights)
1238 {
1239  if (tree==NO_CHILD)
1240  tree=trees[i] ;
1241  TRIE_ASSERT(tree!=NO_CHILD)
1242 
1243  if ((len_rem<=0) || (mismatch_rec>max_mismatch) || (degree_rec>degree))
1244  return ;
1245  const int32_t other[4][3] = { {1,2,3},{0,2,3},{0,1,3},{0,1,2} } ;
1246 
1247  int32_t subtree = NO_CHILD ;
1248 
1249  if (degree_rec==degree-1)
1250  {
1251  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats)
1252  if (weights_in_tree)
1253  TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec];
1254  else
1255  if (weights[degree_rec]!=0.0)
1256  TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
1257  if (mismatch_rec+1<=max_mismatch)
1258  for (int32_t o=0; o<3; o++)
1259  {
1260  if (weights_in_tree)
1261  TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
1262  else
1263  if (weights[degree_rec]!=0.0)
1264  TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
1265  }
1266  return ;
1267  }
1268  else
1269  {
1270  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1271  if (TreeMem[tree].children[vec[0]]!=NO_CHILD)
1272  {
1273  subtree=TreeMem[tree].children[vec[0]] ;
1274  if (weights_in_tree)
1275  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec];
1276  else
1277  if (weights[degree_rec]!=0.0)
1278  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
1279  }
1280  else
1281  {
1282  int32_t tmp = get_node(degree_rec==degree-2);
1283  ASSERT(tmp>=0)
1284  TreeMem[tree].children[vec[0]]=tmp ;
1285  subtree=tmp ;
1286 #ifdef TRIE_CHECK_EVERYTHING
1287  if (degree_rec==degree-2)
1288  TreeMem[subtree].has_floats=true ;
1289 #endif
1290  if (weights_in_tree)
1291  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec] ;
1292  else
1293  if (weights[degree_rec]!=0.0)
1294  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec] ;
1295  else
1296  TreeMem[subtree].weight = 0.0 ;
1297  }
1298  add_example_to_tree_mismatch_recursion(subtree, i, alpha,
1299  &vec[1], len_rem-1,
1300  degree_rec+1, mismatch_rec, max_mismatch, weights) ;
1301 
1302  if (mismatch_rec+1<=max_mismatch)
1303  {
1304  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1305  for (int32_t o=0; o<3; o++)
1306  {
1307  int32_t ot = other[vec[0]][o] ;
1308  if (TreeMem[tree].children[ot]!=NO_CHILD)
1309  {
1310  subtree=TreeMem[tree].children[ot] ;
1311  if (weights_in_tree)
1312  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
1313  else
1314  if (weights[degree_rec]!=0.0)
1315  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
1316  }
1317  else
1318  {
1319  int32_t tmp = get_node(degree_rec==degree-2);
1320  ASSERT(tmp>=0)
1321  TreeMem[tree].children[ot]=tmp ;
1322  subtree=tmp ;
1323 #ifdef TRIE_CHECK_EVERYTHING
1324  if (degree_rec==degree-2)
1325  TreeMem[subtree].has_floats=true ;
1326 #endif
1327 
1328  if (weights_in_tree)
1329  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)] ;
1330  else
1331  if (weights[degree_rec]!=0.0)
1332  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec] ;
1333  else
1334  TreeMem[subtree].weight = 0.0 ;
1335  }
1336 
1337  add_example_to_tree_mismatch_recursion(subtree, i, alpha,
1338  &vec[1], len_rem-1,
1339  degree_rec+1, mismatch_rec+1, max_mismatch, weights) ;
1340  }
1341  }
1342  }
1343 }
1344 
1345  template <class Trie>
1347  int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
1348  int32_t max_degree, int32_t num_feat, int32_t num_sym, int32_t sym_offset,
1349  int32_t offs, float64_t* result)
1350 {
1351  if (i+j<num_feat)
1352  {
1353  float64_t decay=1.0; //no decay by default
1354  //if (j>d)
1355  // decay=pow(0.5,j); //marginalize out lower order matches
1356 
1357  if (j<degree-1)
1358  {
1359  for (int32_t k=0; k<num_sym; k++)
1360  {
1361  if (TreeMem[tree].children[k]!=NO_CHILD)
1362  {
1363  int32_t child=TreeMem[tree].children[k];
1364  //continue recursion if not yet at max_degree, else add to result
1365  if (d<max_degree-1)
1366  compute_scoring_helper(child, i, j+1, weight+decay*TreeMem[child].weight, d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
1367  else
1368  result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[child].weight;
1369 
1371  if (d==0)
1372  compute_scoring_helper(child, i, j+1, 0.0, 0, max_degree, num_feat, num_sym, sym_offset, offs, result);
1373  }
1374  }
1375  }
1376  else if (j==degree-1)
1377  {
1378  for (int32_t k=0; k<num_sym; k++)
1379  {
1380  //continue recursion if not yet at max_degree, else add to result
1381  if (d<max_degree-1 && i<num_feat-1)
1382  compute_scoring_helper(trees[i+1], i+1, 0, weight+decay*TreeMem[tree].child_weights[k], d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
1383  else
1384  result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[tree].child_weights[k];
1385  }
1386  }
1387  }
1388 }
1389 
1390  template <class Trie>
1392  int32_t tree, const int32_t p, struct TreeParseInfo info,
1393  const int32_t depth, int32_t* const x, const int32_t k)
1394 {
1395  const int32_t num_sym = info.num_sym;
1396  const int32_t y0 = info.y0;
1397  const int32_t y1 = (k==0) ? 0 : y0 - ( (depth<k) ? 0 : info.nofsKmers[k-1] * x[depth-k] );
1398  //const int32_t temp = info.substrs[depth]*num_sym - ( (depth<=k) ? 0 : info.nofsKmers[k] * x[depth-k-1] );
1399  //if( !( info.y0 == temp ) ) {
1400  // printf( "\n temp=%d y0=%d k=%d depth=%d \n", temp, info.y0, k, depth );
1401  //}
1402  //ASSERT( info.y0 == temp )
1403  int32_t sym;
1404  ASSERT( depth < degree )
1405  //ASSERT( 0 <= info.substrs[depth] && info.substrs[depth] < info.nofsKmers[k] )
1406  if (depth<degree-1)
1407  {
1408  for( sym=0; sym<num_sym; ++sym ) {
1409  const int32_t childNum = TreeMem[tree].children[ sym ];
1410  if( childNum != NO_CHILD ) {
1411  int32_t child = childNum ;
1412  x[depth] = sym;
1413  info.substrs[depth+1] = y0 + sym;
1414  info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
1415  //ASSERT( info.y0 == ( info.substrs[depth+1]*num_sym - ( (depth<k) ? 0 : info.nofsKmers[k] * x[depth-k] ) ) )
1416  count( TreeMem[child].weight, depth, info, p, x, k );
1417  traverse( child, p, info, depth+1, x, k );
1418  x[depth] = -1;
1419  }
1420  }
1421  }
1422  else if( depth == degree-1 )
1423  {
1424  for( sym=0; sym<num_sym; ++sym ) {
1425  const float64_t w = TreeMem[tree].child_weights[ sym ];
1426  if( w != 0.0 ) {
1427  x[depth] = sym;
1428  info.substrs[depth+1] = y0 + sym;
1429  info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
1430  //ASSERT( info.y0 == ( info.substrs[depth+1]*num_sym - ( (depth<k) ? 0 : info.nofsKmers[k] * x[depth-k] ) ) )
1431  count( w, depth, info, p, x, k );
1432  x[depth] = -1;
1433  }
1434  }
1435  }
1436  //info.substrs[depth+1] = -1;
1437  //info.y0 = temp;
1438 }
1439 
1440  template <class Trie>
1442  const float64_t w, const int32_t depth, const struct TreeParseInfo info,
1443  const int32_t p, int32_t* x, const int32_t k)
1444 {
1445  ASSERT( fabs(w) < 1e10 )
1446  ASSERT( x[depth] >= 0 )
1447  ASSERT( x[depth+1] < 0 )
1448  if ( depth < k ) {
1449  return;
1450  }
1451  //ASSERT( info.margFactors[ depth-k ] == pow( 0.25, depth-k ) )
1452  const int32_t nofKmers = info.nofsKmers[k];
1453  const float64_t margWeight = w * info.margFactors[ depth-k ];
1454  const int32_t m_a = depth - k + 1;
1455  const int32_t m_b = info.num_feat - p;
1456  const int32_t m = ( m_a < m_b ) ? m_a : m_b;
1457  // all proper k-substrings
1458  const int32_t offset0 = nofKmers * p;
1459  register int32_t i;
1460  register int32_t offset;
1461  offset = offset0;
1462  for( i = 0; i < m; ++i ) {
1463  const int32_t y = info.substrs[i+k+1];
1464  info.C_k[ y + offset ] += margWeight;
1465  offset += nofKmers;
1466  }
1467  if( depth > k ) {
1468  // k-prefix
1469  const int32_t offsR = info.substrs[k+1] + offset0;
1470  info.R_k[offsR] += margWeight;
1471  // k-suffix
1472  if( p+depth-k < info.num_feat ) {
1473  const int32_t offsL = info.substrs[depth+1] + nofKmers * (p+depth-k);
1474  info.L_k[offsL] += margWeight;
1475  }
1476  }
1477  // # N.x = substring represented by N
1478  // # N.d = length of N.x
1479  // # N.s = starting position of N.x
1480  // # N.w = weight for feature represented by N
1481  // if( N.d >= k )
1482  // margContrib = w / 4^(N.d-k)
1483  // for i = 1 to (N.d-k+1)
1484  // y = N.x[i:(i+k-1)] # overlapped k-mer
1485  // C_k[ N.s+i-1, y ] += margContrib
1486  // end;
1487  // if( N.d > k )
1488  // L_k[ N.s+N.d-k, N.x[N.d-k+(1:k)] ] += margContrib # j-suffix of N.x
1489  // R_k[ N.s, N.x[1:k] ] += margContrib # j-prefix of N.x
1490  // end;
1491  // end;
1492 }
1493 
1494  template <class Trie>
1496  int32_t i, int32_t seq_offset, int32_t * vec, float32_t alpha,
1497  float64_t *weights, bool degree_times_position_weights)
1498 {
1499  int32_t tree = trees[i] ;
1500  //ASSERT(seq_offset==0)
1501 
1502  int32_t max_depth = 0 ;
1503  float64_t* weights_column ;
1504  if (degree_times_position_weights)
1505  weights_column = &weights[(i+seq_offset)*degree] ;
1506  else
1507  weights_column = weights ;
1508 
1509  if (weights_in_tree)
1510  {
1511  for (int32_t j=0; (j<degree) && (i+j<length); j++)
1512  if (CMath::abs(weights_column[j]*alpha)>0)
1513  max_depth = j+1 ;
1514  }
1515  else
1516  // don't use the weights
1517  max_depth=degree ;
1518 
1519  for (int32_t j=0; (j<max_depth) && (i+j+seq_offset<length); j++)
1520  {
1521  TRIE_ASSERT((vec[i+j+seq_offset]>=0) && (vec[i+j+seq_offset]<4))
1522  if ((j<degree-1) && (TreeMem[tree].children[vec[i+j+seq_offset]]!=NO_CHILD))
1523  {
1524  if (TreeMem[tree].children[vec[i+j+seq_offset]]<0)
1525  {
1526  // special treatment of the next nodes
1527  TRIE_ASSERT(j >= degree-16)
1528  // get the right element
1529  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1530  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1531  int32_t node = - TreeMem[tree].children[vec[i+j+seq_offset]] ;
1532 
1533  TRIE_ASSERT((node>=0) && (node<=TreeMemPtrMax))
1534  TRIE_ASSERT_EVERYTHING(TreeMem[node].has_seq)
1535  TRIE_ASSERT_EVERYTHING(!TreeMem[node].has_floats)
1536 
1537  // check whether the same string is stored
1538  int32_t mismatch_pos = -1 ;
1539  {
1540  int32_t k ;
1541  for (k=0; (j+k<max_depth) && (i+j+seq_offset+k<length); k++)
1542  {
1543  TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4))
1544  // ###
1545  if ((TreeMem[node].seq[k]>=4) && (TreeMem[node].seq[k]!=TRIE_TERMINAL_CHARACTER))
1546  fprintf(stderr, "+++i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[k]) ;
1547  TRIE_ASSERT((TreeMem[node].seq[k]<4) || (TreeMem[node].seq[k]==TRIE_TERMINAL_CHARACTER))
1548  TRIE_ASSERT(k<16)
1549  if (TreeMem[node].seq[k]!=vec[i+j+seq_offset+k])
1550  {
1551  mismatch_pos=k ;
1552  break ;
1553  }
1554  }
1555  }
1556 
1557  // what happens when the .seq sequence is longer than vec? should we branch???
1558 
1559  if (mismatch_pos==-1)
1560  // if so, then just increase the weight by alpha and stop
1561  TreeMem[node].weight+=alpha ;
1562  else
1563  // otherwise
1564  // 1. replace current node with new node
1565  // 2. create new nodes until mismatching positon
1566  // 2. add a branch with old string (old node) and the new string (new node)
1567  {
1568  // replace old node
1569  int32_t last_node=tree ;
1570 
1571  // create new nodes until mismatch
1572  int32_t k ;
1573  for (k=0; k<mismatch_pos; k++)
1574  {
1575  TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4))
1576  TRIE_ASSERT(vec[i+j+seq_offset+k]==TreeMem[node].seq[k])
1577 
1578  int32_t tmp=get_node();
1579  TreeMem[last_node].children[vec[i+j+seq_offset+k]]=tmp ;
1580  last_node=tmp ;
1581  if (weights_in_tree)
1582  TreeMem[last_node].weight = (TreeMem[node].weight+alpha)*weights_column[j+k] ;
1583  else
1584  TreeMem[last_node].weight = (TreeMem[node].weight+alpha) ;
1585  TRIE_ASSERT(j+k!=degree-1)
1586  }
1587  if ((TreeMem[node].seq[mismatch_pos]>=4) && (TreeMem[node].seq[mismatch_pos]!=TRIE_TERMINAL_CHARACTER))
1588  fprintf(stderr, "**i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[mismatch_pos]) ;
1589  ASSERT((TreeMem[node].seq[mismatch_pos]<4) || (TreeMem[node].seq[mismatch_pos]==TRIE_TERMINAL_CHARACTER))
1590  TRIE_ASSERT(vec[i+j+seq_offset+mismatch_pos]!=TreeMem[node].seq[mismatch_pos])
1591 
1592  if (j+k==degree-1)
1593  {
1594  // init child weights with zero if after dropping out
1595  // of the k<mismatch_pos loop we are one level below degree
1596  // (keep this even after get_node() change!)
1597  for (int32_t q=0; q<4; q++)
1598  TreeMem[last_node].child_weights[q]=0.0 ;
1599  if (weights_in_tree)
1600  {
1601  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1602  TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]+=TreeMem[node].weight*weights_column[degree-1] ;
1603  TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] += alpha*weights_column[degree-1] ;
1604  }
1605  else
1606  {
1607  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1608  TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]=TreeMem[node].weight ;
1609  TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] = alpha ;
1610  }
1611 
1612 #ifdef TRIE_CHECK_EVERYTHING
1613  TreeMem[last_node].has_floats=true ;
1614 #endif
1615  }
1616  else
1617  {
1618  // the branch for the existing string
1619  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1620  {
1621  TreeMem[last_node].children[TreeMem[node].seq[mismatch_pos]] = -node ;
1622 
1623  // move string by mismatch_pos positions
1624  for (int32_t q=0; q<16; q++)
1625  {
1626  if ((j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length))
1627  TreeMem[node].seq[q] = TreeMem[node].seq[q+mismatch_pos] ;
1628  else
1629  TreeMem[node].seq[q] = TRIE_TERMINAL_CHARACTER ;
1630  }
1631 #ifdef TRIE_CHECK_EVERYTHING
1632  TreeMem[node].has_seq=true ;
1633 #endif
1634  }
1635 
1636  // the new branch
1637  TRIE_ASSERT((vec[i+j+seq_offset+mismatch_pos]>=0) && (vec[i+j+seq_offset+mismatch_pos]<4))
1638  int32_t tmp = get_node() ;
1639  TreeMem[last_node].children[vec[i+j+seq_offset+mismatch_pos]] = -tmp ;
1640  last_node=tmp ;
1641 
1642  TreeMem[last_node].weight = alpha ;
1643 #ifdef TRIE_CHECK_EVERYTHING
1644  TreeMem[last_node].has_seq = true ;
1645 #endif
1646  memset(TreeMem[last_node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
1647  for (int32_t q=0; (j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length); q++)
1648  TreeMem[last_node].seq[q] = vec[i+j+seq_offset+mismatch_pos+q] ;
1649  }
1650  }
1651  break ;
1652  }
1653  else
1654  {
1655  tree=TreeMem[tree].children[vec[i+j+seq_offset]] ;
1656  TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax))
1657  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1658  if (weights_in_tree)
1659  TreeMem[tree].weight += alpha*weights_column[j];
1660  else
1661  TreeMem[tree].weight += alpha ;
1662  }
1663  }
1664  else if (j==degree-1)
1665  {
1666  // special treatment of the last node
1667  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1668  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats)
1669  if (weights_in_tree)
1670  TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha*weights_column[j] ;
1671  else
1672  TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha;
1673 
1674  break;
1675  }
1676  else
1677  {
1678  bool use_seq = use_compact_terminal_nodes && (j>degree-16) ;
1679  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1680  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1681 
1682  int32_t tmp = get_node((j==degree-2) && (!use_seq));
1683  if (use_seq)
1684  TreeMem[tree].children[vec[i+j+seq_offset]] = -tmp ;
1685  else
1686  TreeMem[tree].children[vec[i+j+seq_offset]] = tmp ;
1687  tree=tmp ;
1688 
1689  TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax))
1690 #ifdef TRIE_CHECK_EVERYTHING
1691  TreeMem[tree].has_seq = use_seq ;
1692 #endif
1693  if (use_seq)
1694  {
1695  TreeMem[tree].weight = alpha ;
1696  // important to have the terminal characters (see ###)
1697  memset(TreeMem[tree].seq, TRIE_TERMINAL_CHARACTER, 16) ;
1698  for (int32_t q=0; (j+q<degree) && (i+j+seq_offset+q<length); q++)
1699  {
1700  TRIE_ASSERT(q<16)
1701  TreeMem[tree].seq[q]=vec[i+j+seq_offset+q] ;
1702  }
1703  break ;
1704  }
1705  else
1706  {
1707  if (weights_in_tree)
1708  TreeMem[tree].weight = alpha*weights_column[j] ;
1709  else
1710  TreeMem[tree].weight = alpha ;
1711 #ifdef TRIE_CHECK_EVERYTHING
1712  if (j==degree-2)
1713  TreeMem[tree].has_floats = true ;
1714 #endif
1715  }
1716  }
1717  }
1718 }
1719 
1720  template <class Trie>
1722  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
1723  int32_t weight_pos, float64_t* weights,
1724  bool degree_times_position_weights)
1725 {
1726  int32_t tree = trees[tree_pos] ;
1727 
1728  if ((position_weights!=NULL) && (position_weights[weight_pos]==0))
1729  return 0.0;
1730 
1731  float64_t *weights_column=NULL ;
1732  if (degree_times_position_weights)
1733  weights_column=&weights[weight_pos*degree] ;
1734  else // weights is a vector (1 x degree)
1735  weights_column=weights ;
1736 
1737  float64_t sum=0 ;
1738  for (int32_t j=0; seq_pos+j < len; j++)
1739  {
1740  TRIE_ASSERT((vec[seq_pos+j]<4) && (vec[seq_pos+j]>=0))
1741 
1742  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1743  {
1744  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1745  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1746  {
1747  tree = - TreeMem[tree].children[vec[seq_pos+j]];
1748  TRIE_ASSERT(tree>=0)
1749  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1750  float64_t this_weight=0.0 ;
1751  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1752  {
1753  TRIE_ASSERT((vec[seq_pos+j+k]<4) && (vec[seq_pos+j+k]>=0))
1754  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1755  break ;
1756  this_weight += weights_column[j+k] ;
1757  }
1758  sum += TreeMem[tree].weight * this_weight ;
1759  break ;
1760  }
1761  else
1762  {
1763  tree=TreeMem[tree].children[vec[seq_pos+j]];
1764  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1765  if (weights_in_tree)
1766  sum += TreeMem[tree].weight ;
1767  else
1768  sum += TreeMem[tree].weight * weights_column[j] ;
1769  } ;
1770  }
1771  else
1772  {
1773  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1774  if (j==degree-1)
1775  {
1776  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats)
1777  if (weights_in_tree)
1778  sum += TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1779  else
1780  sum += TreeMem[tree].child_weights[vec[seq_pos+j]] * weights_column[j] ;
1781  }
1782  else
1783  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1784 
1785  break;
1786  }
1787  }
1788 
1789  if (position_weights!=NULL)
1790  return sum*position_weights[weight_pos] ;
1791  else
1792  return sum ;
1793 }
1794 
1795  template <class Trie>
1797  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
1798  int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
1799  int32_t mkl_stepsize, float64_t * weights,
1800  bool degree_times_position_weights)
1801 {
1802  int32_t tree = trees[tree_pos] ;
1803  if (factor==0)
1804  return ;
1805 
1806  if (position_weights!=NULL)
1807  {
1808  factor *= position_weights[weight_pos] ;
1809  if (factor==0)
1810  return ;
1811  if (!degree_times_position_weights) // with position_weigths, weights is a vector (1 x degree)
1812  {
1813  for (int32_t j=0; seq_pos+j<len; j++)
1814  {
1815  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1816  {
1817  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1818  {
1819  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1820  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1821  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1822  {
1823  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1824  break ;
1825  if (weights_in_tree)
1826  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1827  else
1828  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
1829  }
1830  break ;
1831  }
1832  else
1833  {
1834  tree=TreeMem[tree].children[vec[seq_pos+j]];
1835  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1836  if (weights_in_tree)
1837  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1838  else
1839  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
1840  }
1841  }
1842  else
1843  {
1844  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1845  if (j==degree-1)
1846  {
1847  if (weights_in_tree)
1848  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1849  else
1850  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
1851  }
1852  }
1853  }
1854  }
1855  else // with position_weigths, weights is a matrix (len x degree)
1856  {
1857  for (int32_t j=0; seq_pos+j<len; j++)
1858  {
1859  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1860  {
1861  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1862  {
1863  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1864  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1865  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1866  {
1867  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1868  break ;
1869  if (weights_in_tree)
1870  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1871  else
1872  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
1873  }
1874  break ;
1875  }
1876  else
1877  {
1878  tree=TreeMem[tree].children[vec[seq_pos+j]];
1879  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1880  if (weights_in_tree)
1881  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1882  else
1883  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+weight_pos*degree] ;
1884  }
1885  }
1886  else
1887  {
1888  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1889  if (j==degree-1)
1890  {
1891  if (weights_in_tree)
1892  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1893  else
1894  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j+weight_pos*degree] ;
1895  }
1896  break ;
1897  }
1898  }
1899  }
1900  }
1901  else if (!degree_times_position_weights) // no position_weigths, weights is a vector (1 x degree)
1902  {
1903  for (int32_t j=0; seq_pos+j<len; j++)
1904  {
1905  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1906  {
1907  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1908  {
1909  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1910  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1911  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1912  {
1913  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1914  break ;
1915  if (weights_in_tree)
1916  LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight ;
1917  else
1918  LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
1919  }
1920  break ;
1921  }
1922  else
1923  {
1924  tree=TreeMem[tree].children[vec[seq_pos+j]];
1925  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1926  if (weights_in_tree)
1927  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight ;
1928  else
1929  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
1930  }
1931  }
1932  else
1933  {
1934  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1935  if (j==degree-1)
1936  {
1937  if (weights_in_tree)
1938  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1939  else
1940  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
1941  }
1942  break ;
1943  }
1944  }
1945  }
1946  else // no position_weigths, weights is a matrix (len x degree)
1947  {
1948  /*if (!position_mask)
1949  {
1950  position_mask = SG_MALLOC(bool, len);
1951  for (int32_t i=0; i<len; i++)
1952  {
1953  position_mask[i]=false ;
1954 
1955  for (int32_t j=0; j<degree; j++)
1956  if (weights[i*degree+j]!=0.0)
1957  {
1958  position_mask[i]=true ;
1959  break ;
1960  }
1961  }
1962  }
1963  if (position_mask[weight_pos]==0)
1964  return ;*/
1965 
1966  for (int32_t j=0; seq_pos+j<len; j++)
1967  {
1968  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1969  {
1970  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1971  {
1972  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1973  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1974  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1975  {
1976  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1977  break ;
1978  if (weights_in_tree)
1979  LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight ;
1980  else
1981  LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
1982  }
1983  break ;
1984  }
1985  else
1986  {
1987  tree=TreeMem[tree].children[vec[seq_pos+j]];
1988  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1989  if (weights_in_tree)
1990  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight ;
1991  else
1992  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight * weights[j+weight_pos*degree] ;
1993  }
1994  }
1995  else
1996  {
1997  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1998  if (j==degree-1)
1999  {
2000  if (weights_in_tree)
2001  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] ;
2002  else
2003  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] * weights[j+weight_pos*degree] ;
2004  }
2005  break ;
2006  }
2007  }
2008  }
2009 }
2010 
2011  template <class Trie>
2013  Trie* tree, int32_t depth, uint64_t seq, float64_t value,
2014  DynArray<ConsensusEntry>* table, float64_t* weights)
2015 {
2016  float64_t w=1.0;
2017 
2018  if (weights_in_tree || depth==0)
2019  value+=tree->weight;
2020  else
2021  {
2022  w=weights[degree-1];
2023  value+=weights[depth-1]*tree->weight;
2024  }
2025 
2026  if (degree-1==depth)
2027  {
2028  for (int32_t sym=0; sym<4; sym++)
2029  {
2030  float64_t v=w*tree->child_weights[sym];
2031  if (v!=0.0)
2032  {
2033  ConsensusEntry entry;
2034  entry.bt=-1;
2035  entry.score=value+v;
2036  entry.string=seq | ((uint64_t) sym) << (2*(degree-depth-1));
2037 
2038  table->append_element(entry);
2039  }
2040  }
2041  }
2042  else
2043  {
2044  for (int32_t sym=0; sym<4; sym++)
2045  {
2046  uint64_t str=seq | ((uint64_t) sym) << (2*(degree-depth-1));
2047  if (tree->children[sym] != NO_CHILD)
2048  fill_backtracking_table_recursion(&TreeMem[tree->children[sym]], depth+1, str, value, table, weights);
2049  }
2050  }
2051 }
2052 
2053  template <class Trie>
2055  int32_t pos, uint64_t seq, int32_t deg, float64_t* weights)
2056 {
2057  float64_t result=0.0;
2058 
2059  //SG_PRINT("pos:%i length:%i deg:%i seq:0x%0llx...\n", pos, length, deg, seq)
2060 
2061  for (int32_t i=pos; i<pos+deg && i<length; i++)
2062  {
2063  //SG_PRINT("loop %d\n", i)
2064  Trie* tree = &TreeMem[trees[i]];
2065 
2066  for (int32_t d=0; d<deg-i+pos; d++)
2067  {
2068  //SG_PRINT("loop degree %d shit: %d\n", d, (2*(deg-1-d-i+pos)))
2069  ASSERT(d-1<degree)
2070  int32_t sym = (int32_t) (seq >> (2*(deg-1-d-i+pos)) & 3);
2071 
2072  float64_t w=1.0;
2073  if (!weights_in_tree)
2074  w=weights[d];
2075 
2076  ASSERT(tree->children[sym] != NO_CHILD)
2077  tree=&TreeMem[tree->children[sym]];
2078  result+=w*tree->weight;
2079  }
2080  }
2081  //SG_PRINT("cum: %f\n", result)
2082  return result;
2083 }
2084 
2085  template <class Trie>
2087  int32_t pos, DynArray<ConsensusEntry>* prev,
2088  DynArray<ConsensusEntry>* cur, bool cumulative, float64_t* weights)
2089 {
2090  ASSERT(pos>=0 && pos<length)
2091  ASSERT(!use_compact_terminal_nodes)
2092 
2093  Trie* t = &TreeMem[trees[pos]];
2094 
2095  fill_backtracking_table_recursion(t, 0, (uint64_t) 0, 0.0, cur, weights);
2096 
2097 
2098  if (cumulative)
2099  {
2100  int32_t num_cur=cur->get_num_elements();
2101  for (int32_t i=0; i<num_cur; i++)
2102  {
2103  ConsensusEntry entry=cur->get_element(i);
2104  entry.score+=get_cumulative_score(pos+1, entry.string, degree-1, weights);
2105  cur->set_element(entry,i);
2106  //SG_PRINT("cum: str:0%0llx sc:%f bt:%d\n",entry.string,entry.score,entry.bt)
2107  }
2108  }
2109 
2110  //if previous tree exists find maximum scoring path
2111  //for each element in cur and update bt table
2112  if (prev)
2113  {
2114  int32_t num_cur=cur->get_num_elements();
2115  int32_t num_prev=prev->get_num_elements();
2116 
2117  for (int32_t i=0; i<num_cur; i++)
2118  {
2119  //uint64_t str_cur_old= cur->get_element(i).string;
2120  uint64_t str_cur= cur->get_element(i).string >> 2;
2121  //SG_PRINT("...cur:0x%0llx cur_noprfx:0x%0llx...\n", str_cur_old, str_cur)
2122 
2123  int32_t bt=-1;
2124  float64_t max_score=0.0;
2125 
2126  for (int32_t j=0; j<num_prev; j++)
2127  {
2128  //uint64_t str_prev_old= prev->get_element(j).string;
2129  uint64_t mask=
2130  ((((uint64_t)0)-1) ^ (((uint64_t) 3) << (2*(degree-1))));
2131  uint64_t str_prev= mask & prev->get_element(j).string;
2132  //SG_PRINT("...prev:0x%0llx prev_nosfx:0x%0llx mask:%0llx...\n", str_prev_old, str_prev,mask)
2133 
2134  if (str_cur == str_prev)
2135  {
2136  float64_t sc=prev->get_element(j).score+cur->get_element(i).score;
2137  if (bt==-1 || sc>max_score)
2138  {
2139  bt=j;
2140  max_score=sc;
2141 
2142  //SG_PRINT("new_max[%i,%i] = %f\n", j,i, max_score)
2143  }
2144  }
2145  }
2146 
2147  ASSERT(bt!=-1)
2148  ConsensusEntry entry;
2149  entry.bt=bt;
2150  entry.score=max_score;
2151  entry.string=cur->get_element(i).string;
2152  cur->set_element(entry, i);
2153  //SG_PRINT("entry[%d]: str:0%0llx sc:%f bt:%d\n",i, entry.string,entry.score,entry.bt)
2154  }
2155  }
2156 }
2157 }
2158 #endif // _TRIE_H___

SHOGUN Machine Learning Toolbox - Documentation