SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Trie.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2009 Gunnar Raetsch
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
12 #ifndef _TRIE_H___
13 #define _TRIE_H___
14 
15 #include <shogun/lib/config.h>
16 
17 #include <string.h>
18 #include <shogun/lib/common.h>
19 #include <shogun/io/SGIO.h>
20 #include <shogun/base/DynArray.h>
22 #include <shogun/base/SGObject.h>
23 
24 namespace shogun
25 {
26 #ifndef DOXYGEN_SHOULD_SKIP_THIS
27 
28 // sentinel is 0xFFFFFFFC or float -2
29 #define NO_CHILD ((int32_t)-1073741824)
30 
31 #define WEIGHTS_IN_TRIE
32 //#define TRIE_CHECK_EVERYTHING
33 
34 #ifdef TRIE_CHECK_EVERYTHING
35 #define TRIE_ASSERT_EVERYTHING(x) ASSERT(x)
36 #else
37 #define TRIE_ASSERT_EVERYTHING(x)
38 #endif
39 
40 //#define TRIE_ASSERT(x) ASSERT(x)
41 #define TRIE_ASSERT(x)
42 
43 #define TRIE_TERMINAL_CHARACTER 7
44 
46 struct ConsensusEntry
47 {
49  uint64_t string;
51  float32_t score;
53  int32_t bt;
54 };
55 
57 struct POIMTrie
58 {
60  float64_t weight;
61 #ifdef TRIE_CHECK_EVERYTHING
62 
63  bool has_seq;
65  bool has_floats;
66 #endif
67  union
68  {
70  float32_t child_weights[4];
72  int32_t children[4];
74  uint8_t seq[16] ;
75  };
76 
78  float64_t S;
80  float64_t L;
82  float64_t R;
83 };
84 
86 struct DNATrie
87 {
89  float64_t weight;
90 #ifdef TRIE_CHECK_EVERYTHING
91 
92  bool has_seq;
94  bool has_floats;
95 #endif
96  union
97  {
99  float32_t child_weights[4];
101  int32_t children[4];
103  uint8_t seq[16] ;
104  };
105 };
106 
108 struct TreeParseInfo {
110  int32_t num_sym;
112  int32_t num_feat;
114  int32_t p;
116  int32_t k;
118  int32_t* nofsKmers;
120  float64_t* margFactors;
122  int32_t* x;
124  int32_t* substrs;
126  int32_t y0;
128  float64_t* C_k;
130  float64_t* L_k;
132  float64_t* R_k;
133 };
134 
135 #endif // DOXYGEN_SHOULD_SKIP_THIS
136 
137 template <class Trie> class CTrie;
138 
139 #define IGNORE_IN_CLASSLIST
140 
158 IGNORE_IN_CLASSLIST template <class Trie> class CTrie : public CSGObject
159 {
160  public:
162  CTrie();
163 
170  CTrie(int32_t d, bool p_use_compact_terminal_nodes=true);
171 
173  CTrie(const CTrie & to_copy);
174  virtual ~CTrie();
175 
177  const CTrie & operator=(const CTrie & to_copy);
178 
186  bool compare_traverse(
187  int32_t node, const CTrie & other, int32_t other_node);
188 
194  bool compare(const CTrie & other);
195 
202  bool find_node(int32_t node, int32_t * trace, int32_t &trace_len) const;
203 
210  int32_t find_deepest_node(
211  int32_t start_node, int32_t &deepest_node) const;
212 
217  void display_node(int32_t node) const;
218 
220  void destroy();
221 
226  void set_degree(int32_t d);
227 
234  void create(int32_t len, bool p_use_compact_terminal_nodes=true);
235 
241  void delete_trees(bool p_use_compact_terminal_nodes=true);
242 
253  void add_to_trie(
254  int32_t i, int32_t seq_offset, int32_t* vec, float32_t alpha,
255  float64_t *weights, bool degree_times_position_weights);
256 
263  float64_t compute_abs_weights_tree(int32_t tree, int32_t depth);
264 
270  float64_t* compute_abs_weights(int32_t &len);
271 
285  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
286  int32_t weight_pos, float64_t * weights,
287  bool degree_times_position_weights) ;
288 
304  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
305  int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
306  int32_t mkl_stepsize, float64_t * weights,
307  bool degree_times_position_weights);
308 
324  int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
325  int32_t max_degree, int32_t num_feat, int32_t num_sym,
326  int32_t sym_offset, int32_t offs, float64_t* result);
327 
341  int32_t tree, int32_t i, float64_t alpha, int32_t *vec,
342  int32_t len_rem, int32_t degree_rec, int32_t mismatch_rec,
343  int32_t max_mismatch, float64_t * weights);
344 
354  void traverse(
355  int32_t tree, const int32_t p, struct TreeParseInfo info,
356  const int32_t depth, int32_t* const x, const int32_t k);
357 
367  void count(
368  const float64_t w, const int32_t depth,
369  const struct TreeParseInfo info, const int32_t p, int32_t* x,
370  const int32_t k);
371 
378  int32_t compact_nodes(int32_t start_node, int32_t depth, float64_t * weights);
379 
389  int32_t pos, uint64_t seq, int32_t deg, float64_t* weights);
390 
401  Trie* tree, int32_t depth, uint64_t seq, float64_t value,
402  DynArray<ConsensusEntry>* table, float64_t* weights);
403 
413  int32_t pos, DynArray<ConsensusEntry>* prev,
414  DynArray<ConsensusEntry>* cur, bool cumulative,
415  float64_t* weights);
416 
422  void POIMs_extract_W(float64_t* const* const W, const int32_t K);
423 
428  void POIMs_precalc_SLR(const float64_t* const distrib);
429 
439  void POIMs_get_SLR(
440  const int32_t parentIdx, const int32_t sym, const int32_t depth,
441  float64_t* S, float64_t* L, float64_t* R);
442 
449  void POIMs_add_SLR(
450  float64_t* const* const poims, const int32_t K,
451  const int32_t debug);
452 
458  {
460  }
461 
468  bool p_use_compact_terminal_nodes)
469  {
470  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
471  }
472 
477  inline int32_t get_num_used_nodes()
478  {
479  return TreeMemPtr;
480  }
481 
486  inline void set_position_weights(float64_t* p_position_weights)
487  {
488  position_weights=p_position_weights;
489  }
490 
495  inline int32_t get_node(bool last_node=false)
496  {
497  int32_t ret = TreeMemPtr++;
498  check_treemem() ;
499 
500  if (last_node)
501  {
502  for (int32_t q=0; q<4; q++)
503  TreeMem[ret].child_weights[q]=0.0;
504  }
505  else
506  {
507  for (int32_t q=0; q<4; q++)
508  TreeMem[ret].children[q]=NO_CHILD;
509  }
510 #ifdef TRIE_CHECK_EVERYTHING
511  TreeMem[ret].has_seq=false ;
512  TreeMem[ret].has_floats=false ;
513 #endif
514  TreeMem[ret].weight=0.0;
515  return ret ;
516  }
517 
519  inline void check_treemem()
520  {
521  if (TreeMemPtr+10 < TreeMemPtrMax)
522  return;
523  SG_DEBUG("Extending TreeMem from %i to %i elements\n",
524  TreeMemPtrMax, (int32_t) ((float64_t)TreeMemPtrMax*1.2));
525  int32_t old_sz=TreeMemPtrMax;
526  TreeMemPtrMax = (int32_t) ((float64_t)TreeMemPtrMax*1.2);
527  TreeMem = SG_REALLOC(Trie, TreeMem, old_sz, TreeMemPtrMax);
528  }
529 
534  inline void set_weights_in_tree(bool weights_in_tree_)
535  {
536  weights_in_tree = weights_in_tree_;
537  }
538 
543  inline bool get_weights_in_tree()
544  {
545  return weights_in_tree;
546  }
547 
558  const int32_t nodeIdx, const int32_t depth, const int32_t offset,
559  const int32_t y0, float64_t* const* const W, const int32_t K);
560 
574  const float64_t* const distrib, const int32_t i,
575  const int32_t nodeIdx, int32_t left_tries_idx[4],
576  const int32_t depth, int32_t const lastSym, float64_t* S,
577  float64_t* L, float64_t* R);
578 
579 
591  const float64_t* const distrib, const int32_t i,
592  const int32_t nodeIdx, int32_t left_tries_idx[4],
593  const int32_t depth, float64_t* S, float64_t* L, float64_t* R);
594 
606  const int32_t nodeIdx, const int32_t depth,const int32_t i,
607  const int32_t y0, float64_t* const* const poims, const int32_t K,
608  const int32_t debug);
609 
624  float64_t* const* const poims, const int32_t K, const int32_t k,
625  const int32_t i, const int32_t y, const float64_t valW,
626  const float64_t valS, const float64_t valL, const float64_t valR,
627  const int32_t debug);
628 
630  virtual const char* get_name() const { return "Trie"; }
631 
632  public:
634  int32_t NUM_SYMS;
635 
636  protected:
638  int32_t length;
640  int32_t * trees;
641 
643  int32_t degree;
646 
648  Trie* TreeMem;
650  int32_t TreeMemPtr;
652  int32_t TreeMemPtrMax;
655 
658 
660  int32_t* nofsKmers;
661 };
662  template <class Trie>
664  : CSGObject(), degree(0), position_weights(NULL),
665  use_compact_terminal_nodes(false),
666  weights_in_tree(true)
667  {
668 
669  TreeMemPtrMax=0;
670  TreeMemPtr=0;
671  TreeMem=NULL;
672 
673  length=0;
674  trees=NULL;
675 
676  NUM_SYMS=4;
677  }
678 
679  template <class Trie>
680  CTrie<Trie>::CTrie(int32_t d, bool p_use_compact_terminal_nodes)
681  : CSGObject(), degree(d), position_weights(NULL),
682  use_compact_terminal_nodes(p_use_compact_terminal_nodes),
683  weights_in_tree(true)
684  {
685  TreeMemPtrMax=1024*1024/sizeof(Trie);
686  TreeMemPtr=0;
687  TreeMem=SG_MALLOC(Trie, TreeMemPtrMax);
688 
689  length=0;
690  trees=NULL;
691 
692  NUM_SYMS=4;
693  }
694 
695  template <class Trie>
696  CTrie<Trie>::CTrie(const CTrie & to_copy)
697  : CSGObject(to_copy), degree(to_copy.degree), position_weights(NULL),
698  use_compact_terminal_nodes(to_copy.use_compact_terminal_nodes)
699  {
700  if (to_copy.position_weights!=NULL)
701  {
703  /*SG_MALLOC(float64_t, to_copy.length);
704  for (int32_t i=0; i<to_copy.length; i++)
705  position_weights[i]=to_copy.position_weights[i]; */
706  }
707  else
708  position_weights=NULL;
709 
711  TreeMemPtr=to_copy.TreeMemPtr;
712  TreeMem=SG_MALLOC(Trie, TreeMemPtrMax);
713  memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie));
714 
715  length=to_copy.length;
716  trees=SG_MALLOC(int32_t, length);
717  for (int32_t i=0; i<length; i++)
718  trees[i]=to_copy.trees[i];
719 
720  NUM_SYMS=4;
721  }
722 
723  template <class Trie>
725 {
726  degree=to_copy.degree ;
727  use_compact_terminal_nodes=to_copy.use_compact_terminal_nodes ;
728 
729  SG_FREE(position_weights);
730  position_weights=NULL ;
731  if (to_copy.position_weights!=NULL)
732  {
733  position_weights=to_copy.position_weights ;
734  /*position_weights = SG_MALLOC(float64_t, to_copy.length);
735  for (int32_t i=0; i<to_copy.length; i++)
736  position_weights[i]=to_copy.position_weights[i] ;*/
737  }
738  else
739  position_weights=NULL ;
740 
741  TreeMemPtrMax=to_copy.TreeMemPtrMax ;
742  TreeMemPtr=to_copy.TreeMemPtr ;
743  SG_FREE(TreeMem) ;
744  TreeMem = SG_MALLOC(Trie, TreeMemPtrMax);
745  memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie)) ;
746 
747  length = to_copy.length ;
748  if (trees)
749  SG_FREE(trees);
750  trees=SG_MALLOC(int32_t, length);
751  for (int32_t i=0; i<length; i++)
752  trees[i]=to_copy.trees[i] ;
753 
754  return *this ;
755 }
756 
757 template <class Trie>
759  int32_t start_node, int32_t& deepest_node) const
760 {
761 #ifdef TRIE_CHECK_EVERYTHING
762  int32_t ret=0 ;
763  SG_DEBUG("start_node=%i\n", start_node)
764 
765  if (start_node==NO_CHILD)
766  {
767  for (int32_t i=0; i<length; i++)
768  {
769  int32_t my_deepest_node ;
770  int32_t depth=find_deepest_node(i, my_deepest_node) ;
771  SG_DEBUG("start_node %i depth=%i\n", i, depth)
772  if (depth>ret)
773  {
774  deepest_node=my_deepest_node ;
775  ret=depth ;
776  }
777  }
778  return ret ;
779  }
780 
781  if (TreeMem[start_node].has_seq)
782  {
783  for (int32_t q=0; q<16; q++)
784  if (TreeMem[start_node].seq[q]!=TRIE_TERMINAL_CHARACTER)
785  ret++ ;
786  deepest_node=start_node ;
787  return ret ;
788  }
789  if (TreeMem[start_node].has_floats)
790  {
791  deepest_node=start_node ;
792  return 1 ;
793  }
794 
795  for (int32_t q=0; q<4; q++)
796  {
797  int32_t my_deepest_node ;
798  if (TreeMem[start_node].children[q]==NO_CHILD)
799  continue ;
800  int32_t depth=find_deepest_node(abs(TreeMem[start_node].children[q]), my_deepest_node) ;
801  if (depth>ret)
802  {
803  deepest_node=my_deepest_node ;
804  ret=depth ;
805  }
806  }
807  return ret ;
808 #else
809  SG_ERROR("not implemented\n")
810  return 0 ;
811 #endif
812 }
813 
814  template <class Trie>
816  int32_t start_node, int32_t depth, float64_t * weights)
817 {
818  SG_ERROR("code buggy\n")
819 
820  int32_t ret=0 ;
821 
822  if (start_node==NO_CHILD)
823  {
824  for (int32_t i=0; i<length; i++)
825  compact_nodes(i,1, weights) ;
826  return 0 ;
827  }
828  if (start_node<0)
829  return -1 ;
830 
831  if (depth==degree-1)
832  {
833  TRIE_ASSERT_EVERYTHING(TreeMem[start_node].has_floats)
834  int32_t num_used=0 ;
835  for (int32_t q=0; q<4; q++)
836  if (TreeMem[start_node].child_weights[q]!=0.0)
837  num_used++ ;
838  if (num_used>1)
839  return -1 ;
840  return 1 ;
841  }
842  TRIE_ASSERT_EVERYTHING(!TreeMem[start_node].has_floats)
843 
844  int32_t num_used = 0 ;
845  int32_t q_used=-1 ;
846 
847  for (int32_t q=0; q<4; q++)
848  {
849  if (TreeMem[start_node].children[q]==NO_CHILD)
850  continue ;
851  num_used++ ;
852  q_used=q ;
853  }
854  if (num_used>1)
855  {
856  if (depth>=degree-2)
857  return -1 ;
858  for (int32_t q=0; q<4; q++)
859  {
860  if (TreeMem[start_node].children[q]==NO_CHILD)
861  continue ;
862  int32_t num=compact_nodes(abs(TreeMem[start_node].children[q]), depth+1, weights) ;
863  if (num<=2)
864  continue ;
865  int32_t node=get_node() ;
866 
867  int32_t last_node=TreeMem[start_node].children[q] ;
868  if (weights_in_tree)
869  {
870  ASSERT(weights[depth]!=0.0)
871  TreeMem[node].weight=TreeMem[last_node].weight/weights[depth] ;
872  }
873  else
874  TreeMem[node].weight=TreeMem[last_node].weight ;
875 
876 #ifdef TRIE_CHECK_EVERYTHING
877  TreeMem[node].has_seq=true ;
878 #endif
879  memset(TreeMem[node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
880  for (int32_t n=0; n<num; n++)
881  {
882  ASSERT(depth+n+1<=degree-1)
883  ASSERT(last_node!=NO_CHILD)
884  if (depth+n+1==degree-1)
885  {
886  TRIE_ASSERT_EVERYTHING(TreeMem[last_node].has_floats)
887  int32_t k ;
888  for (k=0; k<4; k++)
889  if (TreeMem[last_node].child_weights[k]!=0.0)
890  break ;
891  if (k==4)
892  break ;
893  TreeMem[node].seq[n]=k ;
894  break ;
895  }
896  else
897  {
898  TRIE_ASSERT_EVERYTHING(!TreeMem[last_node].has_floats)
899  int32_t k ;
900  for (k=0; k<4; k++)
901  if (TreeMem[last_node].children[k]!=NO_CHILD)
902  break ;
903  if (k==4)
904  break ;
905  TreeMem[node].seq[n]=k ;
906  last_node=TreeMem[last_node].children[k] ;
907  }
908  }
909  TreeMem[start_node].children[q]=-node ;
910  }
911  return -1 ;
912  }
913  if (num_used==0)
914  return 0 ;
915 
916  ret=compact_nodes(abs(TreeMem[start_node].children[q_used]), depth+1, weights) ;
917  if (ret<0)
918  return ret ;
919  return ret+1 ;
920 }
921 
922 
923  template <class Trie>
925  int32_t node, const CTrie<Trie> & other, int32_t other_node)
926 {
927  SG_DEBUG("checking nodes %i and %i\n", node, other_node)
928  if (fabs(TreeMem[node].weight-other.TreeMem[other_node].weight)>=1e-5)
929  {
930  SG_DEBUG("CTrie::compare: TreeMem[%i].weight=%f!=other.TreeMem[%i].weight=%f\n", node, TreeMem[node].weight, other_node,other.TreeMem[other_node].weight)
931  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
932  display_node(node) ;
933  SG_DEBUG("============================================================\n")
934  other.display_node(other_node) ;
935  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
936  return false ;
937  }
938 
939 #ifdef TRIE_CHECK_EVERYTHING
940  if (TreeMem[node].has_seq!=other.TreeMem[other_node].has_seq)
941  {
942  SG_DEBUG("CTrie::compare: TreeMem[%i].has_seq=%i!=other.TreeMem[%i].has_seq=%i\n", node, TreeMem[node].has_seq, other_node,other.TreeMem[other_node].has_seq)
943  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
944  display_node(node) ;
945  SG_DEBUG("============================================================\n")
946  other.display_node(other_node) ;
947  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
948  return false ;
949  }
950  if (TreeMem[node].has_floats!=other.TreeMem[other_node].has_floats)
951  {
952  SG_DEBUG("CTrie::compare: TreeMem[%i].has_floats=%i!=other.TreeMem[%i].has_floats=%i\n", node, TreeMem[node].has_floats, other_node, other.TreeMem[other_node].has_floats)
953  return false ;
954  }
955  if (other.TreeMem[other_node].has_floats)
956  {
957  for (int32_t q=0; q<4; q++)
958  if (fabs(TreeMem[node].child_weights[q]-other.TreeMem[other_node].child_weights[q])>1e-5)
959  {
960  SG_DEBUG("CTrie::compare: TreeMem[%i].child_weights[%i]=%e!=other.TreeMem[%i].child_weights[%i]=%e\n", node, q,TreeMem[node].child_weights[q], other_node,q,other.TreeMem[other_node].child_weights[q])
961  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
962  display_node(node) ;
963  SG_DEBUG("============================================================\n")
964  other.display_node(other_node) ;
965  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
966  return false ;
967  }
968  }
969  if (other.TreeMem[other_node].has_seq)
970  {
971  for (int32_t q=0; q<16; q++)
972  if ((TreeMem[node].seq[q]!=other.TreeMem[other_node].seq[q]) && ((TreeMem[node].seq[q]<4)||(other.TreeMem[other_node].seq[q]<4)))
973  {
974  SG_DEBUG("CTrie::compare: TreeMem[%i].seq[%i]=%i!=other.TreeMem[%i].seq[%i]=%i\n", node,q,TreeMem[node].seq[q], other_node,q,other.TreeMem[other_node].seq[q])
975  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
976  display_node(node) ;
977  SG_DEBUG("============================================================\n")
978  other.display_node(other_node) ;
979  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
980  return false ;
981  }
982  }
983  if (!other.TreeMem[other_node].has_seq && !other.TreeMem[other_node].has_floats)
984  {
985  for (int32_t q=0; q<4; q++)
986  {
987  if ((TreeMem[node].children[q]==NO_CHILD) && (other.TreeMem[other_node].children[q]==NO_CHILD))
988  continue ;
989  if ((TreeMem[node].children[q]==NO_CHILD)!=(other.TreeMem[other_node].children[q]==NO_CHILD))
990  {
991  SG_DEBUG("CTrie::compare: TreeMem[%i].children[%i]=%i!=other.TreeMem[%i].children[%i]=%i\n", node,q,TreeMem[node].children[q], other_node,q,other.TreeMem[other_node].children[q])
992  SG_DEBUG(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
993  display_node(node) ;
994  SG_DEBUG("============================================================\n")
995  other.display_node(other_node) ;
996  SG_DEBUG("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")
997  return false ;
998  }
999  if (!compare_traverse(abs(TreeMem[node].children[q]), other, abs(other.TreeMem[other_node].children[q])))
1000  return false ;
1001  }
1002  }
1003 #else
1004  SG_ERROR("not implemented\n")
1005 #endif
1006 
1007  return true ;
1008 }
1009 
1010  template <class Trie>
1012 {
1013  bool ret=true ;
1014  for (int32_t i=0; i<length; i++)
1015  if (!compare_traverse(trees[i], other, other.trees[i]))
1016  return false ;
1017  else
1018  SG_DEBUG("two tries at %i identical\n", i)
1019 
1020  return ret ;
1021 }
1022 
1023 template <class Trie>
1025  int32_t node, int32_t * trace, int32_t& trace_len) const
1026 {
1027 #ifdef TRIE_CHECK_EVERYTHING
1028  ASSERT(trace_len-1>=0)
1029  ASSERT((trace[trace_len-1]>=0) && (trace[trace_len-1]<TreeMemPtrMax))
1030  if (TreeMem[trace[trace_len-1]].has_seq)
1031  return false ;
1032  if (TreeMem[trace[trace_len-1]].has_floats)
1033  return false ;
1034 
1035  for (int32_t q=0; q<4; q++)
1036  {
1037  if (TreeMem[trace[trace_len-1]].children[q]==NO_CHILD)
1038  continue ;
1039  int32_t tl=trace_len+1 ;
1040  if (TreeMem[trace[trace_len-1]].children[q]>=0)
1041  trace[trace_len]=TreeMem[trace[trace_len-1]].children[q] ;
1042  else
1043  trace[trace_len]=-TreeMem[trace[trace_len-1]].children[q] ;
1044 
1045  if (trace[trace_len]==node)
1046  {
1047  trace_len=tl ;
1048  return true ;
1049  }
1050  if (find_node(node, trace, tl))
1051  {
1052  trace_len=tl ;
1053  return true ;
1054  }
1055  }
1056  trace_len=0 ;
1057  return false ;
1058 #else
1059  SG_ERROR("not implemented\n")
1060  return false ;
1061 #endif
1062 }
1063 
1064 template <class Trie>
1065 void CTrie<Trie>::display_node(int32_t node) const
1066 {
1067 #ifdef TRIE_CHECK_EVERYTHING
1068  int32_t * trace=SG_MALLOC(int32_t, 2*degree);
1069  int32_t trace_len=-1 ;
1070  bool found = false ;
1071  int32_t tree=-1 ;
1072  for (tree=0; tree<length; tree++)
1073  {
1074  trace[0]=trees[tree] ;
1075  trace_len=1 ;
1076  found=find_node(node, trace, trace_len) ;
1077  if (found)
1078  break ;
1079  }
1080  ASSERT(found)
1081  SG_PRINT("position %i trace: ", tree)
1082 
1083  for (int32_t i=0; i<trace_len-1; i++)
1084  {
1085  int32_t branch=-1 ;
1086  for (int32_t q=0; q<4; q++)
1087  if (abs(TreeMem[trace[i]].children[q])==trace[i+1])
1088  {
1089  branch=q;
1090  break ;
1091  }
1092  ASSERT(branch!=-1)
1093  char acgt[5]="ACGT" ;
1094  SG_PRINT("%c", acgt[branch])
1095  }
1096  SG_PRINT("\nnode=%i\nweight=%f\nhas_seq=%i\nhas_floats=%i\n", node, TreeMem[node].weight, TreeMem[node].has_seq, TreeMem[node].has_floats)
1097  if (TreeMem[node].has_floats)
1098  {
1099  for (int32_t q=0; q<4; q++)
1100  SG_PRINT("child_weighs[%i] = %f\n", q, TreeMem[node].child_weights[q])
1101  }
1102  if (TreeMem[node].has_seq)
1103  {
1104  for (int32_t q=0; q<16; q++)
1105  SG_PRINT("seq[%i] = %i\n", q, TreeMem[node].seq[q])
1106  }
1107  if (!TreeMem[node].has_seq && !TreeMem[node].has_floats)
1108  {
1109  for (int32_t q=0; q<4; q++)
1110  {
1111  if (TreeMem[node].children[q]!=NO_CHILD)
1112  {
1113  SG_PRINT("children[%i] = %i -> \n", q, TreeMem[node].children[q])
1114  display_node(abs(TreeMem[node].children[q])) ;
1115  }
1116  else
1117  SG_PRINT("children[%i] = NO_CHILD -| \n", q, TreeMem[node].children[q])
1118  }
1119 
1120  }
1121 
1122  SG_FREE(trace);
1123 #else
1124  SG_ERROR("not implemented\n")
1125 #endif
1126 }
1127 
1128 
1129 template <class Trie> CTrie<Trie>::~CTrie()
1130 {
1131  destroy() ;
1132 
1133  SG_FREE(TreeMem) ;
1134 }
1135 
1136 template <class Trie> void CTrie<Trie>::destroy()
1137 {
1138  if (trees!=NULL)
1139  {
1140  delete_trees();
1141  for (int32_t i=0; i<length; i++)
1142  trees[i] = NO_CHILD;
1143  SG_FREE(trees);
1144 
1145  TreeMemPtr=0;
1146  length=0;
1147  trees=NULL;
1148  }
1149 }
1150 
1151 template <class Trie> void CTrie<Trie>::set_degree(int32_t d)
1152 {
1153  delete_trees(get_use_compact_terminal_nodes());
1154  degree=d;
1155 }
1156 
1157 template <class Trie> void CTrie<Trie>::create(
1158  int32_t len, bool p_use_compact_terminal_nodes)
1159 {
1160  destroy();
1161 
1162  trees=SG_MALLOC(int32_t, len);
1163  TreeMemPtr=0 ;
1164  for (int32_t i=0; i<len; i++)
1165  trees[i]=get_node(degree==1);
1166  length = len ;
1167 
1168  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
1169 }
1170 
1171 
1172 template <class Trie> void CTrie<Trie>::delete_trees(
1173  bool p_use_compact_terminal_nodes)
1174 {
1175  if (trees==NULL)
1176  return;
1177 
1178  TreeMemPtr=0 ;
1179  for (int32_t i=0; i<length; i++)
1180  trees[i]=get_node(degree==1);
1181 
1182  use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
1183 }
1184 
1185  template <class Trie>
1187 {
1188  float64_t ret=0 ;
1189 
1190  if (tree==NO_CHILD)
1191  return 0 ;
1192  TRIE_ASSERT(tree>=0)
1193 
1194  if (depth==degree-2)
1195  {
1196  ret+=(TreeMem[tree].weight) ;
1197 
1198  for (int32_t k=0; k<4; k++)
1199  ret+=(TreeMem[tree].child_weights[k]) ;
1200 
1201  return ret ;
1202  }
1203 
1204  ret+=(TreeMem[tree].weight) ;
1205 
1206  for (int32_t i=0; i<4; i++)
1207  if (TreeMem[tree].children[i]!=NO_CHILD)
1208  ret += compute_abs_weights_tree(TreeMem[tree].children[i], depth+1) ;
1209 
1210  return ret ;
1211 }
1212 
1213 
1214  template <class Trie>
1216 {
1217  float64_t * sum=SG_MALLOC(float64_t, length*4);
1218  for (int32_t i=0; i<length*4; i++)
1219  sum[i]=0 ;
1220  len=length ;
1221 
1222  for (int32_t i=0; i<length; i++)
1223  {
1224  TRIE_ASSERT(trees[i]!=NO_CHILD)
1225  for (int32_t k=0; k<4; k++)
1226  {
1227  sum[i*4+k]=compute_abs_weights_tree(TreeMem[trees[i]].children[k], 0) ;
1228  }
1229  }
1230 
1231  return sum ;
1232 }
1233 
1234  template <class Trie>
1236  int32_t tree, int32_t i, float64_t alpha,
1237  int32_t *vec, int32_t len_rem,
1238  int32_t degree_rec, int32_t mismatch_rec,
1239  int32_t max_mismatch, float64_t * weights)
1240 {
1241  if (tree==NO_CHILD)
1242  tree=trees[i] ;
1243  TRIE_ASSERT(tree!=NO_CHILD)
1244 
1245  if ((len_rem<=0) || (mismatch_rec>max_mismatch) || (degree_rec>degree))
1246  return ;
1247  const int32_t other[4][3] = { {1,2,3},{0,2,3},{0,1,3},{0,1,2} } ;
1248 
1249  int32_t subtree = NO_CHILD ;
1250 
1251  if (degree_rec==degree-1)
1252  {
1253  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats)
1254  if (weights_in_tree)
1255  TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec];
1256  else
1257  if (weights[degree_rec]!=0.0)
1258  TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
1259  if (mismatch_rec+1<=max_mismatch)
1260  for (int32_t o=0; o<3; o++)
1261  {
1262  if (weights_in_tree)
1263  TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
1264  else
1265  if (weights[degree_rec]!=0.0)
1266  TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
1267  }
1268  return ;
1269  }
1270  else
1271  {
1272  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1273  if (TreeMem[tree].children[vec[0]]!=NO_CHILD)
1274  {
1275  subtree=TreeMem[tree].children[vec[0]] ;
1276  if (weights_in_tree)
1277  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec];
1278  else
1279  if (weights[degree_rec]!=0.0)
1280  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
1281  }
1282  else
1283  {
1284  int32_t tmp = get_node(degree_rec==degree-2);
1285  ASSERT(tmp>=0)
1286  TreeMem[tree].children[vec[0]]=tmp ;
1287  subtree=tmp ;
1288 #ifdef TRIE_CHECK_EVERYTHING
1289  if (degree_rec==degree-2)
1290  TreeMem[subtree].has_floats=true ;
1291 #endif
1292  if (weights_in_tree)
1293  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec] ;
1294  else
1295  if (weights[degree_rec]!=0.0)
1296  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec] ;
1297  else
1298  TreeMem[subtree].weight = 0.0 ;
1299  }
1300  add_example_to_tree_mismatch_recursion(subtree, i, alpha,
1301  &vec[1], len_rem-1,
1302  degree_rec+1, mismatch_rec, max_mismatch, weights) ;
1303 
1304  if (mismatch_rec+1<=max_mismatch)
1305  {
1306  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1307  for (int32_t o=0; o<3; o++)
1308  {
1309  int32_t ot = other[vec[0]][o] ;
1310  if (TreeMem[tree].children[ot]!=NO_CHILD)
1311  {
1312  subtree=TreeMem[tree].children[ot] ;
1313  if (weights_in_tree)
1314  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
1315  else
1316  if (weights[degree_rec]!=0.0)
1317  TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
1318  }
1319  else
1320  {
1321  int32_t tmp = get_node(degree_rec==degree-2);
1322  ASSERT(tmp>=0)
1323  TreeMem[tree].children[ot]=tmp ;
1324  subtree=tmp ;
1325 #ifdef TRIE_CHECK_EVERYTHING
1326  if (degree_rec==degree-2)
1327  TreeMem[subtree].has_floats=true ;
1328 #endif
1329 
1330  if (weights_in_tree)
1331  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)] ;
1332  else
1333  if (weights[degree_rec]!=0.0)
1334  TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec] ;
1335  else
1336  TreeMem[subtree].weight = 0.0 ;
1337  }
1338 
1339  add_example_to_tree_mismatch_recursion(subtree, i, alpha,
1340  &vec[1], len_rem-1,
1341  degree_rec+1, mismatch_rec+1, max_mismatch, weights) ;
1342  }
1343  }
1344  }
1345 }
1346 
1347  template <class Trie>
1349  int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
1350  int32_t max_degree, int32_t num_feat, int32_t num_sym, int32_t sym_offset,
1351  int32_t offs, float64_t* result)
1352 {
1353  if (i+j<num_feat)
1354  {
1355  float64_t decay=1.0; //no decay by default
1356  //if (j>d)
1357  // decay=pow(0.5,j); //marginalize out lower order matches
1358 
1359  if (j<degree-1)
1360  {
1361  for (int32_t k=0; k<num_sym; k++)
1362  {
1363  if (TreeMem[tree].children[k]!=NO_CHILD)
1364  {
1365  int32_t child=TreeMem[tree].children[k];
1366  //continue recursion if not yet at max_degree, else add to result
1367  if (d<max_degree-1)
1368  compute_scoring_helper(child, i, j+1, weight+decay*TreeMem[child].weight, d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
1369  else
1370  result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[child].weight;
1371 
1373  if (d==0)
1374  compute_scoring_helper(child, i, j+1, 0.0, 0, max_degree, num_feat, num_sym, sym_offset, offs, result);
1375  }
1376  }
1377  }
1378  else if (j==degree-1)
1379  {
1380  for (int32_t k=0; k<num_sym; k++)
1381  {
1382  //continue recursion if not yet at max_degree, else add to result
1383  if (d<max_degree-1 && i<num_feat-1)
1384  compute_scoring_helper(trees[i+1], i+1, 0, weight+decay*TreeMem[tree].child_weights[k], d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
1385  else
1386  result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[tree].child_weights[k];
1387  }
1388  }
1389  }
1390 }
1391 
1392  template <class Trie>
1394  int32_t tree, const int32_t p, struct TreeParseInfo info,
1395  const int32_t depth, int32_t* const x, const int32_t k)
1396 {
1397  const int32_t num_sym = info.num_sym;
1398  const int32_t y0 = info.y0;
1399  const int32_t y1 = (k==0) ? 0 : y0 - ( (depth<k) ? 0 : info.nofsKmers[k-1] * x[depth-k] );
1400  //const int32_t temp = info.substrs[depth]*num_sym - ( (depth<=k) ? 0 : info.nofsKmers[k] * x[depth-k-1] );
1401  //if( !( info.y0 == temp ) ) {
1402  // printf( "\n temp=%d y0=%d k=%d depth=%d \n", temp, info.y0, k, depth );
1403  //}
1404  //ASSERT( info.y0 == temp )
1405  int32_t sym;
1406  ASSERT( depth < degree )
1407  //ASSERT( 0 <= info.substrs[depth] && info.substrs[depth] < info.nofsKmers[k] )
1408  if (depth<degree-1)
1409  {
1410  for( sym=0; sym<num_sym; ++sym ) {
1411  const int32_t childNum = TreeMem[tree].children[ sym ];
1412  if( childNum != NO_CHILD ) {
1413  int32_t child = childNum ;
1414  x[depth] = sym;
1415  info.substrs[depth+1] = y0 + sym;
1416  info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
1417  //ASSERT( info.y0 == ( info.substrs[depth+1]*num_sym - ( (depth<k) ? 0 : info.nofsKmers[k] * x[depth-k] ) ) )
1418  count( TreeMem[child].weight, depth, info, p, x, k );
1419  traverse( child, p, info, depth+1, x, k );
1420  x[depth] = -1;
1421  }
1422  }
1423  }
1424  else if( depth == degree-1 )
1425  {
1426  for( sym=0; sym<num_sym; ++sym ) {
1427  const float64_t w = TreeMem[tree].child_weights[ sym ];
1428  if( w != 0.0 ) {
1429  x[depth] = sym;
1430  info.substrs[depth+1] = y0 + sym;
1431  info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
1432  //ASSERT( info.y0 == ( info.substrs[depth+1]*num_sym - ( (depth<k) ? 0 : info.nofsKmers[k] * x[depth-k] ) ) )
1433  count( w, depth, info, p, x, k );
1434  x[depth] = -1;
1435  }
1436  }
1437  }
1438  //info.substrs[depth+1] = -1;
1439  //info.y0 = temp;
1440 }
1441 
1442  template <class Trie>
1444  const float64_t w, const int32_t depth, const struct TreeParseInfo info,
1445  const int32_t p, int32_t* x, const int32_t k)
1446 {
1447  ASSERT( fabs(w) < 1e10 )
1448  ASSERT( x[depth] >= 0 )
1449  ASSERT( x[depth+1] < 0 )
1450  if ( depth < k ) {
1451  return;
1452  }
1453  //ASSERT( info.margFactors[ depth-k ] == pow( 0.25, depth-k ) )
1454  const int32_t nofKmers = info.nofsKmers[k];
1455  const float64_t margWeight = w * info.margFactors[ depth-k ];
1456  const int32_t m_a = depth - k + 1;
1457  const int32_t m_b = info.num_feat - p;
1458  const int32_t m = ( m_a < m_b ) ? m_a : m_b;
1459  // all proper k-substrings
1460  const int32_t offset0 = nofKmers * p;
1461  int32_t i;
1462  int32_t offset;
1463  offset = offset0;
1464  for( i = 0; i < m; ++i ) {
1465  const int32_t y = info.substrs[i+k+1];
1466  info.C_k[ y + offset ] += margWeight;
1467  offset += nofKmers;
1468  }
1469  if( depth > k ) {
1470  // k-prefix
1471  const int32_t offsR = info.substrs[k+1] + offset0;
1472  info.R_k[offsR] += margWeight;
1473  // k-suffix
1474  if( p+depth-k < info.num_feat ) {
1475  const int32_t offsL = info.substrs[depth+1] + nofKmers * (p+depth-k);
1476  info.L_k[offsL] += margWeight;
1477  }
1478  }
1479  // # N.x = substring represented by N
1480  // # N.d = length of N.x
1481  // # N.s = starting position of N.x
1482  // # N.w = weight for feature represented by N
1483  // if( N.d >= k )
1484  // margContrib = w / 4^(N.d-k)
1485  // for i = 1 to (N.d-k+1)
1486  // y = N.x[i:(i+k-1)] # overlapped k-mer
1487  // C_k[ N.s+i-1, y ] += margContrib
1488  // end;
1489  // if( N.d > k )
1490  // L_k[ N.s+N.d-k, N.x[N.d-k+(1:k)] ] += margContrib # j-suffix of N.x
1491  // R_k[ N.s, N.x[1:k] ] += margContrib # j-prefix of N.x
1492  // end;
1493  // end;
1494 }
1495 
1496  template <class Trie>
1498  int32_t i, int32_t seq_offset, int32_t * vec, float32_t alpha,
1499  float64_t *weights, bool degree_times_position_weights)
1500 {
1501  int32_t tree = trees[i] ;
1502  //ASSERT(seq_offset==0)
1503 
1504  int32_t max_depth = 0 ;
1505  float64_t* weights_column ;
1506  if (degree_times_position_weights)
1507  weights_column = &weights[(i+seq_offset)*degree] ;
1508  else
1509  weights_column = weights ;
1510 
1511  if (weights_in_tree)
1512  {
1513  for (int32_t j=0; (j<degree) && (i+j<length); j++)
1514  if (CMath::abs(weights_column[j]*alpha)>0)
1515  max_depth = j+1 ;
1516  }
1517  else
1518  // don't use the weights
1519  max_depth=degree ;
1520 
1521  for (int32_t j=0; (j<max_depth) && (i+j+seq_offset<length); j++)
1522  {
1523  TRIE_ASSERT((vec[i+j+seq_offset]>=0) && (vec[i+j+seq_offset]<4))
1524  if ((j<degree-1) && (TreeMem[tree].children[vec[i+j+seq_offset]]!=NO_CHILD))
1525  {
1526  if (TreeMem[tree].children[vec[i+j+seq_offset]]<0)
1527  {
1528  // special treatment of the next nodes
1529  TRIE_ASSERT(j >= degree-16)
1530  // get the right element
1531  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1532  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1533  int32_t node = - TreeMem[tree].children[vec[i+j+seq_offset]] ;
1534 
1535  TRIE_ASSERT((node>=0) && (node<=TreeMemPtrMax))
1536  TRIE_ASSERT_EVERYTHING(TreeMem[node].has_seq)
1537  TRIE_ASSERT_EVERYTHING(!TreeMem[node].has_floats)
1538 
1539  // check whether the same string is stored
1540  int32_t mismatch_pos = -1 ;
1541  {
1542  int32_t k ;
1543  for (k=0; (j+k<max_depth) && (i+j+seq_offset+k<length); k++)
1544  {
1545  TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4))
1546  // ###
1547  if ((TreeMem[node].seq[k]>=4) && (TreeMem[node].seq[k]!=TRIE_TERMINAL_CHARACTER))
1548  fprintf(stderr, "+++i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[k]) ;
1549  TRIE_ASSERT((TreeMem[node].seq[k]<4) || (TreeMem[node].seq[k]==TRIE_TERMINAL_CHARACTER))
1550  TRIE_ASSERT(k<16)
1551  if (TreeMem[node].seq[k]!=vec[i+j+seq_offset+k])
1552  {
1553  mismatch_pos=k ;
1554  break ;
1555  }
1556  }
1557  }
1558 
1559  // what happens when the .seq sequence is longer than vec? should we branch???
1560 
1561  if (mismatch_pos==-1)
1562  // if so, then just increase the weight by alpha and stop
1563  TreeMem[node].weight+=alpha ;
1564  else
1565  // otherwise
1566  // 1. replace current node with new node
1567  // 2. create new nodes until mismatching positon
1568  // 2. add a branch with old string (old node) and the new string (new node)
1569  {
1570  // replace old node
1571  int32_t last_node=tree ;
1572 
1573  // create new nodes until mismatch
1574  int32_t k ;
1575  for (k=0; k<mismatch_pos; k++)
1576  {
1577  TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4))
1578  TRIE_ASSERT(vec[i+j+seq_offset+k]==TreeMem[node].seq[k])
1579 
1580  int32_t tmp=get_node();
1581  TreeMem[last_node].children[vec[i+j+seq_offset+k]]=tmp ;
1582  last_node=tmp ;
1583  if (weights_in_tree)
1584  TreeMem[last_node].weight = (TreeMem[node].weight+alpha)*weights_column[j+k] ;
1585  else
1586  TreeMem[last_node].weight = (TreeMem[node].weight+alpha) ;
1587  TRIE_ASSERT(j+k!=degree-1)
1588  }
1589  if ((TreeMem[node].seq[mismatch_pos]>=4) && (TreeMem[node].seq[mismatch_pos]!=TRIE_TERMINAL_CHARACTER))
1590  fprintf(stderr, "**i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[mismatch_pos]) ;
1591  ASSERT((TreeMem[node].seq[mismatch_pos]<4) || (TreeMem[node].seq[mismatch_pos]==TRIE_TERMINAL_CHARACTER))
1592  TRIE_ASSERT(vec[i+j+seq_offset+mismatch_pos]!=TreeMem[node].seq[mismatch_pos])
1593 
1594  if (j+k==degree-1)
1595  {
1596  // init child weights with zero if after dropping out
1597  // of the k<mismatch_pos loop we are one level below degree
1598  // (keep this even after get_node() change!)
1599  for (int32_t q=0; q<4; q++)
1600  TreeMem[last_node].child_weights[q]=0.0 ;
1601  if (weights_in_tree)
1602  {
1603  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1604  TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]+=TreeMem[node].weight*weights_column[degree-1] ;
1605  TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] += alpha*weights_column[degree-1] ;
1606  }
1607  else
1608  {
1609  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1610  TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]=TreeMem[node].weight ;
1611  TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] = alpha ;
1612  }
1613 
1614 #ifdef TRIE_CHECK_EVERYTHING
1615  TreeMem[last_node].has_floats=true ;
1616 #endif
1617  }
1618  else
1619  {
1620  // the branch for the existing string
1621  if (TreeMem[node].seq[mismatch_pos]<4) // i.e. !=TRIE_TERMINAL_CHARACTER
1622  {
1623  TreeMem[last_node].children[TreeMem[node].seq[mismatch_pos]] = -node ;
1624 
1625  // move string by mismatch_pos positions
1626  for (int32_t q=0; q<16; q++)
1627  {
1628  if ((j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length))
1629  TreeMem[node].seq[q] = TreeMem[node].seq[q+mismatch_pos] ;
1630  else
1631  TreeMem[node].seq[q] = TRIE_TERMINAL_CHARACTER ;
1632  }
1633 #ifdef TRIE_CHECK_EVERYTHING
1634  TreeMem[node].has_seq=true ;
1635 #endif
1636  }
1637 
1638  // the new branch
1639  TRIE_ASSERT((vec[i+j+seq_offset+mismatch_pos]>=0) && (vec[i+j+seq_offset+mismatch_pos]<4))
1640  int32_t tmp = get_node() ;
1641  TreeMem[last_node].children[vec[i+j+seq_offset+mismatch_pos]] = -tmp ;
1642  last_node=tmp ;
1643 
1644  TreeMem[last_node].weight = alpha ;
1645 #ifdef TRIE_CHECK_EVERYTHING
1646  TreeMem[last_node].has_seq = true ;
1647 #endif
1648  memset(TreeMem[last_node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
1649  for (int32_t q=0; (j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length); q++)
1650  TreeMem[last_node].seq[q] = vec[i+j+seq_offset+mismatch_pos+q] ;
1651  }
1652  }
1653  break ;
1654  }
1655  else
1656  {
1657  tree=TreeMem[tree].children[vec[i+j+seq_offset]] ;
1658  TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax))
1659  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1660  if (weights_in_tree)
1661  TreeMem[tree].weight += alpha*weights_column[j];
1662  else
1663  TreeMem[tree].weight += alpha ;
1664  }
1665  }
1666  else if (j==degree-1)
1667  {
1668  // special treatment of the last node
1669  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1670  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats)
1671  if (weights_in_tree)
1672  TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha*weights_column[j] ;
1673  else
1674  TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha;
1675 
1676  break;
1677  }
1678  else
1679  {
1680  bool use_seq = use_compact_terminal_nodes && (j>degree-16) ;
1681  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1682  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1683 
1684  int32_t tmp = get_node((j==degree-2) && (!use_seq));
1685  if (use_seq)
1686  TreeMem[tree].children[vec[i+j+seq_offset]] = -tmp ;
1687  else
1688  TreeMem[tree].children[vec[i+j+seq_offset]] = tmp ;
1689  tree=tmp ;
1690 
1691  TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax))
1692 #ifdef TRIE_CHECK_EVERYTHING
1693  TreeMem[tree].has_seq = use_seq ;
1694 #endif
1695  if (use_seq)
1696  {
1697  TreeMem[tree].weight = alpha ;
1698  // important to have the terminal characters (see ###)
1699  memset(TreeMem[tree].seq, TRIE_TERMINAL_CHARACTER, 16) ;
1700  for (int32_t q=0; (j+q<degree) && (i+j+seq_offset+q<length); q++)
1701  {
1702  TRIE_ASSERT(q<16)
1703  TreeMem[tree].seq[q]=vec[i+j+seq_offset+q] ;
1704  }
1705  break ;
1706  }
1707  else
1708  {
1709  if (weights_in_tree)
1710  TreeMem[tree].weight = alpha*weights_column[j] ;
1711  else
1712  TreeMem[tree].weight = alpha ;
1713 #ifdef TRIE_CHECK_EVERYTHING
1714  if (j==degree-2)
1715  TreeMem[tree].has_floats = true ;
1716 #endif
1717  }
1718  }
1719  }
1720 }
1721 
1722  template <class Trie>
1724  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
1725  int32_t weight_pos, float64_t* weights,
1726  bool degree_times_position_weights)
1727 {
1728  int32_t tree = trees[tree_pos] ;
1729 
1730  if ((position_weights!=NULL) && (position_weights[weight_pos]==0))
1731  return 0.0;
1732 
1733  float64_t *weights_column=NULL ;
1734  if (degree_times_position_weights)
1735  weights_column=&weights[weight_pos*degree] ;
1736  else // weights is a vector (1 x degree)
1737  weights_column=weights ;
1738 
1739  float64_t sum=0 ;
1740  for (int32_t j=0; seq_pos+j < len; j++)
1741  {
1742  TRIE_ASSERT((vec[seq_pos+j]<4) && (vec[seq_pos+j]>=0))
1743 
1744  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1745  {
1746  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1747  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1748  {
1749  tree = - TreeMem[tree].children[vec[seq_pos+j]];
1750  TRIE_ASSERT(tree>=0)
1751  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1752  float64_t this_weight=0.0 ;
1753  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1754  {
1755  TRIE_ASSERT((vec[seq_pos+j+k]<4) && (vec[seq_pos+j+k]>=0))
1756  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1757  break ;
1758  this_weight += weights_column[j+k] ;
1759  }
1760  sum += TreeMem[tree].weight * this_weight ;
1761  break ;
1762  }
1763  else
1764  {
1765  tree=TreeMem[tree].children[vec[seq_pos+j]];
1766  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1767  if (weights_in_tree)
1768  sum += TreeMem[tree].weight ;
1769  else
1770  sum += TreeMem[tree].weight * weights_column[j] ;
1771  } ;
1772  }
1773  else
1774  {
1775  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1776  if (j==degree-1)
1777  {
1778  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats)
1779  if (weights_in_tree)
1780  sum += TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1781  else
1782  sum += TreeMem[tree].child_weights[vec[seq_pos+j]] * weights_column[j] ;
1783  }
1784  else
1785  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats)
1786 
1787  break;
1788  }
1789  }
1790 
1791  if (position_weights!=NULL)
1792  return sum*position_weights[weight_pos] ;
1793  else
1794  return sum ;
1795 }
1796 
1797  template <class Trie>
1799  int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
1800  int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
1801  int32_t mkl_stepsize, float64_t * weights,
1802  bool degree_times_position_weights)
1803 {
1804  int32_t tree = trees[tree_pos] ;
1805  if (factor==0)
1806  return ;
1807 
1808  if (position_weights!=NULL)
1809  {
1810  factor *= position_weights[weight_pos] ;
1811  if (factor==0)
1812  return ;
1813  if (!degree_times_position_weights) // with position_weigths, weights is a vector (1 x degree)
1814  {
1815  for (int32_t j=0; seq_pos+j<len; j++)
1816  {
1817  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1818  {
1819  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1820  {
1821  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1822  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1823  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1824  {
1825  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1826  break ;
1827  if (weights_in_tree)
1828  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1829  else
1830  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
1831  }
1832  break ;
1833  }
1834  else
1835  {
1836  tree=TreeMem[tree].children[vec[seq_pos+j]];
1837  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1838  if (weights_in_tree)
1839  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1840  else
1841  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
1842  }
1843  }
1844  else
1845  {
1846  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1847  if (j==degree-1)
1848  {
1849  if (weights_in_tree)
1850  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1851  else
1852  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
1853  }
1854  }
1855  }
1856  }
1857  else // with position_weigths, weights is a matrix (len x degree)
1858  {
1859  for (int32_t j=0; seq_pos+j<len; j++)
1860  {
1861  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1862  {
1863  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1864  {
1865  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1866  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1867  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1868  {
1869  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1870  break ;
1871  if (weights_in_tree)
1872  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1873  else
1874  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
1875  }
1876  break ;
1877  }
1878  else
1879  {
1880  tree=TreeMem[tree].children[vec[seq_pos+j]];
1881  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1882  if (weights_in_tree)
1883  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
1884  else
1885  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+weight_pos*degree] ;
1886  }
1887  }
1888  else
1889  {
1890  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1891  if (j==degree-1)
1892  {
1893  if (weights_in_tree)
1894  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1895  else
1896  LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j+weight_pos*degree] ;
1897  }
1898  break ;
1899  }
1900  }
1901  }
1902  }
1903  else if (!degree_times_position_weights) // no position_weigths, weights is a vector (1 x degree)
1904  {
1905  for (int32_t j=0; seq_pos+j<len; j++)
1906  {
1907  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1908  {
1909  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1910  {
1911  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1912  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1913  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1914  {
1915  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1916  break ;
1917  if (weights_in_tree)
1918  LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight ;
1919  else
1920  LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
1921  }
1922  break ;
1923  }
1924  else
1925  {
1926  tree=TreeMem[tree].children[vec[seq_pos+j]];
1927  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1928  if (weights_in_tree)
1929  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight ;
1930  else
1931  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
1932  }
1933  }
1934  else
1935  {
1936  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1937  if (j==degree-1)
1938  {
1939  if (weights_in_tree)
1940  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
1941  else
1942  LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
1943  }
1944  break ;
1945  }
1946  }
1947  }
1948  else // no position_weigths, weights is a matrix (len x degree)
1949  {
1950  /*if (!position_mask)
1951  {
1952  position_mask = SG_MALLOC(bool, len);
1953  for (int32_t i=0; i<len; i++)
1954  {
1955  position_mask[i]=false ;
1956 
1957  for (int32_t j=0; j<degree; j++)
1958  if (weights[i*degree+j]!=0.0)
1959  {
1960  position_mask[i]=true ;
1961  break ;
1962  }
1963  }
1964  }
1965  if (position_mask[weight_pos]==0)
1966  return ;*/
1967 
1968  for (int32_t j=0; seq_pos+j<len; j++)
1969  {
1970  if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
1971  {
1972  if (TreeMem[tree].children[vec[seq_pos+j]]<0)
1973  {
1974  tree = -TreeMem[tree].children[vec[seq_pos+j]];
1975  TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq)
1976  for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
1977  {
1978  if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
1979  break ;
1980  if (weights_in_tree)
1981  LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight ;
1982  else
1983  LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
1984  }
1985  break ;
1986  }
1987  else
1988  {
1989  tree=TreeMem[tree].children[vec[seq_pos+j]];
1990  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
1991  if (weights_in_tree)
1992  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight ;
1993  else
1994  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight * weights[j+weight_pos*degree] ;
1995  }
1996  }
1997  else
1998  {
1999  TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq)
2000  if (j==degree-1)
2001  {
2002  if (weights_in_tree)
2003  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] ;
2004  else
2005  LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] * weights[j+weight_pos*degree] ;
2006  }
2007  break ;
2008  }
2009  }
2010  }
2011 }
2012 
2013  template <class Trie>
2015  Trie* tree, int32_t depth, uint64_t seq, float64_t value,
2016  DynArray<ConsensusEntry>* table, float64_t* weights)
2017 {
2018  float64_t w=1.0;
2019 
2020  if (weights_in_tree || depth==0)
2021  value+=tree->weight;
2022  else
2023  {
2024  w=weights[degree-1];
2025  value+=weights[depth-1]*tree->weight;
2026  }
2027 
2028  if (degree-1==depth)
2029  {
2030  for (int32_t sym=0; sym<4; sym++)
2031  {
2032  float64_t v=w*tree->child_weights[sym];
2033  if (v!=0.0)
2034  {
2035  ConsensusEntry entry;
2036  entry.bt=-1;
2037  entry.score=value+v;
2038  entry.string=seq | ((uint64_t) sym) << (2*(degree-depth-1));
2039 
2040  table->append_element(entry);
2041  }
2042  }
2043  }
2044  else
2045  {
2046  for (int32_t sym=0; sym<4; sym++)
2047  {
2048  uint64_t str=seq | ((uint64_t) sym) << (2*(degree-depth-1));
2049  if (tree->children[sym] != NO_CHILD)
2050  fill_backtracking_table_recursion(&TreeMem[tree->children[sym]], depth+1, str, value, table, weights);
2051  }
2052  }
2053 }
2054 
2055  template <class Trie>
2057  int32_t pos, uint64_t seq, int32_t deg, float64_t* weights)
2058 {
2059  float64_t result=0.0;
2060 
2061  //SG_PRINT("pos:%i length:%i deg:%i seq:0x%0llx...\n", pos, length, deg, seq)
2062 
2063  for (int32_t i=pos; i<pos+deg && i<length; i++)
2064  {
2065  //SG_PRINT("loop %d\n", i)
2066  Trie* tree = &TreeMem[trees[i]];
2067 
2068  for (int32_t d=0; d<deg-i+pos; d++)
2069  {
2070  //SG_PRINT("loop degree %d shit: %d\n", d, (2*(deg-1-d-i+pos)))
2071  ASSERT(d-1<degree)
2072  int32_t sym = (int32_t) (seq >> (2*(deg-1-d-i+pos)) & 3);
2073 
2074  float64_t w=1.0;
2075  if (!weights_in_tree)
2076  w=weights[d];
2077 
2078  ASSERT(tree->children[sym] != NO_CHILD)
2079  tree=&TreeMem[tree->children[sym]];
2080  result+=w*tree->weight;
2081  }
2082  }
2083  //SG_PRINT("cum: %f\n", result)
2084  return result;
2085 }
2086 
2087  template <class Trie>
2089  int32_t pos, DynArray<ConsensusEntry>* prev,
2090  DynArray<ConsensusEntry>* cur, bool cumulative, float64_t* weights)
2091 {
2092  ASSERT(pos>=0 && pos<length)
2093  ASSERT(!use_compact_terminal_nodes)
2094 
2095  Trie* t = &TreeMem[trees[pos]];
2096 
2097  fill_backtracking_table_recursion(t, 0, (uint64_t) 0, 0.0, cur, weights);
2098 
2099 
2100  if (cumulative)
2101  {
2102  int32_t num_cur=cur->get_num_elements();
2103  for (int32_t i=0; i<num_cur; i++)
2104  {
2105  ConsensusEntry entry=cur->get_element(i);
2106  entry.score+=get_cumulative_score(pos+1, entry.string, degree-1, weights);
2107  cur->set_element(entry,i);
2108  //SG_PRINT("cum: str:0%0llx sc:%f bt:%d\n",entry.string,entry.score,entry.bt)
2109  }
2110  }
2111 
2112  //if previous tree exists find maximum scoring path
2113  //for each element in cur and update bt table
2114  if (prev)
2115  {
2116  int32_t num_cur=cur->get_num_elements();
2117  int32_t num_prev=prev->get_num_elements();
2118 
2119  for (int32_t i=0; i<num_cur; i++)
2120  {
2121  //uint64_t str_cur_old= cur->get_element(i).string;
2122  uint64_t str_cur= cur->get_element(i).string >> 2;
2123  //SG_PRINT("...cur:0x%0llx cur_noprfx:0x%0llx...\n", str_cur_old, str_cur)
2124 
2125  int32_t bt=-1;
2126  float64_t max_score=0.0;
2127 
2128  for (int32_t j=0; j<num_prev; j++)
2129  {
2130  //uint64_t str_prev_old= prev->get_element(j).string;
2131  uint64_t mask=
2132  ((((uint64_t)0)-1) ^ (((uint64_t) 3) << (2*(degree-1))));
2133  uint64_t str_prev= mask & prev->get_element(j).string;
2134  //SG_PRINT("...prev:0x%0llx prev_nosfx:0x%0llx mask:%0llx...\n", str_prev_old, str_prev,mask)
2135 
2136  if (str_cur == str_prev)
2137  {
2138  float64_t sc=prev->get_element(j).score+cur->get_element(i).score;
2139  if (bt==-1 || sc>max_score)
2140  {
2141  bt=j;
2142  max_score=sc;
2143 
2144  //SG_PRINT("new_max[%i,%i] = %f\n", j,i, max_score)
2145  }
2146  }
2147  }
2148 
2149  ASSERT(bt!=-1)
2150  ConsensusEntry entry;
2151  entry.bt=bt;
2152  entry.score=max_score;
2153  entry.string=cur->get_element(i).string;
2154  cur->set_element(entry, i);
2155  //SG_PRINT("entry[%d]: str:0%0llx sc:%f bt:%d\n",i, entry.string,entry.score,entry.bt)
2156  }
2157  }
2158 }
2159 }
2160 #endif // _TRIE_H___

SHOGUN Machine Learning Toolbox - Documentation