00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _TRIE_H___
00013 #define _TRIE_H___
00014
00015 #include <string.h>
00016 #include "lib/common.h"
00017 #include "lib/io.h"
00018 #include "base/DynArray.h"
00019 #include "lib/Mathematics.h"
00020 #include "base/SGObject.h"
00021
00022 namespace shogun
00023 {
00024 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00025
00026
00027 #define NO_CHILD ((int32_t)-1073741824)
00028
00029 #define WEIGHTS_IN_TRIE
00030
00031
00032 #ifdef TRIE_CHECK_EVERYTHING
00033 #define TRIE_ASSERT_EVERYTHING(x) ASSERT(x)
00034 #else
00035 #define TRIE_ASSERT_EVERYTHING(x)
00036 #endif
00037
00038
00039 #define TRIE_ASSERT(x)
00040
00041 #define TRIE_TERMINAL_CHARACTER 7
00042
00044 struct ConsensusEntry
00045 {
00047 uint64_t string;
00049 float32_t score;
00051 int32_t bt;
00052 };
00053
00055 struct POIMTrie
00056 {
00058 float64_t weight;
00059 #ifdef TRIE_CHECK_EVERYTHING
00060
00061 bool has_seq;
00063 bool has_floats;
00064 #endif
00065 union
00066 {
00068 float32_t child_weights[4];
00070 int32_t children[4];
00072 uint8_t seq[16] ;
00073 };
00074
00076 float64_t S;
00078 float64_t L;
00080 float64_t R;
00081 };
00082
00084 struct DNATrie
00085 {
00087 float64_t weight;
00088 #ifdef TRIE_CHECK_EVERYTHING
00089
00090 bool has_seq;
00092 bool has_floats;
00093 #endif
00094 union
00095 {
00097 float32_t child_weights[4];
00099 int32_t children[4];
00101 uint8_t seq[16] ;
00102 };
00103 };
00104
00106 struct TreeParseInfo {
00108 int32_t num_sym;
00110 int32_t num_feat;
00112 int32_t p;
00114 int32_t k;
00116 int32_t* nofsKmers;
00118 float64_t* margFactors;
00120 int32_t* x;
00122 int32_t* substrs;
00124 int32_t y0;
00126 float64_t* C_k;
00128 float64_t* L_k;
00130 float64_t* R_k;
00131 };
00132
00133 #endif // DOXYGEN_SHOULD_SKIP_THIS
00134
00135 template <class Trie> class CTrie;
00136
00155 #define IGNORE_IN_CLASSLIST
00156 IGNORE_IN_CLASSLIST template <class Trie> class CTrie : public CSGObject
00157 {
00158 public:
00160 CTrie();
00161
00168 CTrie(int32_t d, bool p_use_compact_terminal_nodes=true);
00169
00171 CTrie(const CTrie & to_copy);
00172 virtual ~CTrie();
00173
00175 const CTrie & operator=(const CTrie & to_copy);
00176
00184 bool compare_traverse(
00185 int32_t node, const CTrie & other, int32_t other_node);
00186
00192 bool compare(const CTrie & other);
00193
00200 bool find_node(int32_t node, int32_t * trace, int32_t &trace_len) const;
00201
00208 int32_t find_deepest_node(
00209 int32_t start_node, int32_t &deepest_node) const;
00210
00215 void display_node(int32_t node) const;
00216
00218 void destroy();
00219
00224 void set_degree(int32_t d);
00225
00232 void create(int32_t len, bool p_use_compact_terminal_nodes=true);
00233
00239 void delete_trees(bool p_use_compact_terminal_nodes=true);
00240
00251 void add_to_trie(
00252 int32_t i, int32_t seq_offset, int32_t* vec, float32_t alpha,
00253 float64_t *weights, bool degree_times_position_weights);
00254
00261 float64_t compute_abs_weights_tree(int32_t tree, int32_t depth);
00262
00268 float64_t* compute_abs_weights(int32_t &len);
00269
00282 float64_t compute_by_tree_helper(
00283 int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
00284 int32_t weight_pos, float64_t * weights,
00285 bool degree_times_position_weights) ;
00286
00301 void compute_by_tree_helper(
00302 int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
00303 int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
00304 int32_t mkl_stepsize, float64_t * weights,
00305 bool degree_times_position_weights);
00306
00321 void compute_scoring_helper(
00322 int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
00323 int32_t max_degree, int32_t num_feat, int32_t num_sym,
00324 int32_t sym_offset, int32_t offs, float64_t* result);
00325
00338 void add_example_to_tree_mismatch_recursion(
00339 int32_t tree, int32_t i, float64_t alpha, int32_t *vec,
00340 int32_t len_rem, int32_t degree_rec, int32_t mismatch_rec,
00341 int32_t max_mismatch, float64_t * weights);
00342
00352 void traverse(
00353 int32_t tree, const int32_t p, struct TreeParseInfo info,
00354 const int32_t depth, int32_t* const x, const int32_t k);
00355
00365 void count(
00366 const float64_t w, const int32_t depth,
00367 const struct TreeParseInfo info, const int32_t p, int32_t* x,
00368 const int32_t k);
00369
00376 int32_t compact_nodes(int32_t start_node, int32_t depth, float64_t * weights);
00377
00386 float64_t get_cumulative_score(
00387 int32_t pos, uint64_t seq, int32_t deg, float64_t* weights);
00388
00398 void fill_backtracking_table_recursion(
00399 Trie* tree, int32_t depth, uint64_t seq, float64_t value,
00400 DynArray<ConsensusEntry>* table, float64_t* weights);
00401
00410 void fill_backtracking_table(
00411 int32_t pos, DynArray<ConsensusEntry>* prev,
00412 DynArray<ConsensusEntry>* cur, bool cumulative,
00413 float64_t* weights);
00414
00420 void POIMs_extract_W(float64_t* const* const W, const int32_t K);
00421
00426 void POIMs_precalc_SLR(const float64_t* const distrib);
00427
00437 void POIMs_get_SLR(
00438 const int32_t parentIdx, const int32_t sym, const int32_t depth,
00439 float64_t* S, float64_t* L, float64_t* R);
00440
00447 void POIMs_add_SLR(
00448 float64_t* const* const poims, const int32_t K,
00449 const int32_t debug);
00450
00455 inline bool get_use_compact_terminal_nodes()
00456 {
00457 return use_compact_terminal_nodes ;
00458 }
00459
00465 inline void set_use_compact_terminal_nodes(
00466 bool p_use_compact_terminal_nodes)
00467 {
00468 use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
00469 }
00470
00475 inline int32_t get_num_used_nodes()
00476 {
00477 return TreeMemPtr;
00478 }
00479
00484 inline void set_position_weights(const float64_t * p_position_weights)
00485 {
00486 position_weights=p_position_weights;
00487 }
00488
00493 inline int32_t get_node(bool last_node=false)
00494 {
00495 int32_t ret = TreeMemPtr++;
00496 check_treemem() ;
00497
00498 if (last_node)
00499 {
00500 for (int32_t q=0; q<4; q++)
00501 TreeMem[ret].child_weights[q]=0.0;
00502 }
00503 else
00504 {
00505 for (int32_t q=0; q<4; q++)
00506 TreeMem[ret].children[q]=NO_CHILD;
00507 }
00508 #ifdef TRIE_CHECK_EVERYTHING
00509 TreeMem[ret].has_seq=false ;
00510 TreeMem[ret].has_floats=false ;
00511 #endif
00512 TreeMem[ret].weight=0.0;
00513 return ret ;
00514 }
00515
00517 inline void check_treemem()
00518 {
00519 if (TreeMemPtr+10 < TreeMemPtrMax)
00520 return;
00521 SG_DEBUG( "Extending TreeMem from %i to %i elements\n",
00522 TreeMemPtrMax, (int32_t) ((float64_t)TreeMemPtrMax*1.2));
00523 TreeMemPtrMax = (int32_t) ((float64_t)TreeMemPtrMax*1.2);
00524 TreeMem = (Trie*) realloc(TreeMem,
00525 TreeMemPtrMax*sizeof(Trie));
00526 if (!TreeMem)
00527 SG_ERROR( "out of memory\n");
00528 }
00529
00534 inline void set_weights_in_tree(bool weights_in_tree_)
00535 {
00536 weights_in_tree = weights_in_tree_;
00537 }
00538
00543 inline bool get_weights_in_tree()
00544 {
00545 return weights_in_tree;
00546 }
00547
00557 void POIMs_extract_W_helper(
00558 const int32_t nodeIdx, const int32_t depth, const int32_t offset,
00559 const int32_t y0, float64_t* const* const W, const int32_t K);
00560
00573 void POIMs_calc_SLR_helper1(
00574 const float64_t* const distrib, const int32_t i,
00575 const int32_t nodeIdx, int32_t left_tries_idx[4],
00576 const int32_t depth, int32_t const lastSym, float64_t* S,
00577 float64_t* L, float64_t* R);
00578
00579
00590 void POIMs_calc_SLR_helper2(
00591 const float64_t* const distrib, const int32_t i,
00592 const int32_t nodeIdx, int32_t left_tries_idx[4],
00593 const int32_t depth, float64_t* S, float64_t* L, float64_t* R);
00594
00605 void POIMs_add_SLR_helper1(
00606 const int32_t nodeIdx, const int32_t depth,const int32_t i,
00607 const int32_t y0, float64_t* const* const poims, const int32_t K,
00608 const int32_t debug);
00609
00623 void POIMs_add_SLR_helper2(
00624 float64_t* const* const poims, const int32_t K, const int32_t k,
00625 const int32_t i, const int32_t y, const float64_t valW,
00626 const float64_t valS, const float64_t valL, const float64_t valR,
00627 const int32_t debug);
00628
00630 inline virtual const char* get_name() const { return "Trie"; }
00631
00632 public:
00634 int32_t NUM_SYMS;
00635
00636 protected:
00638 int32_t length;
00640 int32_t * trees;
00641
00643 int32_t degree;
00645 float64_t const * position_weights;
00646
00648 Trie* TreeMem;
00650 int32_t TreeMemPtr;
00652 int32_t TreeMemPtrMax;
00654 bool use_compact_terminal_nodes;
00655
00657 bool weights_in_tree;
00658
00660 int32_t* nofsKmers;
00661 };
00662 template <class Trie>
00663 CTrie<Trie>::CTrie()
00664 : CSGObject(), degree(0), position_weights(NULL),
00665 use_compact_terminal_nodes(false),
00666 weights_in_tree(true)
00667 {
00668
00669 TreeMemPtrMax=0;
00670 TreeMemPtr=0;
00671 TreeMem=NULL;
00672
00673 length=0;
00674 trees=NULL;
00675
00676 NUM_SYMS=4;
00677 }
00678
00679 template <class Trie>
00680 CTrie<Trie>::CTrie(int32_t d, bool p_use_compact_terminal_nodes)
00681 : CSGObject(), degree(d), position_weights(NULL),
00682 use_compact_terminal_nodes(p_use_compact_terminal_nodes),
00683 weights_in_tree(true)
00684 {
00685 TreeMemPtrMax=1024*1024/sizeof(Trie);
00686 TreeMemPtr=0;
00687 TreeMem=(Trie*)malloc(TreeMemPtrMax*sizeof(Trie));
00688
00689 length=0;
00690 trees=NULL;
00691
00692 NUM_SYMS=4;
00693 }
00694
00695 template <class Trie>
00696 CTrie<Trie>::CTrie(const CTrie & to_copy)
00697 : CSGObject(to_copy), degree(to_copy.degree), position_weights(NULL),
00698 use_compact_terminal_nodes(to_copy.use_compact_terminal_nodes)
00699 {
00700 if (to_copy.position_weights!=NULL)
00701 {
00702 position_weights = to_copy.position_weights;
00703
00704
00705
00706 }
00707 else
00708 position_weights=NULL;
00709
00710 TreeMemPtrMax=to_copy.TreeMemPtrMax;
00711 TreeMemPtr=to_copy.TreeMemPtr;
00712 TreeMem=(Trie*)malloc(TreeMemPtrMax*sizeof(Trie));
00713 memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie));
00714
00715 length=to_copy.length;
00716 trees=new int32_t[length];
00717 for (int32_t i=0; i<length; i++)
00718 trees[i]=to_copy.trees[i];
00719
00720 NUM_SYMS=4;
00721 }
00722
00723 template <class Trie>
00724 const CTrie<Trie> &CTrie<Trie>::operator=(const CTrie<Trie> & to_copy)
00725 {
00726 degree=to_copy.degree ;
00727 use_compact_terminal_nodes=to_copy.use_compact_terminal_nodes ;
00728
00729 delete[] position_weights ;
00730 position_weights=NULL ;
00731 if (to_copy.position_weights!=NULL)
00732 {
00733 position_weights=to_copy.position_weights ;
00734
00735
00736
00737 }
00738 else
00739 position_weights=NULL ;
00740
00741 TreeMemPtrMax=to_copy.TreeMemPtrMax ;
00742 TreeMemPtr=to_copy.TreeMemPtr ;
00743 free(TreeMem) ;
00744 TreeMem = (Trie*)malloc(TreeMemPtrMax*sizeof(Trie)) ;
00745 memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie)) ;
00746
00747 length = to_copy.length ;
00748 if (trees)
00749 delete[] trees ;
00750 trees=new int32_t[length] ;
00751 for (int32_t i=0; i<length; i++)
00752 trees[i]=to_copy.trees[i] ;
00753
00754 return *this ;
00755 }
00756
00757 template <class Trie>
00758 int32_t CTrie<Trie>::find_deepest_node(
00759 int32_t start_node, int32_t& deepest_node) const
00760 {
00761 #ifdef TRIE_CHECK_EVERYTHING
00762 int32_t ret=0 ;
00763 SG_DEBUG("start_node=%i\n", start_node) ;
00764
00765 if (start_node==NO_CHILD)
00766 {
00767 for (int32_t i=0; i<length; i++)
00768 {
00769 int32_t my_deepest_node ;
00770 int32_t depth=find_deepest_node(i, my_deepest_node) ;
00771 SG_DEBUG("start_node %i depth=%i\n", i, depth) ;
00772 if (depth>ret)
00773 {
00774 deepest_node=my_deepest_node ;
00775 ret=depth ;
00776 }
00777 }
00778 return ret ;
00779 }
00780
00781 if (TreeMem[start_node].has_seq)
00782 {
00783 for (int32_t q=0; q<16; q++)
00784 if (TreeMem[start_node].seq[q]!=TRIE_TERMINAL_CHARACTER)
00785 ret++ ;
00786 deepest_node=start_node ;
00787 return ret ;
00788 }
00789 if (TreeMem[start_node].has_floats)
00790 {
00791 deepest_node=start_node ;
00792 return 1 ;
00793 }
00794
00795 for (int32_t q=0; q<4; q++)
00796 {
00797 int32_t my_deepest_node ;
00798 if (TreeMem[start_node].children[q]==NO_CHILD)
00799 continue ;
00800 int32_t depth=find_deepest_node(abs(TreeMem[start_node].children[q]), my_deepest_node) ;
00801 if (depth>ret)
00802 {
00803 deepest_node=my_deepest_node ;
00804 ret=depth ;
00805 }
00806 }
00807 return ret ;
00808 #else
00809 SG_ERROR( "not implemented\n") ;
00810 return 0 ;
00811 #endif
00812 }
00813
00814 template <class Trie>
00815 int32_t CTrie<Trie>::compact_nodes(
00816 int32_t start_node, int32_t depth, float64_t * weights)
00817 {
00818 SG_ERROR( "code buggy\n") ;
00819
00820 int32_t ret=0 ;
00821
00822 if (start_node==NO_CHILD)
00823 {
00824 for (int32_t i=0; i<length; i++)
00825 compact_nodes(i,1, weights) ;
00826 return 0 ;
00827 }
00828 if (start_node<0)
00829 return -1 ;
00830
00831 if (depth==degree-1)
00832 {
00833 TRIE_ASSERT_EVERYTHING(TreeMem[start_node].has_floats) ;
00834 int32_t num_used=0 ;
00835 for (int32_t q=0; q<4; q++)
00836 if (TreeMem[start_node].child_weights[q]!=0.0)
00837 num_used++ ;
00838 if (num_used>1)
00839 return -1 ;
00840 return 1 ;
00841 }
00842 TRIE_ASSERT_EVERYTHING(!TreeMem[start_node].has_floats) ;
00843
00844 int32_t num_used = 0 ;
00845 int32_t q_used=-1 ;
00846
00847 for (int32_t q=0; q<4; q++)
00848 {
00849 if (TreeMem[start_node].children[q]==NO_CHILD)
00850 continue ;
00851 num_used++ ;
00852 q_used=q ;
00853 }
00854 if (num_used>1)
00855 {
00856 if (depth>=degree-2)
00857 return -1 ;
00858 for (int32_t q=0; q<4; q++)
00859 {
00860 if (TreeMem[start_node].children[q]==NO_CHILD)
00861 continue ;
00862 int32_t num=compact_nodes(abs(TreeMem[start_node].children[q]), depth+1, weights) ;
00863 if (num<=2)
00864 continue ;
00865 int32_t node=get_node() ;
00866
00867 int32_t last_node=TreeMem[start_node].children[q] ;
00868 if (weights_in_tree)
00869 {
00870 ASSERT(weights[depth]!=0.0) ;
00871 TreeMem[node].weight=TreeMem[last_node].weight/weights[depth] ;
00872 }
00873 else
00874 TreeMem[node].weight=TreeMem[last_node].weight ;
00875
00876 #ifdef TRIE_CHECK_EVERYTHING
00877 TreeMem[node].has_seq=true ;
00878 #endif
00879 memset(TreeMem[node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
00880 for (int32_t n=0; n<num; n++)
00881 {
00882 ASSERT(depth+n+1<=degree-1) ;
00883 ASSERT(last_node!=NO_CHILD) ;
00884 if (depth+n+1==degree-1)
00885 {
00886 TRIE_ASSERT_EVERYTHING(TreeMem[last_node].has_floats) ;
00887 int32_t k ;
00888 for (k=0; k<4; k++)
00889 if (TreeMem[last_node].child_weights[k]!=0.0)
00890 break ;
00891 if (k==4)
00892 break ;
00893 TreeMem[node].seq[n]=k ;
00894 break ;
00895 }
00896 else
00897 {
00898 TRIE_ASSERT_EVERYTHING(!TreeMem[last_node].has_floats) ;
00899 int32_t k ;
00900 for (k=0; k<4; k++)
00901 if (TreeMem[last_node].children[k]!=NO_CHILD)
00902 break ;
00903 if (k==4)
00904 break ;
00905 TreeMem[node].seq[n]=k ;
00906 last_node=TreeMem[last_node].children[k] ;
00907 }
00908 }
00909 TreeMem[start_node].children[q]=-node ;
00910 }
00911 return -1 ;
00912 }
00913 if (num_used==0)
00914 return 0 ;
00915
00916 ret=compact_nodes(abs(TreeMem[start_node].children[q_used]), depth+1, weights) ;
00917 if (ret<0)
00918 return ret ;
00919 return ret+1 ;
00920 }
00921
00922
00923 template <class Trie>
00924 bool CTrie<Trie>::compare_traverse(
00925 int32_t node, const CTrie<Trie> & other, int32_t other_node)
00926 {
00927 SG_DEBUG("checking nodes %i and %i\n", node, other_node) ;
00928 if (fabs(TreeMem[node].weight-other.TreeMem[other_node].weight)>=1e-5)
00929 {
00930 SG_DEBUG( "CTrie::compare: TreeMem[%i].weight=%f!=other.TreeMem[%i].weight=%f\n", node, TreeMem[node].weight, other_node,other.TreeMem[other_node].weight) ;
00931 SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
00932 display_node(node) ;
00933 SG_DEBUG( "============================================================\n") ;
00934 other.display_node(other_node) ;
00935 SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
00936 return false ;
00937 }
00938
00939 #ifdef TRIE_CHECK_EVERYTHING
00940 if (TreeMem[node].has_seq!=other.TreeMem[other_node].has_seq)
00941 {
00942 SG_DEBUG( "CTrie::compare: TreeMem[%i].has_seq=%i!=other.TreeMem[%i].has_seq=%i\n", node, TreeMem[node].has_seq, other_node,other.TreeMem[other_node].has_seq) ;
00943 SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
00944 display_node(node) ;
00945 SG_DEBUG( "============================================================\n") ;
00946 other.display_node(other_node) ;
00947 SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
00948 return false ;
00949 }
00950 if (TreeMem[node].has_floats!=other.TreeMem[other_node].has_floats)
00951 {
00952 SG_DEBUG( "CTrie::compare: TreeMem[%i].has_floats=%i!=other.TreeMem[%i].has_floats=%i\n", node, TreeMem[node].has_floats, other_node, other.TreeMem[other_node].has_floats) ;
00953 return false ;
00954 }
00955 if (other.TreeMem[other_node].has_floats)
00956 {
00957 for (int32_t q=0; q<4; q++)
00958 if (fabs(TreeMem[node].child_weights[q]-other.TreeMem[other_node].child_weights[q])>1e-5)
00959 {
00960 SG_DEBUG( "CTrie::compare: TreeMem[%i].child_weights[%i]=%e!=other.TreeMem[%i].child_weights[%i]=%e\n", node, q,TreeMem[node].child_weights[q], other_node,q,other.TreeMem[other_node].child_weights[q]) ;
00961 SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
00962 display_node(node) ;
00963 SG_DEBUG( "============================================================\n") ;
00964 other.display_node(other_node) ;
00965 SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
00966 return false ;
00967 }
00968 }
00969 if (other.TreeMem[other_node].has_seq)
00970 {
00971 for (int32_t q=0; q<16; q++)
00972 if ((TreeMem[node].seq[q]!=other.TreeMem[other_node].seq[q]) && ((TreeMem[node].seq[q]<4)||(other.TreeMem[other_node].seq[q]<4)))
00973 {
00974 SG_DEBUG( "CTrie::compare: TreeMem[%i].seq[%i]=%i!=other.TreeMem[%i].seq[%i]=%i\n", node,q,TreeMem[node].seq[q], other_node,q,other.TreeMem[other_node].seq[q]) ;
00975 SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
00976 display_node(node) ;
00977 SG_DEBUG( "============================================================\n") ;
00978 other.display_node(other_node) ;
00979 SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
00980 return false ;
00981 }
00982 }
00983 if (!other.TreeMem[other_node].has_seq && !other.TreeMem[other_node].has_floats)
00984 {
00985 for (int32_t q=0; q<4; q++)
00986 {
00987 if ((TreeMem[node].children[q]==NO_CHILD) && (other.TreeMem[other_node].children[q]==NO_CHILD))
00988 continue ;
00989 if ((TreeMem[node].children[q]==NO_CHILD)!=(other.TreeMem[other_node].children[q]==NO_CHILD))
00990 {
00991 SG_DEBUG( "CTrie::compare: TreeMem[%i].children[%i]=%i!=other.TreeMem[%i].children[%i]=%i\n", node,q,TreeMem[node].children[q], other_node,q,other.TreeMem[other_node].children[q]) ;
00992 SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
00993 display_node(node) ;
00994 SG_DEBUG( "============================================================\n") ;
00995 other.display_node(other_node) ;
00996 SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
00997 return false ;
00998 }
00999 if (!compare_traverse(abs(TreeMem[node].children[q]), other, abs(other.TreeMem[other_node].children[q])))
01000 return false ;
01001 }
01002 }
01003 #else
01004 SG_ERROR( "not implemented\n") ;
01005 #endif
01006
01007 return true ;
01008 }
01009
01010 template <class Trie>
01011 bool CTrie<Trie>::compare(const CTrie<Trie> & other)
01012 {
01013 bool ret=true ;
01014 for (int32_t i=0; i<length; i++)
01015 if (!compare_traverse(trees[i], other, other.trees[i]))
01016 return false ;
01017 else
01018 SG_DEBUG("two tries at %i identical\n", i) ;
01019
01020 return ret ;
01021 }
01022
01023 template <class Trie>
01024 bool CTrie<Trie>::find_node(
01025 int32_t node, int32_t * trace, int32_t& trace_len) const
01026 {
01027 #ifdef TRIE_CHECK_EVERYTHING
01028 ASSERT(trace_len-1>=0) ;
01029 ASSERT((trace[trace_len-1]>=0) && (trace[trace_len-1]<TreeMemPtrMax))
01030 if (TreeMem[trace[trace_len-1]].has_seq)
01031 return false ;
01032 if (TreeMem[trace[trace_len-1]].has_floats)
01033 return false ;
01034
01035 for (int32_t q=0; q<4; q++)
01036 {
01037 if (TreeMem[trace[trace_len-1]].children[q]==NO_CHILD)
01038 continue ;
01039 int32_t tl=trace_len+1 ;
01040 if (TreeMem[trace[trace_len-1]].children[q]>=0)
01041 trace[trace_len]=TreeMem[trace[trace_len-1]].children[q] ;
01042 else
01043 trace[trace_len]=-TreeMem[trace[trace_len-1]].children[q] ;
01044
01045 if (trace[trace_len]==node)
01046 {
01047 trace_len=tl ;
01048 return true ;
01049 }
01050 if (find_node(node, trace, tl))
01051 {
01052 trace_len=tl ;
01053 return true ;
01054 }
01055 }
01056 trace_len=0 ;
01057 return false ;
01058 #else
01059 SG_ERROR( "not implemented\n") ;
01060 return false ;
01061 #endif
01062 }
01063
01064 template <class Trie>
01065 void CTrie<Trie>::display_node(int32_t node) const
01066 {
01067 #ifdef TRIE_CHECK_EVERYTHING
01068 int32_t * trace=new int32_t[2*degree] ;
01069 int32_t trace_len=-1 ;
01070 bool found = false ;
01071 int32_t tree=-1 ;
01072 for (tree=0; tree<length; tree++)
01073 {
01074 trace[0]=trees[tree] ;
01075 trace_len=1 ;
01076 found=find_node(node, trace, trace_len) ;
01077 if (found)
01078 break ;
01079 }
01080 ASSERT(found) ;
01081 SG_PRINT( "position %i trace: ", tree) ;
01082
01083 for (int32_t i=0; i<trace_len-1; i++)
01084 {
01085 int32_t branch=-1 ;
01086 for (int32_t q=0; q<4; q++)
01087 if (abs(TreeMem[trace[i]].children[q])==trace[i+1])
01088 {
01089 branch=q;
01090 break ;
01091 }
01092 ASSERT(branch!=-1) ;
01093 char acgt[5]="ACGT" ;
01094 SG_PRINT( "%c", acgt[branch]) ;
01095 }
01096 SG_PRINT( "\nnode=%i\nweight=%f\nhas_seq=%i\nhas_floats=%i\n", node, TreeMem[node].weight, TreeMem[node].has_seq, TreeMem[node].has_floats) ;
01097 if (TreeMem[node].has_floats)
01098 {
01099 for (int32_t q=0; q<4; q++)
01100 SG_PRINT( "child_weighs[%i] = %f\n", q, TreeMem[node].child_weights[q]) ;
01101 }
01102 if (TreeMem[node].has_seq)
01103 {
01104 for (int32_t q=0; q<16; q++)
01105 SG_PRINT( "seq[%i] = %i\n", q, TreeMem[node].seq[q]) ;
01106 }
01107 if (!TreeMem[node].has_seq && !TreeMem[node].has_floats)
01108 {
01109 for (int32_t q=0; q<4; q++)
01110 {
01111 if (TreeMem[node].children[q]!=NO_CHILD)
01112 {
01113 SG_PRINT( "children[%i] = %i -> \n", q, TreeMem[node].children[q]) ;
01114 display_node(abs(TreeMem[node].children[q])) ;
01115 }
01116 else
01117 SG_PRINT( "children[%i] = NO_CHILD -| \n", q, TreeMem[node].children[q]) ;
01118 }
01119
01120 }
01121
01122 delete[] trace ;
01123 #else
01124 SG_ERROR( "not implemented\n") ;
01125 #endif
01126 }
01127
01128
01129 template <class Trie> CTrie<Trie>::~CTrie()
01130 {
01131 destroy() ;
01132
01133 free(TreeMem) ;
01134 }
01135
01136 template <class Trie> void CTrie<Trie>::destroy()
01137 {
01138 if (trees!=NULL)
01139 {
01140 delete_trees();
01141 for (int32_t i=0; i<length; i++)
01142 trees[i] = NO_CHILD;
01143 delete[] trees;
01144
01145 TreeMemPtr=0;
01146 length=0;
01147 trees=NULL;
01148 }
01149 }
01150
01151 template <class Trie> void CTrie<Trie>::set_degree(int32_t d)
01152 {
01153 delete_trees(get_use_compact_terminal_nodes());
01154 degree=d;
01155 }
01156
01157 template <class Trie> void CTrie<Trie>::create(
01158 int32_t len, bool p_use_compact_terminal_nodes)
01159 {
01160 destroy();
01161
01162 trees=new int32_t[len] ;
01163 TreeMemPtr=0 ;
01164 for (int32_t i=0; i<len; i++)
01165 trees[i]=get_node(degree==1);
01166 length = len ;
01167
01168 use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
01169 }
01170
01171
01172 template <class Trie> void CTrie<Trie>::delete_trees(
01173 bool p_use_compact_terminal_nodes)
01174 {
01175 if (trees==NULL)
01176 return;
01177
01178 TreeMemPtr=0 ;
01179 for (int32_t i=0; i<length; i++)
01180 trees[i]=get_node(degree==1);
01181
01182 use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
01183 }
01184
01185 template <class Trie>
01186 float64_t CTrie<Trie>::compute_abs_weights_tree(int32_t tree, int32_t depth)
01187 {
01188 float64_t ret=0 ;
01189
01190 if (tree==NO_CHILD)
01191 return 0 ;
01192 TRIE_ASSERT(tree>=0) ;
01193
01194 if (depth==degree-2)
01195 {
01196 ret+=(TreeMem[tree].weight) ;
01197
01198 for (int32_t k=0; k<4; k++)
01199 ret+=(TreeMem[tree].child_weights[k]) ;
01200
01201 return ret ;
01202 }
01203
01204 ret+=(TreeMem[tree].weight) ;
01205
01206 for (int32_t i=0; i<4; i++)
01207 if (TreeMem[tree].children[i]!=NO_CHILD)
01208 ret += compute_abs_weights_tree(TreeMem[tree].children[i], depth+1) ;
01209
01210 return ret ;
01211 }
01212
01213
01214 template <class Trie>
01215 float64_t *CTrie<Trie>::compute_abs_weights(int32_t &len)
01216 {
01217 float64_t * sum=new float64_t[length*4] ;
01218 for (int32_t i=0; i<length*4; i++)
01219 sum[i]=0 ;
01220 len=length ;
01221
01222 for (int32_t i=0; i<length; i++)
01223 {
01224 TRIE_ASSERT(trees[i]!=NO_CHILD) ;
01225 for (int32_t k=0; k<4; k++)
01226 {
01227 sum[i*4+k]=compute_abs_weights_tree(TreeMem[trees[i]].children[k], 0) ;
01228 }
01229 }
01230
01231 return sum ;
01232 }
01233
01234 template <class Trie>
01235 void CTrie<Trie>::add_example_to_tree_mismatch_recursion(
01236 int32_t tree, int32_t i, float64_t alpha,
01237 int32_t *vec, int32_t len_rem,
01238 int32_t degree_rec, int32_t mismatch_rec,
01239 int32_t max_mismatch, float64_t * weights)
01240 {
01241 if (tree==NO_CHILD)
01242 tree=trees[i] ;
01243 TRIE_ASSERT(tree!=NO_CHILD) ;
01244
01245 if ((len_rem<=0) || (mismatch_rec>max_mismatch) || (degree_rec>degree))
01246 return ;
01247 const int32_t other[4][3] = { {1,2,3},{0,2,3},{0,1,3},{0,1,2} } ;
01248
01249 int32_t subtree = NO_CHILD ;
01250
01251 if (degree_rec==degree-1)
01252 {
01253 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats) ;
01254 if (weights_in_tree)
01255 TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec];
01256 else
01257 if (weights[degree_rec]!=0.0)
01258 TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
01259 if (mismatch_rec+1<=max_mismatch)
01260 for (int32_t o=0; o<3; o++)
01261 {
01262 if (weights_in_tree)
01263 TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
01264 else
01265 if (weights[degree_rec]!=0.0)
01266 TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
01267 }
01268 return ;
01269 }
01270 else
01271 {
01272 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01273 if (TreeMem[tree].children[vec[0]]!=NO_CHILD)
01274 {
01275 subtree=TreeMem[tree].children[vec[0]] ;
01276 if (weights_in_tree)
01277 TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec];
01278 else
01279 if (weights[degree_rec]!=0.0)
01280 TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
01281 }
01282 else
01283 {
01284 int32_t tmp = get_node(degree_rec==degree-2);
01285 ASSERT(tmp>=0) ;
01286 TreeMem[tree].children[vec[0]]=tmp ;
01287 subtree=tmp ;
01288 #ifdef TRIE_CHECK_EVERYTHING
01289 if (degree_rec==degree-2)
01290 TreeMem[subtree].has_floats=true ;
01291 #endif
01292 if (weights_in_tree)
01293 TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec] ;
01294 else
01295 if (weights[degree_rec]!=0.0)
01296 TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec] ;
01297 else
01298 TreeMem[subtree].weight = 0.0 ;
01299 }
01300 add_example_to_tree_mismatch_recursion(subtree, i, alpha,
01301 &vec[1], len_rem-1,
01302 degree_rec+1, mismatch_rec, max_mismatch, weights) ;
01303
01304 if (mismatch_rec+1<=max_mismatch)
01305 {
01306 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01307 for (int32_t o=0; o<3; o++)
01308 {
01309 int32_t ot = other[vec[0]][o] ;
01310 if (TreeMem[tree].children[ot]!=NO_CHILD)
01311 {
01312 subtree=TreeMem[tree].children[ot] ;
01313 if (weights_in_tree)
01314 TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
01315 else
01316 if (weights[degree_rec]!=0.0)
01317 TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
01318 }
01319 else
01320 {
01321 int32_t tmp = get_node(degree_rec==degree-2);
01322 ASSERT(tmp>=0) ;
01323 TreeMem[tree].children[ot]=tmp ;
01324 subtree=tmp ;
01325 #ifdef TRIE_CHECK_EVERYTHING
01326 if (degree_rec==degree-2)
01327 TreeMem[subtree].has_floats=true ;
01328 #endif
01329
01330 if (weights_in_tree)
01331 TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)] ;
01332 else
01333 if (weights[degree_rec]!=0.0)
01334 TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec] ;
01335 else
01336 TreeMem[subtree].weight = 0.0 ;
01337 }
01338
01339 add_example_to_tree_mismatch_recursion(subtree, i, alpha,
01340 &vec[1], len_rem-1,
01341 degree_rec+1, mismatch_rec+1, max_mismatch, weights) ;
01342 }
01343 }
01344 }
01345 }
01346
01347 template <class Trie>
01348 void CTrie<Trie>::compute_scoring_helper(
01349 int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
01350 int32_t max_degree, int32_t num_feat, int32_t num_sym, int32_t sym_offset,
01351 int32_t offs, float64_t* result)
01352 {
01353 if (i+j<num_feat)
01354 {
01355 float64_t decay=1.0;
01356
01357
01358
01359 if (j<degree-1)
01360 {
01361 for (int32_t k=0; k<num_sym; k++)
01362 {
01363 if (TreeMem[tree].children[k]!=NO_CHILD)
01364 {
01365 int32_t child=TreeMem[tree].children[k];
01366
01367 if (d<max_degree-1)
01368 compute_scoring_helper(child, i, j+1, weight+decay*TreeMem[child].weight, d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
01369 else
01370 result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[child].weight;
01371
01373 if (d==0)
01374 compute_scoring_helper(child, i, j+1, 0.0, 0, max_degree, num_feat, num_sym, sym_offset, offs, result);
01375 }
01376 }
01377 }
01378 else if (j==degree-1)
01379 {
01380 for (int32_t k=0; k<num_sym; k++)
01381 {
01382
01383 if (d<max_degree-1 && i<num_feat-1)
01384 compute_scoring_helper(trees[i+1], i+1, 0, weight+decay*TreeMem[tree].child_weights[k], d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
01385 else
01386 result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[tree].child_weights[k];
01387 }
01388 }
01389 }
01390 }
01391
01392 template <class Trie>
01393 void CTrie<Trie>::traverse(
01394 int32_t tree, const int32_t p, struct TreeParseInfo info,
01395 const int32_t depth, int32_t* const x, const int32_t k)
01396 {
01397 const int32_t num_sym = info.num_sym;
01398 const int32_t y0 = info.y0;
01399 const int32_t y1 = (k==0) ? 0 : y0 - ( (depth<k) ? 0 : info.nofsKmers[k-1] * x[depth-k] );
01400
01401
01402
01403
01404
01405 int32_t sym;
01406 ASSERT( depth < degree );
01407
01408 if (depth<degree-1)
01409 {
01410 for( sym=0; sym<num_sym; ++sym ) {
01411 const int32_t childNum = TreeMem[tree].children[ sym ];
01412 if( childNum != NO_CHILD ) {
01413 int32_t child = childNum ;
01414 x[depth] = sym;
01415 info.substrs[depth+1] = y0 + sym;
01416 info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
01417
01418 count( TreeMem[child].weight, depth, info, p, x, k );
01419 traverse( child, p, info, depth+1, x, k );
01420 x[depth] = -1;
01421 }
01422 }
01423 }
01424 else if( depth == degree-1 )
01425 {
01426 for( sym=0; sym<num_sym; ++sym ) {
01427 const float64_t w = TreeMem[tree].child_weights[ sym ];
01428 if( w != 0.0 ) {
01429 x[depth] = sym;
01430 info.substrs[depth+1] = y0 + sym;
01431 info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
01432
01433 count( w, depth, info, p, x, k );
01434 x[depth] = -1;
01435 }
01436 }
01437 }
01438
01439
01440 }
01441
01442 template <class Trie>
01443 void CTrie<Trie>::count(
01444 const float64_t w, const int32_t depth, const struct TreeParseInfo info,
01445 const int32_t p, int32_t* x, const int32_t k)
01446 {
01447 ASSERT( fabs(w) < 1e10 );
01448 ASSERT( x[depth] >= 0 );
01449 ASSERT( x[depth+1] < 0 );
01450 if ( depth < k ) {
01451 return;
01452 }
01453
01454 const int32_t nofKmers = info.nofsKmers[k];
01455 const float64_t margWeight = w * info.margFactors[ depth-k ];
01456 const int32_t m_a = depth - k + 1;
01457 const int32_t m_b = info.num_feat - p;
01458 const int32_t m = ( m_a < m_b ) ? m_a : m_b;
01459
01460 const int32_t offset0 = nofKmers * p;
01461 register int32_t i;
01462 register int32_t offset;
01463 offset = offset0;
01464 for( i = 0; i < m; ++i ) {
01465 const int32_t y = info.substrs[i+k+1];
01466 info.C_k[ y + offset ] += margWeight;
01467 offset += nofKmers;
01468 }
01469 if( depth > k ) {
01470
01471 const int32_t offsR = info.substrs[k+1] + offset0;
01472 info.R_k[offsR] += margWeight;
01473
01474 if( p+depth-k < info.num_feat ) {
01475 const int32_t offsL = info.substrs[depth+1] + nofKmers * (p+depth-k);
01476 info.L_k[offsL] += margWeight;
01477 }
01478 }
01479
01480
01481
01482
01483
01484
01485
01486
01487
01488
01489
01490
01491
01492
01493
01494 }
01495
01496 template <class Trie>
01497 void CTrie<Trie>::add_to_trie(
01498 int32_t i, int32_t seq_offset, int32_t * vec, float32_t alpha,
01499 float64_t *weights, bool degree_times_position_weights)
01500 {
01501 int32_t tree = trees[i] ;
01502
01503
01504 int32_t max_depth = 0 ;
01505 float64_t* weights_column ;
01506 if (degree_times_position_weights)
01507 weights_column = &weights[(i+seq_offset)*degree] ;
01508 else
01509 weights_column = weights ;
01510
01511 if (weights_in_tree)
01512 {
01513 for (int32_t j=0; (j<degree) && (i+j<length); j++)
01514 if (CMath::abs(weights_column[j]*alpha)>0)
01515 max_depth = j+1 ;
01516 }
01517 else
01518
01519 max_depth=degree ;
01520
01521 for (int32_t j=0; (j<max_depth) && (i+j+seq_offset<length); j++)
01522 {
01523 TRIE_ASSERT((vec[i+j+seq_offset]>=0) && (vec[i+j+seq_offset]<4)) ;
01524 if ((j<degree-1) && (TreeMem[tree].children[vec[i+j+seq_offset]]!=NO_CHILD))
01525 {
01526 if (TreeMem[tree].children[vec[i+j+seq_offset]]<0)
01527 {
01528
01529 TRIE_ASSERT(j >= degree-16) ;
01530
01531 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01532 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01533 int32_t node = - TreeMem[tree].children[vec[i+j+seq_offset]] ;
01534
01535 TRIE_ASSERT((node>=0) && (node<=TreeMemPtrMax)) ;
01536 TRIE_ASSERT_EVERYTHING(TreeMem[node].has_seq) ;
01537 TRIE_ASSERT_EVERYTHING(!TreeMem[node].has_floats) ;
01538
01539
01540 int32_t mismatch_pos = -1 ;
01541 {
01542 int32_t k ;
01543 for (k=0; (j+k<max_depth) && (i+j+seq_offset+k<length); k++)
01544 {
01545 TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4)) ;
01546
01547 if ((TreeMem[node].seq[k]>=4) && (TreeMem[node].seq[k]!=TRIE_TERMINAL_CHARACTER))
01548 fprintf(stderr, "+++i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[k]) ;
01549 TRIE_ASSERT((TreeMem[node].seq[k]<4) || (TreeMem[node].seq[k]==TRIE_TERMINAL_CHARACTER)) ;
01550 TRIE_ASSERT(k<16) ;
01551 if (TreeMem[node].seq[k]!=vec[i+j+seq_offset+k])
01552 {
01553 mismatch_pos=k ;
01554 break ;
01555 }
01556 }
01557 }
01558
01559
01560
01561 if (mismatch_pos==-1)
01562
01563 TreeMem[node].weight+=alpha ;
01564 else
01565
01566
01567
01568
01569 {
01570
01571 int32_t last_node=tree ;
01572
01573
01574 int32_t k ;
01575 for (k=0; k<mismatch_pos; k++)
01576 {
01577 TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4)) ;
01578 TRIE_ASSERT(vec[i+j+seq_offset+k]==TreeMem[node].seq[k]) ;
01579
01580 int32_t tmp=get_node();
01581 TreeMem[last_node].children[vec[i+j+seq_offset+k]]=tmp ;
01582 last_node=tmp ;
01583 if (weights_in_tree)
01584 TreeMem[last_node].weight = (TreeMem[node].weight+alpha)*weights_column[j+k] ;
01585 else
01586 TreeMem[last_node].weight = (TreeMem[node].weight+alpha) ;
01587 TRIE_ASSERT(j+k!=degree-1) ;
01588 }
01589 if ((TreeMem[node].seq[mismatch_pos]>=4) && (TreeMem[node].seq[mismatch_pos]!=TRIE_TERMINAL_CHARACTER))
01590 fprintf(stderr, "**i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[mismatch_pos]) ;
01591 ASSERT((TreeMem[node].seq[mismatch_pos]<4) || (TreeMem[node].seq[mismatch_pos]==TRIE_TERMINAL_CHARACTER)) ;
01592 TRIE_ASSERT(vec[i+j+seq_offset+mismatch_pos]!=TreeMem[node].seq[mismatch_pos]) ;
01593
01594 if (j+k==degree-1)
01595 {
01596
01597
01598
01599 for (int32_t q=0; q<4; q++)
01600 TreeMem[last_node].child_weights[q]=0.0 ;
01601 if (weights_in_tree)
01602 {
01603 if (TreeMem[node].seq[mismatch_pos]<4)
01604 TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]+=TreeMem[node].weight*weights_column[degree-1] ;
01605 TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] += alpha*weights_column[degree-1] ;
01606 }
01607 else
01608 {
01609 if (TreeMem[node].seq[mismatch_pos]<4)
01610 TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]=TreeMem[node].weight ;
01611 TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] = alpha ;
01612 }
01613
01614 #ifdef TRIE_CHECK_EVERYTHING
01615 TreeMem[last_node].has_floats=true ;
01616 #endif
01617 }
01618 else
01619 {
01620
01621 if (TreeMem[node].seq[mismatch_pos]<4)
01622 {
01623 TreeMem[last_node].children[TreeMem[node].seq[mismatch_pos]] = -node ;
01624
01625
01626 for (int32_t q=0; q<16; q++)
01627 {
01628 if ((j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length))
01629 TreeMem[node].seq[q] = TreeMem[node].seq[q+mismatch_pos] ;
01630 else
01631 TreeMem[node].seq[q] = TRIE_TERMINAL_CHARACTER ;
01632 }
01633 #ifdef TRIE_CHECK_EVERYTHING
01634 TreeMem[node].has_seq=true ;
01635 #endif
01636 }
01637
01638
01639 TRIE_ASSERT((vec[i+j+seq_offset+mismatch_pos]>=0) && (vec[i+j+seq_offset+mismatch_pos]<4)) ;
01640 int32_t tmp = get_node() ;
01641 TreeMem[last_node].children[vec[i+j+seq_offset+mismatch_pos]] = -tmp ;
01642 last_node=tmp ;
01643
01644 TreeMem[last_node].weight = alpha ;
01645 #ifdef TRIE_CHECK_EVERYTHING
01646 TreeMem[last_node].has_seq = true ;
01647 #endif
01648 memset(TreeMem[last_node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
01649 for (int32_t q=0; (j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length); q++)
01650 TreeMem[last_node].seq[q] = vec[i+j+seq_offset+mismatch_pos+q] ;
01651 }
01652 }
01653 break ;
01654 }
01655 else
01656 {
01657 tree=TreeMem[tree].children[vec[i+j+seq_offset]] ;
01658 TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax)) ;
01659 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01660 if (weights_in_tree)
01661 TreeMem[tree].weight += alpha*weights_column[j];
01662 else
01663 TreeMem[tree].weight += alpha ;
01664 }
01665 }
01666 else if (j==degree-1)
01667 {
01668
01669 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01670 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats) ;
01671 if (weights_in_tree)
01672 TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha*weights_column[j] ;
01673 else
01674 TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha;
01675
01676 break;
01677 }
01678 else
01679 {
01680 bool use_seq = use_compact_terminal_nodes && (j>degree-16) ;
01681 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01682 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01683
01684 int32_t tmp = get_node((j==degree-2) && (!use_seq));
01685 if (use_seq)
01686 TreeMem[tree].children[vec[i+j+seq_offset]] = -tmp ;
01687 else
01688 TreeMem[tree].children[vec[i+j+seq_offset]] = tmp ;
01689 tree=tmp ;
01690
01691 TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax)) ;
01692 #ifdef TRIE_CHECK_EVERYTHING
01693 TreeMem[tree].has_seq = use_seq ;
01694 #endif
01695 if (use_seq)
01696 {
01697 TreeMem[tree].weight = alpha ;
01698
01699 memset(TreeMem[tree].seq, TRIE_TERMINAL_CHARACTER, 16) ;
01700 for (int32_t q=0; (j+q<degree) && (i+j+seq_offset+q<length); q++)
01701 {
01702 TRIE_ASSERT(q<16) ;
01703 TreeMem[tree].seq[q]=vec[i+j+seq_offset+q] ;
01704 }
01705 break ;
01706 }
01707 else
01708 {
01709 if (weights_in_tree)
01710 TreeMem[tree].weight = alpha*weights_column[j] ;
01711 else
01712 TreeMem[tree].weight = alpha ;
01713 #ifdef TRIE_CHECK_EVERYTHING
01714 if (j==degree-2)
01715 TreeMem[tree].has_floats = true ;
01716 #endif
01717 }
01718 }
01719 }
01720 }
01721
01722 template <class Trie>
01723 float64_t CTrie<Trie>::compute_by_tree_helper(
01724 int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
01725 int32_t weight_pos, float64_t* weights,
01726 bool degree_times_position_weights)
01727 {
01728 int32_t tree = trees[tree_pos] ;
01729
01730 if ((position_weights!=NULL) && (position_weights[weight_pos]==0))
01731 return 0.0;
01732
01733 float64_t *weights_column=NULL ;
01734 if (degree_times_position_weights)
01735 weights_column=&weights[weight_pos*degree] ;
01736 else
01737 weights_column=weights ;
01738
01739 float64_t sum=0 ;
01740 for (int32_t j=0; seq_pos+j < len; j++)
01741 {
01742 TRIE_ASSERT((vec[seq_pos+j]<4) && (vec[seq_pos+j]>=0)) ;
01743
01744 if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
01745 {
01746 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01747 if (TreeMem[tree].children[vec[seq_pos+j]]<0)
01748 {
01749 tree = - TreeMem[tree].children[vec[seq_pos+j]];
01750 TRIE_ASSERT(tree>=0) ;
01751 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
01752 float64_t this_weight=0.0 ;
01753 for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
01754 {
01755 TRIE_ASSERT((vec[seq_pos+j+k]<4) && (vec[seq_pos+j+k]>=0)) ;
01756 if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
01757 break ;
01758 this_weight += weights_column[j+k] ;
01759 }
01760 sum += TreeMem[tree].weight * this_weight ;
01761 break ;
01762 }
01763 else
01764 {
01765 tree=TreeMem[tree].children[vec[seq_pos+j]];
01766 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01767 if (weights_in_tree)
01768 sum += TreeMem[tree].weight ;
01769 else
01770 sum += TreeMem[tree].weight * weights_column[j] ;
01771 } ;
01772 }
01773 else
01774 {
01775 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01776 if (j==degree-1)
01777 {
01778 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats) ;
01779 if (weights_in_tree)
01780 sum += TreeMem[tree].child_weights[vec[seq_pos+j]] ;
01781 else
01782 sum += TreeMem[tree].child_weights[vec[seq_pos+j]] * weights_column[j] ;
01783 }
01784 else
01785 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01786
01787 break;
01788 }
01789 }
01790
01791 if (position_weights!=NULL)
01792 return sum*position_weights[weight_pos] ;
01793 else
01794 return sum ;
01795 }
01796
01797 template <class Trie>
01798 void CTrie<Trie>::compute_by_tree_helper(
01799 int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
01800 int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
01801 int32_t mkl_stepsize, float64_t * weights,
01802 bool degree_times_position_weights)
01803 {
01804 int32_t tree = trees[tree_pos] ;
01805 if (factor==0)
01806 return ;
01807
01808 if (position_weights!=NULL)
01809 {
01810 factor *= position_weights[weight_pos] ;
01811 if (factor==0)
01812 return ;
01813 if (!degree_times_position_weights)
01814 {
01815 for (int32_t j=0; seq_pos+j<len; j++)
01816 {
01817 if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
01818 {
01819 if (TreeMem[tree].children[vec[seq_pos+j]]<0)
01820 {
01821 tree = -TreeMem[tree].children[vec[seq_pos+j]];
01822 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
01823 for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
01824 {
01825 if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
01826 break ;
01827 if (weights_in_tree)
01828 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
01829 else
01830 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
01831 }
01832 break ;
01833 }
01834 else
01835 {
01836 tree=TreeMem[tree].children[vec[seq_pos+j]];
01837 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01838 if (weights_in_tree)
01839 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
01840 else
01841 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
01842 }
01843 }
01844 else
01845 {
01846 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01847 if (j==degree-1)
01848 {
01849 if (weights_in_tree)
01850 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
01851 else
01852 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
01853 }
01854 }
01855 }
01856 }
01857 else
01858 {
01859 for (int32_t j=0; seq_pos+j<len; j++)
01860 {
01861 if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
01862 {
01863 if (TreeMem[tree].children[vec[seq_pos+j]]<0)
01864 {
01865 tree = -TreeMem[tree].children[vec[seq_pos+j]];
01866 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
01867 for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
01868 {
01869 if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
01870 break ;
01871 if (weights_in_tree)
01872 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
01873 else
01874 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
01875 }
01876 break ;
01877 }
01878 else
01879 {
01880 tree=TreeMem[tree].children[vec[seq_pos+j]];
01881 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01882 if (weights_in_tree)
01883 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
01884 else
01885 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+weight_pos*degree] ;
01886 }
01887 }
01888 else
01889 {
01890 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01891 if (j==degree-1)
01892 {
01893 if (weights_in_tree)
01894 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
01895 else
01896 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j+weight_pos*degree] ;
01897 }
01898 break ;
01899 }
01900 }
01901 }
01902 }
01903 else if (!degree_times_position_weights)
01904 {
01905 for (int32_t j=0; seq_pos+j<len; j++)
01906 {
01907 if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
01908 {
01909 if (TreeMem[tree].children[vec[seq_pos+j]]<0)
01910 {
01911 tree = -TreeMem[tree].children[vec[seq_pos+j]];
01912 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
01913 for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
01914 {
01915 if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
01916 break ;
01917 if (weights_in_tree)
01918 LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight ;
01919 else
01920 LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
01921 }
01922 break ;
01923 }
01924 else
01925 {
01926 tree=TreeMem[tree].children[vec[seq_pos+j]];
01927 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01928 if (weights_in_tree)
01929 LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight ;
01930 else
01931 LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
01932 }
01933 }
01934 else
01935 {
01936 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01937 if (j==degree-1)
01938 {
01939 if (weights_in_tree)
01940 LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
01941 else
01942 LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
01943 }
01944 break ;
01945 }
01946 }
01947 }
01948 else
01949 {
01950
01951
01952
01953
01954
01955
01956
01957
01958
01959
01960
01961
01962
01963
01964
01965
01966
01967
01968 for (int32_t j=0; seq_pos+j<len; j++)
01969 {
01970 if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
01971 {
01972 if (TreeMem[tree].children[vec[seq_pos+j]]<0)
01973 {
01974 tree = -TreeMem[tree].children[vec[seq_pos+j]];
01975 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
01976 for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
01977 {
01978 if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
01979 break ;
01980 if (weights_in_tree)
01981 LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight ;
01982 else
01983 LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
01984 }
01985 break ;
01986 }
01987 else
01988 {
01989 tree=TreeMem[tree].children[vec[seq_pos+j]];
01990 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01991 if (weights_in_tree)
01992 LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight ;
01993 else
01994 LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight * weights[j+weight_pos*degree] ;
01995 }
01996 }
01997 else
01998 {
01999 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
02000 if (j==degree-1)
02001 {
02002 if (weights_in_tree)
02003 LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] ;
02004 else
02005 LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] * weights[j+weight_pos*degree] ;
02006 }
02007 break ;
02008 }
02009 }
02010 }
02011 }
02012
02013 template <class Trie>
02014 void CTrie<Trie>::fill_backtracking_table_recursion(
02015 Trie* tree, int32_t depth, uint64_t seq, float64_t value,
02016 DynArray<ConsensusEntry>* table, float64_t* weights)
02017 {
02018 float64_t w=1.0;
02019
02020 if (weights_in_tree || depth==0)
02021 value+=tree->weight;
02022 else
02023 {
02024 w=weights[degree-1];
02025 value+=weights[depth-1]*tree->weight;
02026 }
02027
02028 if (degree-1==depth)
02029 {
02030 for (int32_t sym=0; sym<4; sym++)
02031 {
02032 float64_t v=w*tree->child_weights[sym];
02033 if (v!=0.0)
02034 {
02035 ConsensusEntry entry;
02036 entry.bt=-1;
02037 entry.score=value+v;
02038 entry.string=seq | ((uint64_t) sym) << (2*(degree-depth-1));
02039
02040 table->append_element(entry);
02041 }
02042 }
02043 }
02044 else
02045 {
02046 for (int32_t sym=0; sym<4; sym++)
02047 {
02048 uint64_t str=seq | ((uint64_t) sym) << (2*(degree-depth-1));
02049 if (tree->children[sym] != NO_CHILD)
02050 fill_backtracking_table_recursion(&TreeMem[tree->children[sym]], depth+1, str, value, table, weights);
02051 }
02052 }
02053 }
02054
02055 template <class Trie>
02056 float64_t CTrie<Trie>::get_cumulative_score(
02057 int32_t pos, uint64_t seq, int32_t deg, float64_t* weights)
02058 {
02059 float64_t result=0.0;
02060
02061
02062
02063 for (int32_t i=pos; i<pos+deg && i<length; i++)
02064 {
02065
02066 Trie* tree = &TreeMem[trees[i]];
02067
02068 for (int32_t d=0; d<deg-i+pos; d++)
02069 {
02070
02071 ASSERT(d-1<degree);
02072 int32_t sym = (int32_t) (seq >> (2*(deg-1-d-i+pos)) & 3);
02073
02074 float64_t w=1.0;
02075 if (!weights_in_tree)
02076 w=weights[d];
02077
02078 ASSERT(tree->children[sym] != NO_CHILD);
02079 tree=&TreeMem[tree->children[sym]];
02080 result+=w*tree->weight;
02081 }
02082 }
02083
02084 return result;
02085 }
02086
02087 template <class Trie>
02088 void CTrie<Trie>::fill_backtracking_table(
02089 int32_t pos, DynArray<ConsensusEntry>* prev,
02090 DynArray<ConsensusEntry>* cur, bool cumulative, float64_t* weights)
02091 {
02092 ASSERT(pos>=0 && pos<length);
02093 ASSERT(!use_compact_terminal_nodes);
02094
02095 Trie* t = &TreeMem[trees[pos]];
02096
02097 fill_backtracking_table_recursion(t, 0, (uint64_t) 0, 0.0, cur, weights);
02098
02099
02100 if (cumulative)
02101 {
02102 int32_t num_cur=cur->get_num_elements();
02103 for (int32_t i=0; i<num_cur; i++)
02104 {
02105 ConsensusEntry entry=cur->get_element(i);
02106 entry.score+=get_cumulative_score(pos+1, entry.string, degree-1, weights);
02107 cur->set_element(entry,i);
02108
02109 }
02110 }
02111
02112
02113
02114 if (prev)
02115 {
02116 int32_t num_cur=cur->get_num_elements();
02117 int32_t num_prev=prev->get_num_elements();
02118
02119 for (int32_t i=0; i<num_cur; i++)
02120 {
02121
02122 uint64_t str_cur= cur->get_element(i).string >> 2;
02123
02124
02125 int32_t bt=-1;
02126 float64_t max_score=0.0;
02127
02128 for (int32_t j=0; j<num_prev; j++)
02129 {
02130
02131 uint64_t mask=
02132 ((((uint64_t)0)-1) ^ (((uint64_t) 3) << (2*(degree-1))));
02133 uint64_t str_prev= mask & prev->get_element(j).string;
02134
02135
02136 if (str_cur == str_prev)
02137 {
02138 float64_t sc=prev->get_element(j).score+cur->get_element(i).score;
02139 if (bt==-1 || sc>max_score)
02140 {
02141 bt=j;
02142 max_score=sc;
02143
02144
02145 }
02146 }
02147 }
02148
02149 ASSERT(bt!=-1);
02150 ConsensusEntry entry;
02151 entry.bt=bt;
02152 entry.score=max_score;
02153 entry.string=cur->get_element(i).string;
02154 cur->set_element(entry, i);
02155
02156 }
02157 }
02158 }
02159 }
02160 #endif // _TRIE_H___