00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef __CDYNPROG_H__
00014 #define __CDYNPROG_H__
00015
00016 #include "lib/Mathematics.h"
00017 #include "lib/common.h"
00018 #include "base/SGObject.h"
00019 #include "lib/io.h"
00020 #include "lib/config.h"
00021 #include "structure/PlifMatrix.h"
00022 #include "structure/PlifBase.h"
00023 #include "structure/Plif.h"
00024 #include "structure/IntronList.h"
00025 #include "structure/SegmentLoss.h"
00026 #include "features/StringFeatures.h"
00027 #include "features/SparseFeatures.h"
00028 #include "distributions/Distribution.h"
00029 #include "lib/DynamicArray.h"
00030 #include "lib/Array.h"
00031 #include "lib/Array2.h"
00032 #include "lib/Array3.h"
00033 #include "lib/Time.h"
00034
00035 #include <stdio.h>
00036 #include <limits.h>
00037
00038 namespace shogun
00039 {
/* Forward declarations of types that this header only uses through pointers
 * or as member/template types. The matching headers (SparseFeatures.h,
 * IntronList.h, PlifMatrix.h, SegmentLoss.h, Array.h) are also included
 * above, so these declarations appear to be redundant safeguards --
 * NOTE(review): confirm before removing any of them. */
00040 template <class T> class CSparseFeatures;
00041 class CIntronList;
00042 class CPlifMatrix;
00043 class CSegmentLoss;
00044 template <class T> class CArray;
00045
00046
00047
/** Integer type used for state indices (offset/row/column arguments of the
 * CDynProg accessors below): 16 bits wide when USE_BIGSTATES is defined at
 * compile time, 8 bits wide otherwise. */
00048 #ifdef USE_BIGSTATES
00049 typedef uint16_t T_STATES ;
00050 #else
00051 typedef uint8_t T_STATES ;
00052 #endif
/** Pointer to a T_STATES value (or to the first element of an array). */
00053 typedef T_STATES* P_STATES ;
00054
/** Plain record of bookkeeping values related to segment-loss computation.
 *
 * NOTE(review): nothing in this header reads or writes these fields, so the
 * per-field descriptions are inferred from the field names only -- confirm
 * against the implementation. No ownership of the pointed-to buffers is
 * expressed here.
 */
00056 struct segment_loss_struct
00057 {
/** presumably the maximal look-back distance */
00059 int32_t maxlookback;
/** presumably the sequence length */
00061 int32_t seqlen;
/** presumably per-position flags/markers for changed segments */
00063 int32_t *segments_changed;
/** presumably (fractional) counts per segment id */
00065 float64_t *num_segment_id;
/** presumably accumulated lengths per segment id */
00067 int32_t *length_segment_id ;
00068 };
00069
/**
 * Dynamic-programming object derived from CSGObject.
 *
 * The class holds a state model (start distribution p, end distribution q,
 * transition matrix a and their derivative counterparts), per-sequence input
 * data, and result buffers for scores, states and positions.
 *
 * NOTE(review): only the members with an inline body are defined in this
 * header. For every member that is merely declared here, the description is
 * inferred from its name and signature and must be confirmed against the
 * implementation file. Buffer ownership (copy vs. adopt) of pointer
 * arguments cannot be determined from this header.
 */
00075 class CDynProg : public CSGObject
00076 {
00077 public:
/** Constructor.
 * @param p_num_svms defaults to 8; presumably the number of content SVMs */
00082 CDynProg(int32_t p_num_svms=8);
/** Virtual destructor (defined outside this header). */
00083 virtual ~CDynProg();
00084
00085
/** Presumably sets the number of model states.
 * @param N number of states */
00091 void set_num_states(int32_t N);
00092
/** @return presumably the number of model states */
00094 int32_t get_num_states();
00095
/** @return presumably the number of SVMs (cf. m_num_svms) */
00097 int32_t get_num_svms();
00098
/** Presumably (re)initialises the content-SVM value storage.
 * @param p_num_svms number of SVMs */
00104 void init_content_svm_value_array(const int32_t p_num_svms);
00105
/** Presumably registers tiling-array probe data.
 * @param probe_pos probe positions
 * @param intensities probe intensities
 * @param num_probes number of entries in both arrays (TODO confirm) */
00113 void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes);
00114
/** Presumably precomputes values for the tiling PLIFs.
 * @param PEN array of PLIF pointers
 * @param tiling_plif_ids ids of the PLIFs to use
 * @param num_tiling_plifs number of ids */
00121 void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs);
00122
/** Presumably resizes the linear-feature matrix (cf. m_lin_feat).
 * @param num_new_feat number of features to add (TODO confirm) */
00127 void resize_lin_feat(int32_t num_new_feat);
/** Presumably sets the start-state distribution p.
 * @param p values
 * @param N number of values */
00133 void set_p_vector(float64_t* p, int32_t N);
00134
/** Presumably sets the end-state distribution q.
 * @param q values
 * @param N number of values */
00140 void set_q_vector(float64_t* q, int32_t N);
00141
/** Presumably sets the whole transition matrix a from a raw buffer
 * (overloaded with the element-wise set_a below).
 * @param a matrix values
 * @param M first dimension
 * @param N second dimension */
00148 void set_a(float64_t* a, int32_t M, int32_t N);
00149
/** Presumably sets the transition-id matrix (cf. m_transition_matrix_a_id).
 * @param a id values
 * @param M first dimension
 * @param N second dimension */
00156 void set_a_id(int32_t *a, int32_t M, int32_t N);
00157
/** Presumably sets transitions from a list-style matrix.
 * @param a_trans transition entries
 * @param num_trans number of transitions
 * @param N second dimension of a_trans (TODO confirm) */
00164 void set_a_trans_matrix(float64_t *a_trans, int32_t num_trans, int32_t N);
00165
/** Presumably initialises the mod-words array (cf. m_mod_words).
 * @param p_mod_words_array source values
 * @param num_elem number of rows (TODO confirm)
 * @param num_columns number of columns */
00172 void init_mod_words_array(int32_t * p_mod_words_array, int32_t num_elem, int32_t num_columns);
00173
/** Presumably verifies that the SVM-related arrays are consistent.
 * @return result of the check */
00179 bool check_svm_arrays();
00180
/** Presumably sets the observation matrix (cf. m_observation_matrix).
 * @param seq matrix values
 * @param dims extents per dimension
 * @param ndims number of dimensions */
00187 void set_observation_matrix(float64_t* seq, int32_t* dims, int32_t ndims);
00188
/** @return presumably the number of candidate positions */
00195 int32_t get_num_positions();
00196
/** Presumably sets the content-type (segment path) array.
 * @param seg_path values
 * @param rows number of rows
 * @param cols number of columns */
00208 void set_content_type_array(float64_t* seg_path, int32_t rows, int32_t cols);
00209
/** Presumably sets the candidate positions (cf. m_pos).
 * @param pos positions
 * @param seq_len number of positions */
00215 void set_pos(int32_t* pos, int32_t seq_len);
00216
/** Presumably sets open-reading-frame information (cf. m_orf_info).
 * @param orf_info values
 * @param m first dimension
 * @param n second dimension */
00224 void set_orf_info(int32_t* orf_info, int32_t m, int32_t n);
00225
/** Presumably sets the gene string (cf. m_genestr).
 * @param genestr characters
 * @param genestr_len number of characters */
00231 void set_gene_string(char* genestr, int32_t genestr_len);
00232
00233
/** Presumably sets the dictionary weights (cf. m_dict_weights).
 * @param dictionary_weights values
 * @param dict_len first dimension
 * @param n second dimension */
00240 void set_dict_weights(float64_t* dictionary_weights, int32_t dict_len, int32_t n);
00241
/** Presumably sets the segment-loss matrix.
 * @param segment_loss values
 * @param num_segment_id1 first dimension
 * @param num_segment_id2 second dimension */
00248 void best_path_set_segment_loss(float64_t * segment_loss, int32_t num_segment_id1, int32_t num_segment_id2);
00249
/** Presumably sets segment ids and the matching mask.
 * @param segment_ids ids
 * @param segment_mask mask values
 * @param m number of entries in both arrays (TODO confirm) */
00256 void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m);
00257
/** Presumably stores the two sparse feature objects
 * (cf. m_seq_sparse1, m_seq_sparse2). */
00259 void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2);
00260
/** Presumably stores the PLIF matrix object (cf. m_plif_matrices).
 * @param pm PLIF matrices */
00265 void set_plif_matrices(CPlifMatrix* pm);
00266
00267
/** Presumably returns the result scores through out-parameters.
 * @param scores receives the score buffer
 * @param n receives its length */
00273 void get_scores(float64_t **scores, int32_t *n);
00274
/** Presumably returns the result states through out-parameters.
 * @param states receives the state buffer
 * @param m receives the first dimension
 * @param n receives the second dimension */
00281 void get_states(int32_t **states, int32_t *m, int32_t *n);
00282
/** Presumably returns the result positions through out-parameters.
 * @param positions receives the position buffer
 * @param m receives the first dimension
 * @param n receives the second dimension */
00289 void get_positions(int32_t **positions, int32_t *m, int32_t *n);
00290
00291
/** Presumably runs the n-best path decoding.
 * @param max_num_signals maximal number of signals
 * @param use_orf whether ORF information is used
 * @param nbest number of best paths to compute
 * @param with_loss whether the segment loss is used
 * @param with_multiple_sequences whether multiple sequences are used */
00300 void compute_nbest_paths(int32_t max_num_signals,
00301 bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences);
00302
00304
/** Presumably computes derivatives along a given path.
 * @param my_state_seq state sequence
 * @param my_pos_seq position sequence
 * @param my_seq_len length of both sequences (TODO confirm)
 * @param seq_array observation values
 * @param max_num_signals maximal number of signals */
00316 void best_path_trans_deriv(
00317 int32_t* my_state_seq, int32_t *my_pos_seq,
00318 int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals);
00319
00320
/** Presumably stores a state sequence (cf. m_my_state_seq).
 * NOTE(review): no length parameter is passed. */
00325 void set_my_state_seq(int32_t* my_state_seq);
00326
/** Presumably stores a position sequence (cf. m_my_pos_seq).
 * NOTE(review): no length parameter is passed. */
00331 void set_my_pos_seq(int32_t* my_pos_seq);
00332
/** Presumably returns the path scores through out-parameters.
 * @param my_scores receives the score buffer
 * @param seq_len receives its length */
00340 void get_path_scores(float64_t** my_scores, int32_t* seq_len);
00341
/** Presumably returns the path losses through out-parameters.
 * @param my_losses receives the loss buffer
 * @param seq_len receives its length */
00349 void get_path_losses(float64_t** my_losses, int32_t* seq_len);
00350
00351
/** Returns m_N converted to T_STATES.
 * NOTE(review): m_N is an int32_t while T_STATES is an 8- or 16-bit
 * unsigned type, so larger values are narrowed by this conversion. */
00353 inline T_STATES get_N() const
00354 {
00355 return m_N ;
00356 }
00357
/** Writes one entry of the end-state distribution q.
 * @param offset index of the entry
 * @param value new value */
00362 inline void set_q(T_STATES offset, float64_t value)
00363 {
00364 m_end_state_distribution_q[offset]=value;
00365 }
00366
/** Writes one entry of the start-state distribution p.
 * @param offset index of the entry
 * @param value new value */
00371 inline void set_p(T_STATES offset, float64_t value)
00372 {
00373 m_initial_state_distribution_p[offset]=value;
00374 }
00375
/** Writes one element of the transition matrix a.
 * @param line_ first index passed to element()
 * @param column second index passed to element()
 * @param value new value */
00382 inline void set_a(T_STATES line_, T_STATES column, float64_t value)
00383 {
00384 m_transition_matrix_a.element(line_,column)=value;
00385 }
00386
/** Reads one entry of the end-state distribution q.
 * @param offset index of the entry
 * @return stored value */
00392 inline float64_t get_q(T_STATES offset) const
00393 {
00394 return m_end_state_distribution_q[offset];
00395 }
00396
/** Reads one entry of the derivative array belonging to q.
 * @param offset index of the entry
 * @return stored value */
00402 inline float64_t get_q_deriv(T_STATES offset) const
00403 {
00404 return m_end_state_distribution_q_deriv[offset];
00405 }
00406
/** Reads one entry of the start-state distribution p.
 * @param offset index of the entry
 * @return stored value */
00412 inline float64_t get_p(T_STATES offset) const
00413 {
00414 return m_initial_state_distribution_p[offset];
00415 }
00416
/** Reads one entry of the derivative array belonging to p.
 * @param offset index of the entry
 * @return stored value */
00422 inline float64_t get_p_deriv(T_STATES offset) const
00423 {
00424 return m_initial_state_distribution_p_deriv[offset];
00425 }
00426
/** Presumably precomputes content (SVM) values for the current input. */
00430 void precompute_content_values();
00431
/** Exposes the linear-feature matrix.
 * @param dim1 receives the first extent reported by get_array_size()
 * @param dim2 receives the second extent reported by get_array_size()
 * @return the pointer returned by m_lin_feat.get_array() */
00438 inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2)
00439 {
00440 m_lin_feat.get_array_size(dim1, dim2);
00441 return m_lin_feat.get_array();
00442 }
/** Replaces the linear-feature matrix via m_lin_feat.set_array().
 * @param p_lin_feat source values
 * @param p_num_svms first extent
 * @param p_seq_len second extent
 * NOTE(review): the two trailing `true` arguments are flags of
 * CArray2::set_array whose meaning is not visible here -- confirm. */
00451 inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len)
00452 {
00453 m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true);
00454 }
/** Presumably builds the word representation of the gene string
 * (cf. m_wordstr). */
00459 void create_word_string();
00460
/** Presumably precomputes stop-codon flags (cf. m_genestr_stop). */
00463 void precompute_stop_codons();
00464
/** Reads one element of the transition matrix a.
 * @param line_ first index passed to element()
 * @param column second index passed to element()
 * @return stored value */
00471 inline float64_t get_a(T_STATES line_, T_STATES column) const
00472 {
00473 return m_transition_matrix_a.element(line_, column);
00474 }
00475
/** Reads one element of the derivative matrix belonging to a.
 * @param line_ first index passed to element()
 * @param column second index passed to element()
 * @return stored value */
00482 inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
00483 {
00484 return m_transition_matrix_a_deriv.element(line_, column);
00485 }
00487
/** Presumably stores the intron list (cf. m_intron_list,
 * m_num_intron_plifs).
 * @param intron_list intron list object
 * @param num_plifs number of intron PLIFs */
00492 void set_intron_list(CIntronList* intron_list, int32_t num_plifs);
00493
/** @return the stored m_seg_loss_obj pointer; this accessor itself does no
 * reference counting */
00495 CSegmentLoss* get_segment_loss_object()
00496 {
00497 return m_seg_loss_obj;
00498 }
00499
/** Stores the long-transition settings.
 * @param use_long_transitions copied to m_long_transitions
 * @param threshold copied to m_long_transition_threshold
 * @param max_len not stored; only a debug message saying it is ignored is
 *        emitted */
00506 void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len)
00507 {
00508 m_long_transitions = use_long_transitions;
00509 m_long_transition_threshold = threshold;
00510 SG_DEBUG("ignoring max_len\n") ;
00511
00512 }
00513
00514 protected:
00515
00516
00517
/** Presumably looks up content-SVM values for a segment.
 * @param from_state start state
 * @param to_state end state
 * @param from_pos start position
 * @param to_pos end position
 * @param svm_values output buffer (TODO confirm)
 * @param frame reading frame */
00527 void lookup_content_svm_values(const int32_t from_state,
00528 const int32_t to_state, const int32_t from_pos, const int32_t to_pos,
00529 float64_t* svm_values, int32_t frame);
00530
/** Presumably looks up tiling-PLIF values for a segment.
 * Declared inline, but no body is given in this header.
 * @param from_state start state
 * @param to_state end state
 * @param len segment length
 * @param svm_values output buffer (TODO confirm) */
00538 inline void lookup_tiling_plif_values(const int32_t from_state,
00539 const int32_t to_state, const int32_t len, float64_t* svm_values);
00540
/** Presumably determines the reading frame of a state.
 * Declared inline, but no body is given in this header.
 * @param from_state state to inspect
 * @return frame index (TODO confirm) */
00545 inline int32_t find_frame(const int32_t from_state);
00546
/** Presumably collects raw intensities within a position interval.
 * Declared inline, but no body is given in this header.
 * @param from_pos interval start
 * @param to_pos interval end
 * @param intensities output buffer (TODO confirm)
 * @param type data type selector (TODO confirm)
 * @return presumably the number of values found */
00555 inline int32_t raw_intensities_interval_query(
00556 const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type);
00557
/* The following helper struct is hidden from the generated documentation. */
00558 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00559
/** Bookkeeping record for SVM value computation.
 * NOTE(review): field meanings are inferred from their names only. */
00560 struct svm_values_struct
00561 {
/** presumably the maximal look-back distance */
00563 int32_t maxlookback;
/** presumably the sequence length */
00565 int32_t seqlen;
00566
/** presumably start positions */
00568 int32_t* start_pos;
/** presumably unnormalised SVM values (two-level array) */
00570 float64_t ** svm_values_unnormalized;
/** presumably SVM values */
00572 float64_t * svm_values;
/** presumably flags marking used words (three-level array) */
00574 bool *** word_used;
/** presumably counts of unique words (two-level array) */
00576 int32_t **num_unique_words;
00577 };
00578 #endif // DOXYGEN_SHOULD_SKIP_THIS
00579
/** Presumably tries to extend an open reading frame.
 * @param orf_from ORF value at the start
 * @param orf_to ORF value at the end
 * @param start start position
 * @param last_pos in/out position, passed by reference
 * @param to end position
 * @return presumably whether the extension is valid */
00588 bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to);
00589
/** @return the object name "DynProg".
 * NOTE(review): this override sits in a protected section -- confirm that
 * this access level is intended. */
00591 inline virtual const char* get_name() const { return "DynProg"; }
00592
00593 private:
00594
/* Forward transition lists.
 * NOTE(review): descriptions are inferred from the member names only. */
/** presumably the number of entries in the lists below */
00595 T_STATES trans_list_len;
/** presumably per-state lists of reachable states */
00596 T_STATES **trans_list_forward;
/** presumably the length of each per-state list */
00597 T_STATES *trans_list_forward_cnt;
/** presumably the transition values matching trans_list_forward */
00598 float64_t **trans_list_forward_val;
/** presumably the transition ids matching trans_list_forward */
00599 int32_t **trans_list_forward_id;
/** presumably true once the lists above have been allocated */
00600 bool mem_initialized;
00601
/* Timers and accumulated timings, only compiled in when DYNPROG_TIMING is
 * defined. */
00602 #ifdef DYNPROG_TIMING
00603 CTime MyTime;
00604 CTime MyTime2;
00605 CTime MyTime3;
00606
00607 float64_t segment_init_time;
00608 float64_t segment_pos_time;
00609 float64_t segment_clean_time;
00610 float64_t segment_extend_time;
00611 float64_t orf_time;
00612 float64_t content_time;
00613 float64_t content_penalty_time;
00614 float64_t content_svm_values_time ;
00615 float64_t content_plifs_time ;
00616 float64_t svm_init_time;
00617 float64_t svm_pos_time;
00618 float64_t inner_loop_time;
00619 float64_t inner_loop_max_time ;
00620 float64_t svm_clean_time;
00621 float64_t long_transition_time ;
00622 #endif
00623
00624
00625 protected:
00630
/* Model parameters.
 * NOTE(review): unless a member is used by an inline method above, its
 * description is inferred from its name and type only. */
/** number of states; returned (narrowed to T_STATES) by get_N() */
00631 int32_t m_N;
00632
/** presumably the ids associated with the transitions */
00634 CArray2<int32_t> m_transition_matrix_a_id;
/** transition matrix a, accessed by get_a()/set_a() */
00635 CArray2<float64_t> m_transition_matrix_a;
/** derivative matrix for a, read by get_a_deriv() */
00636 CArray2<float64_t> m_transition_matrix_a_deriv;
00637
/** start-state distribution p, accessed by get_p()/set_p() */
00639 CArray<float64_t> m_initial_state_distribution_p;
/** derivative array for p, read by get_p_deriv() */
00640 CArray<float64_t> m_initial_state_distribution_p_deriv;
00641
/** end-state distribution q, accessed by get_q()/set_q() */
00643 CArray<float64_t> m_end_state_distribution_q;
/** derivative array for q, read by get_q_deriv() */
00644 CArray<float64_t> m_end_state_distribution_q_deriv;
00645
00647
/* Word/SVM configuration. Several CArray members are paired with a raw
 * pointer named *_array; how each pair is related is not established in
 * this header. */
/** presumably the number of word degrees */
00649 int32_t m_num_degrees;
/** presumably the number of SVMs */
00651 int32_t m_num_svms;
00652
/** presumably the word degree per entry */
00654 CArray<int32_t> m_word_degree;
/** presumably cumulative word counts */
00656 CArray<int32_t> m_cum_num_words;
/** raw pointer counterpart of m_cum_num_words (TODO confirm) */
00658 int32_t * m_cum_num_words_array;
/** presumably word counts */
00660 CArray<int32_t> m_num_words;
/** raw pointer counterpart of m_num_words (TODO confirm) */
00662 int32_t* m_num_words_array;
/** presumably the mod-words table */
00664 CArray2<int32_t> m_mod_words;
/** raw pointer counterpart of m_mod_words (TODO confirm) */
00666 int32_t* m_mod_words_array;
/** presumably sign flags per word set */
00668 CArray<bool> m_sign_words;
/** raw pointer counterpart of m_sign_words (TODO confirm) */
00670 bool* m_sign_words_array;
/** presumably string indices per word set */
00672 CArray<int32_t> m_string_words;
/** raw pointer counterpart of m_string_words (TODO confirm) */
00674 int32_t* m_string_words_array;
00675
00677
/** presumably counts of unique words */
00679 CArray<int32_t> m_num_unique_words;
/** presumably true while the SVM arrays are in a clean state */
00681 bool m_svm_arrays_clean;
/** presumably the largest transition id */
00683 int32_t m_max_a_id;
00684
00685
/* Input data. */
/** presumably the observation matrix */
00687 CArray3<float64_t> m_observation_matrix;
/** presumably the candidate positions */
00689 CArray<int32_t> m_pos;
/** presumably the number of candidate positions */
00691 int32_t m_seq_len;
/** presumably open-reading-frame information */
00693 CArray2<int32_t> m_orf_info;
/** presumably summed segment weights */
00695 CArray2<float64_t> m_segment_sum_weights;
/** presumably the list of PLIFs */
00697 CArray<CPlifBase*> m_plif_list;
/** presumably PLIF pointers per transition */
00699 CArray2<CPlifBase*> m_PEN;
/** presumably PLIF pointers per state signal */
00701 CArray2<CPlifBase*> m_PEN_state_signals;
/** presumably the gene string */
00703 CArray<char> m_genestr;
/** presumably word-encoded strings (three-level array) */
00718 uint16_t*** m_wordstr;
/** presumably dictionary weights */
00720 CArray2<float64_t> m_dict_weights;
/** presumably the segment-loss table */
00722 CArray3<float64_t> m_segment_loss;
/** presumably segment ids */
00724 CArray<int32_t> m_segment_ids;
/** presumably the segment mask */
00726 CArray<float64_t> m_segment_mask;
/** presumably a user-supplied state sequence */
00728 CArray<int32_t> m_my_state_seq;
/** presumably a user-supplied position sequence */
00730 CArray<int32_t> m_my_pos_seq;
/** presumably scores along the user-supplied path */
00732 CArray<float64_t> m_my_scores;
/** presumably losses along the user-supplied path */
00734 CArray<float64_t> m_my_losses;
00735
/** segment-loss object returned by get_segment_loss_object() */
00738 CSegmentLoss* m_seg_loss_obj;
00739
00740
/* Results. */
/** presumably the result scores */
00742 CArray<float64_t> m_scores;
/** presumably the result states */
00744 CArray2<int32_t> m_states;
/** presumably the result positions */
00746 CArray2<int32_t> m_positions;
00747
/** presumably the first sparse feature object */
00749 CSparseFeatures<float64_t>* m_seq_sparse1;
/** presumably the second sparse feature object */
00751 CSparseFeatures<float64_t>* m_seq_sparse2;
/** presumably the PLIF matrix object */
00753 CPlifMatrix* m_plif_matrices;
00754
/** presumably stop-codon flags for the gene string */
00758 CArray<bool> m_genestr_stop;
00759
/** presumably the intron list */
00762 CIntronList* m_intron_list;
00763
/** presumably the number of intron PLIFs */
00765 int32_t m_num_intron_plifs;
00766
/** linear-feature matrix exposed by get_lin_feat()/set_lin_feat() */
00771 CArray2<float64_t> m_lin_feat;
00772
/** presumably raw intensity values */
00774 float64_t *m_raw_intensities;
/** presumably probe positions */
00776 int32_t* m_probe_pos;
/** presumably cumulative probe counts */
00778 int32_t* m_num_probes_cum;
/** presumably cumulative counts of linear-feature PLIFs */
00780 int32_t* m_num_lin_feat_plifs_cum;
/** presumably the number of raw data tracks */
00782 int32_t m_num_raw_data;
00783
/** set by long_transition_settings() from use_long_transitions */
00785 bool m_long_transitions ;
/** set by long_transition_settings() from threshold */
00788 int32_t m_long_transition_threshold ;
00793
00794
/* Static default tables; their values are defined outside this header. */
/** presumably default word degrees (4 entries) */
00798 static int32_t word_degree_default[4];
00799
/** presumably default cumulative word counts (5 entries) */
00803 static int32_t cum_num_words_default[5];
00804
/** presumably PLIF ids per frame (3 entries) */
00807 static int32_t frame_plifs[3];
00808
/** presumably default word counts (4 entries) */
00811 static int32_t num_words_default[4];
00812
/** presumably default mod-words values (32 entries) */
00814 static int32_t mod_words_default[32];
00815
/** presumably default sign flags (16 entries) */
00817 static bool sign_words_default[16];
00818
/** presumably default string indices (16 entries) */
00820 static int32_t string_words_default[16];
00821 };
00822 }
00823 #endif