00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef __CDYNPROG_H__
00014 #define __CDYNPROG_H__
00015
00016 #include <shogun/mathematics/Math.h>
00017 #include <shogun/lib/common.h>
00018 #include <shogun/base/SGObject.h>
00019 #include <shogun/io/SGIO.h>
00020 #include <shogun/lib/config.h>
00021 #include <shogun/structure/PlifMatrix.h>
00022 #include <shogun/structure/PlifBase.h>
00023 #include <shogun/structure/Plif.h>
00024 #include <shogun/structure/IntronList.h>
00025 #include <shogun/structure/SegmentLoss.h>
00026 #include <shogun/features/StringFeatures.h>
00027 #include <shogun/features/SparseFeatures.h>
00028 #include <shogun/distributions/Distribution.h>
00029 #include <shogun/lib/DynamicArray.h>
00030 #include <shogun/lib/Array.h>
00031 #include <shogun/lib/Array2.h>
00032 #include <shogun/lib/Array3.h>
00033 #include <shogun/lib/Time.h>
00034
00035 #include <stdio.h>
00036 #include <limits.h>
00037
00038 namespace shogun
00039 {
00040 template <class T> class CSparseFeatures;
00041 class CIntronList;
00042 class CPlifMatrix;
00043 class CSegmentLoss;
00044 template <class T> class CArray;
00045
00046
00047
/** Integer type used to store a state index.
 *
 * It is 16 bits wide when USE_BIGSTATES is defined and 8 bits wide
 * otherwise, so the default build can distinguish at most 256 values.
 */
#ifdef USE_BIGSTATES
typedef uint16_t T_STATES;
#else
typedef uint8_t T_STATES;
#endif

/** Pointer to T_STATES values. */
typedef T_STATES* P_STATES;
00054
00055 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00056
00057 struct segment_loss_struct
00058 {
00060 int32_t maxlookback;
00062 int32_t seqlen;
00064 int32_t *segments_changed;
00066 float64_t *num_segment_id;
00068 int32_t *length_segment_id ;
00069 };
00070 #endif
00071
00077 class CDynProg : public CSGObject
00078 {
00079 public:
00084 CDynProg(int32_t p_num_svms=8);
00085 virtual ~CDynProg();
00086
00087
00093 void set_num_states(int32_t N);
00094
00096 int32_t get_num_states();
00097
00099 int32_t get_num_svms();
00100
00106 void init_content_svm_value_array(const int32_t p_num_svms);
00107
00115 void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes);
00116
00123 void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs);
00124
00129 void resize_lin_feat(int32_t num_new_feat);
00134 void set_p_vector(SGVector<float64_t> p);
00135
00140 void set_q_vector(SGVector<float64_t> q);
00141
00146 void set_a(SGMatrix<float64_t> a);
00147
00152 void set_a_id(SGMatrix<int32_t> a);
00153
00158 void set_a_trans_matrix(SGMatrix<float64_t> a_trans);
00159
00164 void init_mod_words_array(SGMatrix<int32_t> p_mod_words_array);
00165
00171 bool check_svm_arrays();
00172
00177 void set_observation_matrix(SGNDArray<float64_t> seq);
00178
00185 int32_t get_num_positions();
00186
00196 void set_content_type_array(SGMatrix<float64_t> seg_path);
00197
00202 void set_pos(SGVector<int32_t> pos);
00203
00209 void set_orf_info(SGMatrix<int32_t> orf_info);
00210
00215 void set_gene_string(SGVector<char> genestr);
00216
00217
00222 void set_dict_weights(SGMatrix<float64_t> dictionary_weights);
00223
00228 void best_path_set_segment_loss(SGMatrix<float64_t> segment_loss);
00229
00236 void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m);
00237
00239 void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2);
00240
00245 void set_plif_matrices(CPlifMatrix* pm);
00246
00247
00252 SGVector<float64_t> get_scores();
00253
00258 SGMatrix<int32_t> get_states();
00259
00264 SGMatrix<int32_t> get_positions();
00265
00266
00275 void compute_nbest_paths(int32_t max_num_signals,
00276 bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences);
00277
00279
00291 void best_path_trans_deriv(
00292 int32_t* my_state_seq, int32_t *my_pos_seq,
00293 int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals);
00294
00295
00300 void set_my_state_seq(int32_t* my_state_seq);
00301
00306 void set_my_pos_seq(int32_t* my_pos_seq);
00307
00315 void get_path_scores(float64_t** my_scores, int32_t* seq_len);
00316
00324 void get_path_losses(float64_t** my_losses, int32_t* seq_len);
00325
00326
00328 inline T_STATES get_N() const
00329 {
00330 return m_N ;
00331 }
00332
00337 inline void set_q(T_STATES offset, float64_t value)
00338 {
00339 m_end_state_distribution_q[offset]=value;
00340 }
00341
00346 inline void set_p(T_STATES offset, float64_t value)
00347 {
00348 m_initial_state_distribution_p[offset]=value;
00349 }
00350
00357 inline void set_a(T_STATES line_, T_STATES column, float64_t value)
00358 {
00359 m_transition_matrix_a.element(line_,column)=value;
00360 }
00361
00367 inline float64_t get_q(T_STATES offset) const
00368 {
00369 return m_end_state_distribution_q[offset];
00370 }
00371
00377 inline float64_t get_q_deriv(T_STATES offset) const
00378 {
00379 return m_end_state_distribution_q_deriv[offset];
00380 }
00381
00387 inline float64_t get_p(T_STATES offset) const
00388 {
00389 return m_initial_state_distribution_p[offset];
00390 }
00391
00397 inline float64_t get_p_deriv(T_STATES offset) const
00398 {
00399 return m_initial_state_distribution_p_deriv[offset];
00400 }
00401
00405 void precompute_content_values();
00406
00413 inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2)
00414 {
00415 m_lin_feat.get_array_size(dim1, dim2);
00416 return m_lin_feat.get_array();
00417 }
00426 inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len)
00427 {
00428 m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true);
00429 }
00434 void create_word_string();
00435
00438 void precompute_stop_codons();
00439
00446 inline float64_t get_a(T_STATES line_, T_STATES column) const
00447 {
00448 return m_transition_matrix_a.element(line_, column);
00449 }
00450
00457 inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
00458 {
00459 return m_transition_matrix_a_deriv.element(line_, column);
00460 }
00462
00467 void set_intron_list(CIntronList* intron_list, int32_t num_plifs);
00468
00470 CSegmentLoss* get_segment_loss_object()
00471 {
00472 return m_seg_loss_obj;
00473 }
00474
00481 void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len)
00482 {
00483 m_long_transitions = use_long_transitions;
00484 m_long_transition_threshold = threshold;
00485 SG_DEBUG("ignoring max_len\n") ;
00486
00487 }
00488
00489 protected:
00490
00491
00492
00502 void lookup_content_svm_values(const int32_t from_state,
00503 const int32_t to_state, const int32_t from_pos, const int32_t to_pos,
00504 float64_t* svm_values, int32_t frame);
00505
00513 inline void lookup_tiling_plif_values(const int32_t from_state,
00514 const int32_t to_state, const int32_t len, float64_t* svm_values);
00515
00520 inline int32_t find_frame(const int32_t from_state);
00521
00530 inline int32_t raw_intensities_interval_query(
00531 const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type);
00532
00533 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00534
00535 struct svm_values_struct
00536 {
00538 int32_t maxlookback;
00540 int32_t seqlen;
00541
00543 int32_t* start_pos;
00545 float64_t ** svm_values_unnormalized;
00547 float64_t * svm_values;
00549 bool *** word_used;
00551 int32_t **num_unique_words;
00552 };
00553 #endif // DOXYGEN_SHOULD_SKIP_THIS
00554
00563 bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to);
00564
00566 inline virtual const char* get_name() const { return "DynProg"; }
00567
00568 private:
00569
00570 T_STATES trans_list_len;
00571 T_STATES **trans_list_forward;
00572 T_STATES *trans_list_forward_cnt;
00573 float64_t **trans_list_forward_val;
00574 int32_t **trans_list_forward_id;
00575 bool mem_initialized;
00576
00577 #ifdef DYNPROG_TIMING
00578 CTime MyTime;
00579 CTime MyTime2;
00580 CTime MyTime3;
00581
00582 float64_t segment_init_time;
00583 float64_t segment_pos_time;
00584 float64_t segment_clean_time;
00585 float64_t segment_extend_time;
00586 float64_t orf_time;
00587 float64_t content_time;
00588 float64_t content_penalty_time;
00589 float64_t content_svm_values_time ;
00590 float64_t content_plifs_time ;
00591 float64_t svm_init_time;
00592 float64_t svm_pos_time;
00593 float64_t inner_loop_time;
00594 float64_t inner_loop_max_time ;
00595 float64_t svm_clean_time;
00596 float64_t long_transition_time ;
00597 #endif
00598
00599
00600 protected:
00605
00606 int32_t m_N;
00607
00609 CArray2<int32_t> m_transition_matrix_a_id;
00610 CArray2<float64_t> m_transition_matrix_a;
00611 CArray2<float64_t> m_transition_matrix_a_deriv;
00612
00614 CArray<float64_t> m_initial_state_distribution_p;
00615 CArray<float64_t> m_initial_state_distribution_p_deriv;
00616
00618 CArray<float64_t> m_end_state_distribution_q;
00619 CArray<float64_t> m_end_state_distribution_q_deriv;
00620
00622
00624 int32_t m_num_degrees;
00626 int32_t m_num_svms;
00627
00629 CArray<int32_t> m_word_degree;
00631 CArray<int32_t> m_cum_num_words;
00633 int32_t * m_cum_num_words_array;
00635 CArray<int32_t> m_num_words;
00637 int32_t* m_num_words_array;
00639 CArray2<int32_t> m_mod_words;
00641 int32_t* m_mod_words_array;
00643 CArray<bool> m_sign_words;
00645 bool* m_sign_words_array;
00647 CArray<int32_t> m_string_words;
00649 int32_t* m_string_words_array;
00650
00652
00654 CArray<int32_t> m_num_unique_words;
00656 bool m_svm_arrays_clean;
00658 int32_t m_max_a_id;
00659
00660
00662 CArray3<float64_t> m_observation_matrix;
00664 CArray<int32_t> m_pos;
00666 int32_t m_seq_len;
00668 CArray2<int32_t> m_orf_info;
00670 CArray2<float64_t> m_segment_sum_weights;
00672 CArray<CPlifBase*> m_plif_list;
00674 CArray2<CPlifBase*> m_PEN;
00676 CArray2<CPlifBase*> m_PEN_state_signals;
00678 CArray<char> m_genestr;
00693 uint16_t*** m_wordstr;
00695 CArray2<float64_t> m_dict_weights;
00697 CArray3<float64_t> m_segment_loss;
00699 CArray<int32_t> m_segment_ids;
00701 CArray<float64_t> m_segment_mask;
00703 CArray<int32_t> m_my_state_seq;
00705 CArray<int32_t> m_my_pos_seq;
00707 CArray<float64_t> m_my_scores;
00709 CArray<float64_t> m_my_losses;
00710
00713 CSegmentLoss* m_seg_loss_obj;
00714
00715
00717 CArray<float64_t> m_scores;
00719 CArray2<int32_t> m_states;
00721 CArray2<int32_t> m_positions;
00722
00724 CSparseFeatures<float64_t>* m_seq_sparse1;
00726 CSparseFeatures<float64_t>* m_seq_sparse2;
00728 CPlifMatrix* m_plif_matrices;
00729
00733 CArray<bool> m_genestr_stop;
00734
00737 CIntronList* m_intron_list;
00738
00740 int32_t m_num_intron_plifs;
00741
00746 CArray2<float64_t> m_lin_feat;
00747
00749 float64_t *m_raw_intensities;
00751 int32_t* m_probe_pos;
00753 int32_t* m_num_probes_cum;
00755 int32_t* m_num_lin_feat_plifs_cum;
00757 int32_t m_num_raw_data;
00758
00760 bool m_long_transitions ;
00763 int32_t m_long_transition_threshold ;
00768
00769
00773 static int32_t word_degree_default[4];
00774
00778 static int32_t cum_num_words_default[5];
00779
00782 static int32_t frame_plifs[3];
00783
00786 static int32_t num_words_default[4];
00787
00789 static int32_t mod_words_default[32];
00790
00792 static bool sign_words_default[16];
00793
00795 static int32_t string_words_default[16];
00796 };
00797 }
00798 #endif