00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef __CDYNPROG_H__
00014 #define __CDYNPROG_H__
00015
00016 #include <shogun/mathematics/Math.h>
00017 #include <shogun/lib/common.h>
00018 #include <shogun/base/SGObject.h>
00019 #include <shogun/io/SGIO.h>
00020 #include <shogun/lib/config.h>
00021 #include <shogun/structure/PlifMatrix.h>
00022 #include <shogun/structure/PlifBase.h>
00023 #include <shogun/structure/Plif.h>
00024 #include <shogun/structure/IntronList.h>
00025 #include <shogun/structure/SegmentLoss.h>
00026 #include <shogun/features/StringFeatures.h>
00027 #include <shogun/features/SparseFeatures.h>
00028 #include <shogun/distributions/Distribution.h>
00029 #include <shogun/lib/DynamicArray.h>
00030 #include <shogun/lib/DynamicObjectArray.h>
00031 #include <shogun/lib/Time.h>
00032
00033 #include <stdio.h>
00034 #include <limits.h>
00035
00036 namespace shogun
00037 {
/* Forward declarations of types that CDynProg refers to by pointer or as
 * member templates. Headers with matching names are already included above,
 * so these only (re)introduce the names. */
00038 template <class T> class CSparseFeatures;
00039 class CIntronList;
00040 class CPlifMatrix;
00041 class CSegmentLoss;
00042
/* Dynamic array template; most data members of CDynProg are instances of it. */
00043 template <class T> class CDynamicArray;
00044
00045
00046
/** Integer type used for state indices/offsets in CDynProg's accessors.
 * It is 16 bits wide when USE_BIGSTATES is defined and 8 bits wide otherwise,
 * so the representable range is 0..65535 or 0..255 respectively. */
00047 #ifdef USE_BIGSTATES
00048 typedef uint16_t T_STATES ;
00049 #else
00050 typedef uint8_t T_STATES ;
00051 #endif
/** Pointer to T_STATES. */
00052 typedef T_STATES* P_STATES ;
00053
00054 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00055
/** Plain aggregate bundling segment-loss bookkeeping.
 *
 * The field descriptions below are inferred from the field names only; the
 * code that fills and reads this struct is not visible in this header
 * (TODO confirm against the implementation). The struct holds raw pointers
 * and declares no constructor or destructor, so it does not manage their
 * lifetime itself.
 */
00056 struct segment_loss_struct
00057 {
/** presumably the maximum look-back distance */
00059 int32_t maxlookback;
/** presumably the sequence length */
00061 int32_t seqlen;
/** presumably per-position "segment changed" flags (array extent unknown here) */
00063 int32_t *segments_changed;
/** presumably per-segment-id counts (array extent unknown here) */
00065 float64_t *num_segment_id;
/** presumably per-segment-id lengths (array extent unknown here) */
00067 int32_t *length_segment_id ;
00068 };
00069 #endif
00070
/** Dynamic-programming class derived from CSGObject.
 *
 * The class declares state-transition data (transition matrix, initial and
 * end state distributions and their derivatives), sequence/feature inputs,
 * and result arrays (scores, states, positions), plus setters/getters for
 * them.
 *
 * Only the member functions that show a body below are defined in this
 * header. For every other member function and for the data members, the
 * one-line descriptions are inferred from names and signatures and should be
 * confirmed against the definitions, which are not visible here.
 */
00076 class CDynProg : public CSGObject
00077 {
00078 public:
/** Constructor.
 * @param p_num_svms number of SVMs; defaults to 8
 */
00083 CDynProg(int32_t p_num_svms=8);
/** Virtual destructor. */
00084 virtual ~CDynProg();
00085
00086
/** Set the number of states.
 * @param N number of states
 */
00092 void set_num_states(int32_t N);
00093
/** @return the number of states */
00095 int32_t get_num_states();
00096
/** @return the number of SVMs */
00098 int32_t get_num_svms();
00099
/** Initialise the content-SVM value arrays.
 * @param p_num_svms number of SVMs
 */
00105 void init_content_svm_value_array(const int32_t p_num_svms);
00106
/** Provide tiling-array data.
 * @param probe_pos probe positions (presumably num_probes entries)
 * @param intensities probe intensities (presumably num_probes entries)
 * @param num_probes number of probes
 */
00114 void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes);
00115
/** Precompute values for the tiling plifs.
 * @param PEN array of CPlif pointers
 * @param tiling_plif_ids ids of the tiling plifs
 * @param num_tiling_plifs number of tiling plifs
 */
00122 void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs);
00123
/** Resize the linear-feature array.
 * @param num_new_feat number of features to add — TODO confirm whether this
 *        is a delta or a new total
 */
00128 void resize_lin_feat(int32_t num_new_feat);
/** Set the initial state distribution vector p.
 * @param p vector of values
 */
00133 void set_p_vector(SGVector<float64_t> p);
00134
/** Set the end state distribution vector q.
 * @param q vector of values
 */
00139 void set_q_vector(SGVector<float64_t> q);
00140
/** Set the transition matrix a.
 * @param a matrix of values
 */
00145 void set_a(SGMatrix<float64_t> a);
00146
/** Set the transition id matrix.
 * @param a matrix of ids
 */
00151 void set_a_id(SGMatrix<int32_t> a);
00152
/** Set the transition matrix from a_trans.
 * @param a_trans transitions; expected layout is not visible here
 */
00157 void set_a_trans_matrix(SGMatrix<float64_t> a_trans);
00158
/** Initialise the mod-words array.
 * @param p_mod_words_array matrix of values
 */
00163 void init_mod_words_array(SGMatrix<int32_t> p_mod_words_array);
00164
/** Check the SVM-related arrays.
 * @return check result; the meaning of true/false is not visible here
 */
00170 bool check_svm_arrays();
00171
/** Set the observation matrix.
 * @param seq observations as an n-dimensional array
 */
00176 void set_observation_matrix(SGNDArray<float64_t> seq);
00177
/** @return the number of positions */
00184 int32_t get_num_positions();
00185
/** Set the content type array.
 * @param seg_path segment path matrix
 */
00195 void set_content_type_array(SGMatrix<float64_t> seg_path);
00196
/** Set the position vector.
 * @param pos positions
 */
00201 void set_pos(SGVector<int32_t> pos);
00202
/** Set the ORF (open reading frame) information.
 * @param orf_info ORF info matrix
 */
00208 void set_orf_info(SGMatrix<int32_t> orf_info);
00209
/** Set the gene string.
 * @param genestr characters of the gene string
 */
00214 void set_gene_string(SGVector<char> genestr);
00215
00216
/** Set the dictionary weights.
 * @param dictionary_weights weight matrix
 */
00221 void set_dict_weights(SGMatrix<float64_t> dictionary_weights);
00222
/** Set the segment loss used for best-path computation.
 * @param segment_loss loss matrix
 */
00227 void best_path_set_segment_loss(SGMatrix<float64_t> segment_loss);
00228
/** Set segment ids and the segment mask.
 * @param segment_ids segment ids (presumably m entries)
 * @param segment_mask segment mask (presumably m entries)
 * @param m number of entries
 */
00235 void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m);
00236
/** Set the two sparse feature objects.
 * @param seq_sparse1 first sparse feature object
 * @param seq_sparse2 second sparse feature object
 */
00238 void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2);
00239
/** Set the plif matrices object.
 * @param pm plif matrices
 */
00244 void set_plif_matrices(CPlifMatrix* pm);
00245
00246
/** @return the computed scores */
00251 SGVector<float64_t> get_scores();
00252
/** @return the computed states */
00257 SGMatrix<int32_t> get_states();
00258
/** @return the computed positions */
00263 SGMatrix<int32_t> get_positions();
00264
00265
/** Compute the nbest best paths.
 * @param max_num_signals maximal number of signals
 * @param use_orf whether ORF information is used
 * @param nbest number of best paths to compute
 * @param with_loss whether the segment loss is used
 * @param with_multiple_sequences whether multiple sequences are used
 */
00274 void compute_nbest_paths(int32_t max_num_signals,
00275 bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences);
00276
00278
/** Derivative computation along a given path.
 * @param my_state_seq state sequence of the path
 * @param my_pos_seq position sequence of the path
 * @param my_seq_len length of both sequences
 * @param seq_array observation values
 * @param max_num_signals maximal number of signals
 */
00290 void best_path_trans_deriv(
00291 int32_t* my_state_seq, int32_t *my_pos_seq,
00292 int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals);
00293
00294
/** Set the state sequence of the path to analyse.
 * @param my_state_seq state sequence; length is not passed here
 */
00299 void set_my_state_seq(int32_t* my_state_seq);
00300
/** Set the position sequence of the path to analyse.
 * @param my_pos_seq position sequence; length is not passed here
 */
00305 void set_my_pos_seq(int32_t* my_pos_seq);
00306
/** Retrieve per-position path scores through output parameters.
 * @param my_scores receives a pointer to the scores; ownership of the
 *        buffer is not visible here — TODO confirm
 * @param seq_len receives the number of entries
 */
00314 void get_path_scores(float64_t** my_scores, int32_t* seq_len);
00315
/** Retrieve per-position path losses through output parameters.
 * @param my_losses receives a pointer to the losses; ownership of the
 *        buffer is not visible here — TODO confirm
 * @param seq_len receives the number of entries
 */
00323 void get_path_losses(float64_t** my_losses, int32_t* seq_len);
00324
00325
/** @return m_N converted to T_STATES.
 *
 * NOTE(review): m_N is declared as int32_t while T_STATES is uint8_t or
 * uint16_t, so the implicit conversion in the return narrows any value that
 * does not fit the smaller type. get_num_states() returns int32_t.
 */
00327 inline T_STATES get_N() const
00328 {
00329 return m_N ;
00330 }
00331
/** Store value at index offset of m_end_state_distribution_q.
 * @param offset index into the array (no range check is done in this body)
 * @param value value to store
 */
00336 inline void set_q(T_STATES offset, float64_t value)
00337 {
00338 m_end_state_distribution_q[offset]=value;
00339 }
00340
/** Store value at index offset of m_initial_state_distribution_p.
 * @param offset index into the array (no range check is done in this body)
 * @param value value to store
 */
00345 inline void set_p(T_STATES offset, float64_t value)
00346 {
00347 m_initial_state_distribution_p[offset]=value;
00348 }
00349
/** Store value in m_transition_matrix_a at element (line_, column).
 * @param line_ first index passed to element()
 * @param column second index passed to element()
 * @param value value to store
 */
00356 inline void set_a(T_STATES line_, T_STATES column, float64_t value)
00357 {
00358 m_transition_matrix_a.element(line_,column)=value;
00359 }
00360
/** @param offset index into m_end_state_distribution_q
 * @return the entry at that index
 */
00366 inline float64_t get_q(T_STATES offset) const
00367 {
00368 return m_end_state_distribution_q[offset];
00369 }
00370
/** @param offset index into m_end_state_distribution_q_deriv
 * @return the entry at that index
 */
00376 inline float64_t get_q_deriv(T_STATES offset) const
00377 {
00378 return m_end_state_distribution_q_deriv[offset];
00379 }
00380
/** @param offset index into m_initial_state_distribution_p
 * @return the entry at that index
 */
00386 inline float64_t get_p(T_STATES offset) const
00387 {
00388 return m_initial_state_distribution_p[offset];
00389 }
00390
/** @param offset index into m_initial_state_distribution_p_deriv
 * @return the entry at that index
 */
00396 inline float64_t get_p_deriv(T_STATES offset) const
00397 {
00398 return m_initial_state_distribution_p_deriv[offset];
00399 }
00400
/** Precompute content values. */
00404 void precompute_content_values();
00405
/** Access the linear-feature array.
 *
 * Passes dim1 and dim2 to m_lin_feat.get_array_size() and then returns
 * m_lin_feat.get_array().
 * @param dim1 reference handed to get_array_size(); presumably receives the
 *        first dimension
 * @param dim2 reference handed to get_array_size(); presumably receives the
 *        second dimension
 * @return the pointer returned by m_lin_feat.get_array(); this accessor makes
 *         no copy itself
 */
00412 inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2)
00413 {
00414 m_lin_feat.get_array_size(dim1, dim2);
00415 return m_lin_feat.get_array();
00416 }
/** Replace the content of the linear-feature array.
 *
 * Forwards to m_lin_feat.set_array() with the two trailing flags set to
 * true; see CDynamicArray::set_array for what those flags control
 * (TODO confirm copy/ownership semantics).
 * @param p_lin_feat source values
 * @param p_num_svms passed as the first dimension
 * @param p_seq_len passed as the second dimension
 */
00425 inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len)
00426 {
00427 m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true);
00428 }
/** Create the word string. */
00433 void create_word_string();
00434
/** Precompute stop codons. */
00437 void precompute_stop_codons();
00438
/** @param line_ first index passed to element()
 * @param column second index passed to element()
 * @return element (line_, column) of m_transition_matrix_a
 */
00445 inline float64_t get_a(T_STATES line_, T_STATES column) const
00446 {
00447 return m_transition_matrix_a.element(line_, column);
00448 }
00449
/** @param line_ first index passed to element()
 * @param column second index passed to element()
 * @return element (line_, column) of m_transition_matrix_a_deriv
 */
00456 inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
00457 {
00458 return m_transition_matrix_a_deriv.element(line_, column);
00459 }
00461
/** Set the intron list.
 * @param intron_list intron list object
 * @param num_plifs number of intron plifs
 */
00466 void set_intron_list(CIntronList* intron_list, int32_t num_plifs);
00467
/** @return the stored m_seg_loss_obj pointer, unchanged; this accessor does
 *          nothing else with the object
 */
00469 CSegmentLoss* get_segment_loss_object()
00470 {
00471 return m_seg_loss_obj;
00472 }
00473
/** Configure long transitions.
 *
 * Stores use_long_transitions in m_long_transitions and threshold in
 * m_long_transition_threshold.
 * @param use_long_transitions whether long transitions are enabled
 * @param threshold long-transition threshold
 * @param max_len not used: the body only emits the debug message
 *        "ignoring max_len" and stores nothing for it
 */
00480 void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len)
00481 {
00482 m_long_transitions = use_long_transitions;
00483 m_long_transition_threshold = threshold;
00484 SG_DEBUG("ignoring max_len\n") ;
00485
00486 }
00487
00488 protected:
00489
00490
00491
/** Look up content SVM values for a transition.
 * @param from_state source state
 * @param to_state target state
 * @param from_pos source position
 * @param to_pos target position
 * @param svm_values output buffer (extent not visible here)
 * @param frame reading frame
 */
00501 void lookup_content_svm_values(const int32_t from_state,
00502 const int32_t to_state, const int32_t from_pos, const int32_t to_pos,
00503 float64_t* svm_values, int32_t frame);
00504
/** Look up tiling plif values for a transition.
 *
 * NOTE(review): declared inline but no definition appears in this header;
 * an inline function must be defined in every translation unit that calls
 * it, so this is only usable where the definition is visible — confirm.
 * @param from_state source state
 * @param to_state target state
 * @param len length
 * @param svm_values output buffer (extent not visible here)
 */
00512 inline void lookup_tiling_plif_values(const int32_t from_state,
00513 const int32_t to_state, const int32_t len, float64_t* svm_values);
00514
/** Find the frame for a state.
 * NOTE(review): declared inline without a definition in this header.
 * @param from_state state to query
 * @return frame index; the value range is not visible here
 */
00519 inline int32_t find_frame(const int32_t from_state);
00520
/** Query raw intensities in a position interval.
 * NOTE(review): declared inline without a definition in this header.
 * @param from_pos interval start
 * @param to_pos interval end (inclusive/exclusive not visible here)
 * @param intensities output buffer
 * @param type query type
 * @return presumably the number of intensities written — TODO confirm
 */
00529 inline int32_t raw_intensities_interval_query(
00530 const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type);
00531
00532 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00533
/** Plain aggregate for SVM value bookkeeping. The field descriptions are
 * inferred from names only; the struct holds raw pointers and declares no
 * constructor or destructor. */
00534 struct svm_values_struct
00535 {
/** presumably the maximum look-back distance */
00537 int32_t maxlookback;
/** presumably the sequence length */
00539 int32_t seqlen;
00540
/** presumably start positions */
00542 int32_t* start_pos;
/** presumably unnormalised SVM values (two-level pointer array) */
00544 float64_t ** svm_values_unnormalized;
/** presumably SVM values */
00546 float64_t * svm_values;
/** presumably "word used" flags (three-level pointer array) */
00548 bool *** word_used;
/** presumably counts of unique words (two-level pointer array) */
00550 int32_t **num_unique_words;
00551 };
00552 #endif // DOXYGEN_SHOULD_SKIP_THIS
00553
/** Try to extend an ORF.
 * @param orf_from ORF value at the start
 * @param orf_to ORF value at the end
 * @param start start position
 * @param last_pos passed by non-const reference; presumably updated by the
 *        call
 * @param to end position
 * @return result flag; the meaning of true/false is not visible here
 */
00562 bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to);
00563
/** @return the literal name "DynProg".
 * NOTE(review): this sits in the protected section of CDynProg; if the base
 * class declares get_name() as public, calls through a CDynProg handle are
 * restricted while calls through the base still work — confirm that this
 * access level is intended.
 */
00565 virtual const char* get_name() const { return "DynProg"; }
00566
00567 private:
00568
/** presumably the length of the transition lists */
00569 T_STATES trans_list_len;
/** presumably forward transition lists, one per state */
00570 T_STATES **trans_list_forward;
/** presumably the number of entries in each forward transition list */
00571 T_STATES *trans_list_forward_cnt;
/** presumably the values belonging to the forward transitions */
00572 float64_t **trans_list_forward_val;
/** presumably the ids belonging to the forward transitions */
00573 int32_t **trans_list_forward_id;
/** presumably whether the memory above has been initialised */
00574 bool mem_initialized;
00575
/* Timers and accumulated durations; only compiled when DYNPROG_TIMING is
 * defined. */
00576 #ifdef DYNPROG_TIMING
00577 CTime MyTime;
00578 CTime MyTime2;
00579 CTime MyTime3;
00580
00581 float64_t segment_init_time;
00582 float64_t segment_pos_time;
00583 float64_t segment_clean_time;
00584 float64_t segment_extend_time;
00585 float64_t orf_time;
00586 float64_t content_time;
00587 float64_t content_penalty_time;
00588 float64_t content_svm_values_time ;
00589 float64_t content_plifs_time ;
00590 float64_t svm_init_time;
00591 float64_t svm_pos_time;
00592 float64_t inner_loop_time;
00593 float64_t inner_loop_max_time ;
00594 float64_t svm_clean_time;
00595 float64_t long_transition_time ;
00596 #endif
00597
00598
00599 protected:
00604
/** number of states; returned (narrowed to T_STATES) by get_N() */
00605 int32_t m_N;
00606
/** transition ids */
00608 CDynamicArray<int32_t> m_transition_matrix_a_id;
/** transition matrix; accessed by set_a()/get_a() via element() */
00609 CDynamicArray<float64_t> m_transition_matrix_a;
/** transition matrix derivative; read by get_a_deriv() */
00610 CDynamicArray<float64_t> m_transition_matrix_a_deriv;
00611
/** initial state distribution; accessed by set_p()/get_p() */
00613 CDynamicArray<float64_t> m_initial_state_distribution_p;
/** derivative of the initial state distribution; read by get_p_deriv() */
00614 CDynamicArray<float64_t> m_initial_state_distribution_p_deriv;
00615
/** end state distribution; accessed by set_q()/get_q() */
00617 CDynamicArray<float64_t> m_end_state_distribution_q;
/** derivative of the end state distribution; read by get_q_deriv() */
00618 CDynamicArray<float64_t> m_end_state_distribution_q_deriv;
00619
00621
/** presumably the number of word degrees */
00623 int32_t m_num_degrees;
/** presumably the number of SVMs */
00625 int32_t m_num_svms;
00626
/* Word-related tables. Each CDynamicArray below is followed by a raw pointer
 * with the same base name; presumably the pointer aliases the array's
 * storage — TODO confirm. */
00628 CDynamicArray<int32_t> m_word_degree;
00630 CDynamicArray<int32_t> m_cum_num_words;
00632 int32_t * m_cum_num_words_array;
00634 CDynamicArray<int32_t> m_num_words;
00636 int32_t* m_num_words_array;
00638 CDynamicArray<int32_t> m_mod_words;
00640 int32_t* m_mod_words_array;
00642 CDynamicArray<bool> m_sign_words;
00644 bool* m_sign_words_array;
00646 CDynamicArray<int32_t> m_string_words;
00648 int32_t* m_string_words_array;
00649
00651
/** presumably the number of unique words */
00653 CDynamicArray<int32_t> m_num_unique_words;
/** presumably whether the SVM arrays are in a clean state */
00655 bool m_svm_arrays_clean;
/** presumably the largest transition id */
00657 int32_t m_max_a_id;
00658
00659
/* Input data (descriptions inferred from names). */
/** observation matrix */
00661 CDynamicArray<float64_t> m_observation_matrix;
/** candidate positions */
00663 CDynamicArray<int32_t> m_pos;
/** sequence length */
00665 int32_t m_seq_len;
/** ORF information */
00667 CDynamicArray<int32_t> m_orf_info;
/** segment sum weights */
00669 CDynamicArray<float64_t> m_segment_sum_weights;
/** plif list */
00671 CDynamicObjectArray m_plif_list;
/** plif objects (PEN) */
00673 CDynamicObjectArray m_PEN;
/** plif objects for state signals */
00675 CDynamicObjectArray m_PEN_state_signals;
/** gene string characters */
00677 CDynamicArray<char> m_genestr;
/** word string as a three-level raw pointer array; layout and ownership are
 * not visible here */
00692 uint16_t*** m_wordstr;
/** dictionary weights */
00694 CDynamicArray<float64_t> m_dict_weights;
/** segment loss */
00696 CDynamicArray<float64_t> m_segment_loss;
/** segment ids */
00698 CDynamicArray<int32_t> m_segment_ids;
/** segment mask */
00700 CDynamicArray<float64_t> m_segment_mask;
/** state sequence of the path under analysis */
00702 CDynamicArray<int32_t> m_my_state_seq;
/** position sequence of the path under analysis */
00704 CDynamicArray<int32_t> m_my_pos_seq;
/** scores along the path under analysis */
00706 CDynamicArray<float64_t> m_my_scores;
/** losses along the path under analysis */
00708 CDynamicArray<float64_t> m_my_losses;
00709
/** segment loss object; returned as-is by get_segment_loss_object() */
00712 CSegmentLoss* m_seg_loss_obj;
00713
00714
/* Results (descriptions inferred from names). */
/** scores */
00716 CDynamicArray<float64_t> m_scores;
/** states */
00718 CDynamicArray<int32_t> m_states;
/** positions */
00720 CDynamicArray<int32_t> m_positions;
00721
/** first sparse feature object */
00723 CSparseFeatures<float64_t>* m_seq_sparse1;
/** second sparse feature object */
00725 CSparseFeatures<float64_t>* m_seq_sparse2;
/** plif matrices object */
00727 CPlifMatrix* m_plif_matrices;
00728
/** presumably stop-codon flags for the gene string */
00732 CDynamicArray<bool> m_genestr_stop;
00733
/** intron list object */
00736 CIntronList* m_intron_list;
00737
/** number of intron plifs */
00739 int32_t m_num_intron_plifs;
00740
/** linear features; accessed through get_lin_feat()/set_lin_feat() */
00745 CDynamicArray<float64_t> m_lin_feat;
00746
/* Tiling-array data held as raw pointers; extents and ownership are not
 * visible here. */
/** raw intensities */
00748 float64_t *m_raw_intensities;
/** probe positions */
00750 int32_t* m_probe_pos;
/** presumably the cumulative number of probes */
00752 int32_t* m_num_probes_cum;
/** presumably the cumulative number of linear-feature plifs */
00754 int32_t* m_num_lin_feat_plifs_cum;
/** presumably the number of raw data tracks */
00756 int32_t m_num_raw_data;
00757
/** long-transition switch; written by long_transition_settings() */
00759 bool m_long_transitions ;
/** long-transition threshold; written by long_transition_settings() */
00762 int32_t m_long_transition_threshold ;
00767
00768
/* Static default tables. They are only declared here; their definitions and
 * values live outside this header. */
/** default word degrees (4 entries) */
00772 static int32_t word_degree_default[4];
00773
/** default cumulative word counts (5 entries) */
00777 static int32_t cum_num_words_default[5];
00778
/** frame plif ids (3 entries) */
00781 static int32_t frame_plifs[3];
00782
/** default word counts (4 entries) */
00785 static int32_t num_words_default[4];
00786
/** default mod words (32 entries) */
00788 static int32_t mod_words_default[32];
00789
/** default sign words (16 entries) */
00791 static bool sign_words_default[16];
00792
/** default string words (16 entries) */
00794 static int32_t string_words_default[16];
00795 };
00796 }
00797 #endif