SHOGUN: DynProg.h Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Gunnar Raetsch
00008  * Written (W) 1999-2009 Soeren Sonnenburg
00009  * Written (W) 2008-2009 Jonas Behr
00010  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00011  */
00012 
00013 #ifndef __CDYNPROG_H__
00014 #define __CDYNPROG_H__
00015 
00016 #include <shogun/mathematics/Math.h>
00017 #include <shogun/lib/common.h>
00018 #include <shogun/base/SGObject.h>
00019 #include <shogun/io/SGIO.h>
00020 #include <shogun/lib/config.h>
00021 #include <shogun/structure/PlifMatrix.h>
00022 #include <shogun/structure/PlifBase.h>
00023 #include <shogun/structure/Plif.h>
00024 #include <shogun/structure/IntronList.h>
00025 #include <shogun/structure/SegmentLoss.h>
00026 #include <shogun/features/StringFeatures.h>
00027 #include <shogun/features/SparseFeatures.h>
00028 #include <shogun/distributions/Distribution.h>
00029 #include <shogun/lib/DynamicArray.h>
00030 #include <shogun/lib/DynamicObjectArray.h>
00031 #include <shogun/lib/Time.h>
00032 
00033 #include <stdio.h>
00034 #include <limits.h>
00035 
00036 namespace shogun
00037 {
00038     template <class T> class CSparseFeatures; // forward declaration; below it is only used as CSparseFeatures<float64_t>* (parameters and members)
00039     class CIntronList; // forward declaration; CDynProg refers to it only through pointers
00040     class CPlifMatrix; // forward declaration; CDynProg refers to it only through pointers
00041     class CSegmentLoss; // forward declaration; CDynProg refers to it only through pointers
00042 
00043     template <class T> class CDynamicArray; // forward declaration; CDynProg holds CDynamicArray members by value, so the complete type from <shogun/lib/DynamicArray.h> (included above) is what actually makes the class compile
00044 
00045 //#define DYNPROG_TIMING
00046 
00047 #ifdef USE_BIGSTATES // build-time switch selecting the width of the state index type
00048 typedef uint16_t T_STATES ; // 16-bit unsigned state index when USE_BIGSTATES is defined
00049 #else
00050 typedef uint8_t T_STATES ; // default: 8-bit unsigned state index
00051 #endif
00052 typedef T_STATES* P_STATES ; // pointer to T_STATES
00053 
00054 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00055 
00056 struct segment_loss_struct // plain aggregate of two int32_t values and three raw pointers; it has no constructor, so members are uninitialized unless the user sets them; not referenced anywhere else in the visible part of this header
00057 {
00059     int32_t maxlookback; // presumably the maximal lookback length -- TODO confirm against the implementation
00061     int32_t seqlen; // presumably the sequence length -- TODO confirm
00063     int32_t *segments_changed; // raw int32_t array; length and ownership are not expressed here
00065     float64_t *num_segment_id; // raw float64_t array; length and ownership are not expressed here
00067     int32_t *length_segment_id ; // raw int32_t array; length and ownership are not expressed here
00068 };
00069 #endif
00070 
00076 class CDynProg : public CSGObject // dynamic-programming object derived from CSGObject; keeps model parameters (p, q, transition matrix a), input data and best-path results as members; most methods are only declared in this header
00077 {
00078 public:
00083     CDynProg(int32_t p_num_svms=8); // constructor; p_num_svms defaults to 8. NOTE(review): callable with one argument and not explicit, so an int32_t converts implicitly to CDynProg
00084     virtual ~CDynProg(); // virtual destructor (declared only)
00085 
00086     // model related functions (declared here; definitions are not part of this header)
00092     void set_num_states(int32_t N); // takes the full-width int32_t N; compare get_N() below, which returns the narrower T_STATES
00093 
00095     int32_t get_num_states(); // non-const accessor returning int32_t
00096 
00098     int32_t get_num_svms(); // non-const accessor returning int32_t
00099 
00105     void init_content_svm_value_array(const int32_t p_num_svms);
00106 
00114     void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes); // raw arrays; presumably num_probes is the length of both probe_pos and intensities -- TODO confirm
00115 
00122     void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs); // presumably num_tiling_plifs is the length of tiling_plif_ids -- TODO confirm
00123 
00128     void resize_lin_feat(int32_t num_new_feat);
00133     void set_p_vector(SGVector<float64_t> p); // SGVector is passed by value
00134 
00139     void set_q_vector(SGVector<float64_t> q); // SGVector is passed by value
00140 
00145     void set_a(SGMatrix<float64_t> a); // matrix overload; the element-wise overload set_a(T_STATES, T_STATES, float64_t) is defined further down
00146 
00151     void set_a_id(SGMatrix<int32_t> a);
00152 
00157     void set_a_trans_matrix(SGMatrix<float64_t> a_trans);
00158 
00163     void init_mod_words_array(SGMatrix<int32_t> p_mod_words_array);
00164 
00170     bool check_svm_arrays(); // returns bool; the meaning of true/false is defined by the implementation, not visible here
00171 
00176     void set_observation_matrix(SGNDArray<float64_t> seq);
00177 
00184     int32_t get_num_positions();
00185 
00195     void set_content_type_array(SGMatrix<float64_t> seg_path);
00196 
00201     void set_pos(SGVector<int32_t> pos);
00202 
00208     void set_orf_info(SGMatrix<int32_t> orf_info);
00209 
00214     void set_gene_string(SGVector<char> genestr);
00215 
00216 
00221     void set_dict_weights(SGMatrix<float64_t> dictionary_weights);
00222 
00227     void best_path_set_segment_loss(SGMatrix<float64_t> segment_loss);
00228 
00235     void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m); // presumably m is the length of both raw arrays -- TODO confirm
00236 
00238     void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2); // raw pointers; ownership/reference counting is decided by the implementation
00239 
00244     void set_plif_matrices(CPlifMatrix* pm); // raw pointer; ownership/reference counting is decided by the implementation
00245 
00246     // best_path result retrieval functions (return SG containers by value)
00251     SGVector<float64_t> get_scores();
00252 
00257     SGMatrix<int32_t> get_states();
00258 
00263     SGMatrix<int32_t> get_positions();
00264 
00265 
00274     void compute_nbest_paths(int32_t max_num_signals, // returns void; presumably results are read back through get_scores()/get_states()/get_positions() -- TODO confirm
00275                          bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences);
00276 
00278 
00290     void best_path_trans_deriv( // returns void; see the get_*_deriv() accessors below for the derivative members that are readable from outside
00291             int32_t* my_state_seq, int32_t *my_pos_seq,
00292             int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals);
00293 
00294     // additional best_path_trans_deriv functions
00299     void set_my_state_seq(int32_t* my_state_seq); // raw array without a length argument; the expected length is not visible here
00300 
00305     void set_my_pos_seq(int32_t* my_pos_seq); // raw array without a length argument; the expected length is not visible here
00306 
00314     void get_path_scores(float64_t** my_scores, int32_t* seq_len); // out-parameters (pointer-to-pointer and pointer); who owns *my_scores afterwards is not visible here
00315 
00323     void get_path_losses(float64_t** my_losses, int32_t* seq_len); // out-parameters (pointer-to-pointer and pointer); who owns *my_losses afterwards is not visible here
00324 
00325 
00327     inline T_STATES get_N() const // returns m_N converted to T_STATES. NOTE(review): m_N is int32_t, so a value outside the T_STATES range (uint8_t, or uint16_t with USE_BIGSTATES) is truncated by this conversion
00328     {
00329         return m_N ;
00330     }
00331 
00336     inline void set_q(T_STATES offset, float64_t value) // stores value in m_end_state_distribution_q[offset]; offset is not range-checked in this function
00337     {
00338         m_end_state_distribution_q[offset]=value;
00339     }
00340 
00345     inline void set_p(T_STATES offset, float64_t value) // stores value in m_initial_state_distribution_p[offset]; offset is not range-checked in this function
00346     {
00347         m_initial_state_distribution_p[offset]=value;
00348     }
00349 
00356     inline void set_a(T_STATES line_, T_STATES column, float64_t value) // stores value at element (line_, column) of m_transition_matrix_a
00357     {
00358       m_transition_matrix_a.element(line_,column)=value; // look also best_path!
00359     }
00360 
00366     inline float64_t get_q(T_STATES offset) const // reads m_end_state_distribution_q[offset]
00367     {
00368         return m_end_state_distribution_q[offset];
00369     }
00370 
00376     inline float64_t get_q_deriv(T_STATES offset) const // reads m_end_state_distribution_q_deriv[offset]
00377     {
00378         return m_end_state_distribution_q_deriv[offset];
00379     }
00380 
00386     inline float64_t get_p(T_STATES offset) const // reads m_initial_state_distribution_p[offset]
00387     {
00388         return m_initial_state_distribution_p[offset];
00389     }
00390 
00396     inline float64_t get_p_deriv(T_STATES offset) const // reads m_initial_state_distribution_p_deriv[offset]
00397     {
00398         return m_initial_state_distribution_p_deriv[offset];
00399     }
00400 
00404     void precompute_content_values();
00405 
00412     inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2) // fills dim1/dim2 through m_lin_feat.get_array_size() and returns whatever m_lin_feat.get_array() returns
00413     {
00414         m_lin_feat.get_array_size(dim1, dim2);
00415         return m_lin_feat.get_array(); // presumably a pointer into the member array's own storage, i.e. not a copy -- TODO confirm in CDynamicArray
00416     }
00425     inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len) // hands p_lin_feat to m_lin_feat as a p_num_svms x p_seq_len array
00426     {
00427       m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true); // the meaning of the two `true` flags is defined by CDynamicArray::set_array -- TODO confirm (they decide whether the data is copied/freed)
00428     }
00433     void create_word_string();
00434 
00437     void precompute_stop_codons();
00438 
00445     inline float64_t get_a(T_STATES line_, T_STATES column) const // reads element (line_, column) of m_transition_matrix_a
00446     {
00447       return m_transition_matrix_a.element(line_, column); // look also best_path()!
00448     }
00449 
00456     inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const // reads element (line_, column) of m_transition_matrix_a_deriv
00457     {
00458       return m_transition_matrix_a_deriv.element(line_, column); // look also best_path()!
00459     }
00461 
00466     void set_intron_list(CIntronList* intron_list, int32_t num_plifs);
00467 
00469     CSegmentLoss* get_segment_loss_object() // returns the raw m_seg_loss_obj pointer as is; nothing else happens in this accessor
00470     {
00471         return m_seg_loss_obj;
00472     }
00473 
00480     void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len) // stores the flag and the threshold. NOTE(review): max_len is accepted but ignored
00481     {
00482         m_long_transitions = use_long_transitions;
00483         m_long_transition_threshold = threshold;
00484         SG_DEBUG("ignoring max_len\n") ; // only a debug message tells the caller that max_len has no effect
00485         //m_long_transition_max = max_len; // disabled: the matching member declaration is commented out as well
00486     }
00487 
00488 protected:
00489 
00490     /* helper functions (declared only; definitions are not part of this header) */
00491 
00501     void lookup_content_svm_values(const int32_t from_state,
00502         const int32_t to_state, const int32_t from_pos, const int32_t to_pos,
00503         float64_t* svm_values, int32_t frame); // svm_values is a raw, caller-provided buffer; its required size is not visible here
00504 
00512     inline void lookup_tiling_plif_values(const int32_t from_state, // NOTE(review): declared inline but not defined in this header; the definition has to be visible in every translation unit that calls it
00513         const int32_t to_state, const int32_t len, float64_t* svm_values);
00514 
00519     inline int32_t find_frame(const int32_t from_state); // NOTE(review): declared inline without a definition in this header (see above)
00520 
00529     inline int32_t raw_intensities_interval_query( // NOTE(review): declared inline without a definition in this header (see above)
00530         const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type);
00531 
00532 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00533 
00534     struct svm_values_struct // nested plain aggregate (protected); no constructor, so members are uninitialized unless the user sets them; not used by any declaration visible in this header
00535     {
00537         int32_t maxlookback; // presumably the maximal lookback length -- TODO confirm
00539         int32_t seqlen; // presumably the sequence length -- TODO confirm
00540 
00542         int32_t* start_pos; // raw array; length and ownership are not expressed here
00544         float64_t ** svm_values_unnormalized; // two-level raw array
00546         float64_t * svm_values; // raw array
00548         bool *** word_used; // three-level raw array
00550         int32_t **num_unique_words; // two-level raw array
00551     };
00552 #endif // DOXYGEN_SHOULD_SKIP_THIS
00553 
00562     bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to); // last_pos is passed by non-const reference, so it can be updated by the call
00563 
00565     virtual const char* get_name() const { return "DynProg"; } // returns the string literal "DynProg"
00566 
00567 private:
00568 
00569     T_STATES trans_list_len; // NOTE(review): presumably the number of entries in the trans_list_forward* arrays below -- TODO confirm
00570     T_STATES **trans_list_forward; // two-level raw array of state indices
00571     T_STATES *trans_list_forward_cnt; // raw array of T_STATES
00572     float64_t **trans_list_forward_val; // two-level raw array of float64_t
00573     int32_t **trans_list_forward_id; // two-level raw array of int32_t
00574     bool mem_initialized; // flag; its exact meaning is defined by the implementation
00575 
00576 #ifdef DYNPROG_TIMING // only compiled when DYNPROG_TIMING is defined (the #define near the top of the file is commented out)
00577     CTime MyTime;
00578     CTime MyTime2;
00579     CTime MyTime3;
00580 
00581     float64_t segment_init_time;
00582     float64_t segment_pos_time;
00583     float64_t segment_clean_time;
00584     float64_t segment_extend_time;
00585     float64_t orf_time;
00586     float64_t content_time;
00587     float64_t content_penalty_time;
00588     float64_t content_svm_values_time ;
00589     float64_t content_plifs_time ;
00590     float64_t svm_init_time;
00591     float64_t svm_pos_time;
00592     float64_t inner_loop_time;
00593     float64_t inner_loop_max_time ;
00594     float64_t svm_clean_time;
00595     float64_t long_transition_time ;
00596 #endif
00597 
00598 
00599 protected: // second protected section: data members
00604 
00605     int32_t m_N; // value returned (narrowed to T_STATES) by get_N(); presumably the number of states
00606 
00608     CDynamicArray<int32_t> m_transition_matrix_a_id; // 2d
00609     CDynamicArray<float64_t> m_transition_matrix_a; // 2d; accessed element-wise by set_a()/get_a()
00610     CDynamicArray<float64_t> m_transition_matrix_a_deriv; // 2d; read by get_a_deriv()
00611 
00613     CDynamicArray<float64_t> m_initial_state_distribution_p; // accessed by set_p()/get_p()
00614     CDynamicArray<float64_t> m_initial_state_distribution_p_deriv; // read by get_p_deriv()
00615 
00617     CDynamicArray<float64_t> m_end_state_distribution_q; // accessed by set_q()/get_q()
00618     CDynamicArray<float64_t> m_end_state_distribution_q_deriv; // read by get_q_deriv()
00619 
00621 
00623     int32_t m_num_degrees;
00625     int32_t m_num_svms;
00626 
00628     CDynamicArray<int32_t> m_word_degree;
00630     CDynamicArray<int32_t> m_cum_num_words;
00632     int32_t * m_cum_num_words_array; // raw pointer; NOTE(review): presumably aliases the storage of m_cum_num_words -- TODO confirm
00634     CDynamicArray<int32_t> m_num_words;
00636     int32_t* m_num_words_array; // raw pointer; NOTE(review): presumably aliases the storage of m_num_words -- TODO confirm
00638     CDynamicArray<int32_t> m_mod_words; // 2d
00640     int32_t* m_mod_words_array; // raw pointer; NOTE(review): presumably aliases the storage of m_mod_words -- TODO confirm
00642     CDynamicArray<bool> m_sign_words;
00644     bool* m_sign_words_array; // raw pointer; NOTE(review): presumably aliases the storage of m_sign_words -- TODO confirm
00646     CDynamicArray<int32_t> m_string_words;
00648     int32_t* m_string_words_array; // raw pointer; NOTE(review): presumably aliases the storage of m_string_words -- TODO confirm
00649 
00651 //  CDynamicArray<int32_t> m_svm_pos_start; // disabled member, kept as a comment in the original
00653     CDynamicArray<int32_t> m_num_unique_words;
00655     bool m_svm_arrays_clean;
00657     int32_t m_max_a_id;
00658 
00659     // input arguments
00661     CDynamicArray<float64_t> m_observation_matrix; //3d
00663     CDynamicArray<int32_t> m_pos;
00665     int32_t m_seq_len;
00667     CDynamicArray<int32_t> m_orf_info; // 2d
00669     CDynamicArray<float64_t> m_segment_sum_weights; // 2d
00671     CDynamicObjectArray m_plif_list; // CPlifBase*
00673     CDynamicObjectArray m_PEN; // 2d, CPlifBase*
00675     CDynamicObjectArray m_PEN_state_signals; // 2d, CPlifBase*
00677     CDynamicArray<char> m_genestr;
00692     uint16_t*** m_wordstr; // three-level raw array; its layout is defined by the implementation
00694     CDynamicArray<float64_t> m_dict_weights; // 2d
00696     CDynamicArray<float64_t> m_segment_loss; // 3d
00698     CDynamicArray<int32_t> m_segment_ids;   
00700     CDynamicArray<float64_t> m_segment_mask;    
00702     CDynamicArray<int32_t> m_my_state_seq;
00704     CDynamicArray<int32_t> m_my_pos_seq;
00706     CDynamicArray<float64_t> m_my_scores;
00708     CDynamicArray<float64_t> m_my_losses;
00709 
00712     CSegmentLoss* m_seg_loss_obj; // raw pointer handed out unchanged by get_segment_loss_object()
00713 
00714     // output arguments
00716     CDynamicArray<float64_t> m_scores;
00718     CDynamicArray<int32_t> m_states; // 2d
00720     CDynamicArray<int32_t> m_positions; // 2d
00721 
00723     CSparseFeatures<float64_t>* m_seq_sparse1; // raw pointer; ownership is not expressed here
00725     CSparseFeatures<float64_t>* m_seq_sparse2; // raw pointer; ownership is not expressed here
00727     CPlifMatrix* m_plif_matrices; // raw pointer; ownership is not expressed here
00728 
00732     CDynamicArray<bool> m_genestr_stop;
00733 
00736     CIntronList* m_intron_list; // raw pointer; ownership is not expressed here
00737 
00739     int32_t m_num_intron_plifs;
00740 
00745     CDynamicArray<float64_t> m_lin_feat; // 2d; exposed through get_lin_feat()/set_lin_feat()
00746 
00748     float64_t *m_raw_intensities; // raw array; length and ownership are not expressed here
00750     int32_t* m_probe_pos; // raw array; length and ownership are not expressed here
00752     int32_t* m_num_probes_cum; // raw array; length and ownership are not expressed here
00754     int32_t* m_num_lin_feat_plifs_cum; // raw array; length and ownership are not expressed here
00756     int32_t m_num_raw_data;
00757 
00759     bool m_long_transitions ; // set by long_transition_settings()
00762     int32_t m_long_transition_threshold  ; // set by long_transition_settings()
00767     //int32_t m_long_transition_max ; // disabled member; this is why long_transition_settings() ignores max_len
00768 
00772     static int32_t word_degree_default[4]; // static tables: only declared here, their values are defined outside this header
00773 
00777     static int32_t cum_num_words_default[5];
00778 
00781     static int32_t frame_plifs[3];
00782 
00785     static int32_t num_words_default[4];
00786 
00788     static int32_t mod_words_default[32];
00789 
00791     static bool sign_words_default[16];
00792 
00794     static int32_t string_words_default[16];
00795 };
00796 }
00797 #endif