DynProg.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Gunnar Raetsch
00008  * Written (W) 1999-2009 Soeren Sonnenburg
00009  * Written (W) 2008-2009 Jonas Behr
00010  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00011  */
00012 
00013 #ifndef __CDYNPROG_H__
00014 #define __CDYNPROG_H__
00015 
00016 #include <shogun/mathematics/Math.h>
00017 #include <shogun/lib/common.h>
00018 #include <shogun/base/SGObject.h>
00019 #include <shogun/io/SGIO.h>
00020 #include <shogun/lib/config.h>
00021 #include <shogun/structure/PlifMatrix.h>
00022 #include <shogun/structure/PlifBase.h>
00023 #include <shogun/structure/Plif.h>
00024 #include <shogun/structure/IntronList.h>
00025 #include <shogun/structure/SegmentLoss.h>
00026 #include <shogun/features/StringFeatures.h>
00027 #include <shogun/features/SparseFeatures.h>
00028 #include <shogun/distributions/Distribution.h>
00029 #include <shogun/lib/DynamicArray.h>
00030 #include <shogun/lib/Array.h>
00031 #include <shogun/lib/Array2.h>
00032 #include <shogun/lib/Array3.h>
00033 #include <shogun/lib/Time.h>
00034 
00035 #include <stdio.h>
00036 #include <limits.h>
00037 
00038 namespace shogun
00039 {
// Forward declarations of the Shogun types used by CDynProg below.
// The matching headers (SparseFeatures.h, IntronList.h, PlifMatrix.h,
// SegmentLoss.h, Array.h) are also included at the top of this file.
00040     template <class T> class CSparseFeatures;
00041     class CIntronList;
00042     class CPlifMatrix;
00043     class CSegmentLoss;
00044     template <class T> class CArray;
00045 
00046 //#define DYNPROG_TIMING
00047 
// T_STATES is the integer type used for state indices in the inline
// accessors of CDynProg: uint16_t when USE_BIGSTATES is defined,
// uint8_t otherwise.
00048 #ifdef USE_BIGSTATES
00049 typedef uint16_t T_STATES ;
00050 #else
00051 typedef uint8_t T_STATES ;
00052 #endif
// P_STATES: pointer to T_STATES.
00053 typedef T_STATES* P_STATES ;
00054 
00055 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00056 
/**
 * Plain aggregate of two int32_t scalars and three raw pointers.
 * It is not referenced anywhere else in this header; the field
 * descriptions below are inferred from the names only -- TODO confirm
 * against the implementation file.
 */
00057 struct segment_loss_struct
00058 {
          /** presumably the maximum look-back distance -- confirm */
00060     int32_t maxlookback;
          /** presumably the sequence length -- confirm */
00062     int32_t seqlen;
          /** raw pointer to int32_t data; ownership and length are not visible here */
00064     int32_t *segments_changed;
          /** raw pointer to float64_t data; ownership and length are not visible here */
00066     float64_t *num_segment_id;
          /** raw pointer to int32_t data; ownership and length are not visible here */
00068     int32_t *length_segment_id ;
00069 };
00070 #endif
00071 
/**
 * CDynProg -- CSGObject subclass (the name suggests "dynamic programming").
 *
 * This header declares:
 *  - setters for the model (state count, p/q vectors, transition matrix),
 *  - setters for the input data (observation matrix, positions, ORF info,
 *    gene string, dictionary weights, segment loss, PLiF matrices, ...),
 *  - the entry points compute_nbest_paths() and best_path_trans_deriv(),
 *  - getters for the results (scores, states, positions).
 *
 * Most member functions are only declared here; their behaviour is defined
 * in the implementation file and is NOT documented below beyond what the
 * signature shows. Only the small inline accessors have visible bodies and
 * are documented from those bodies.
 */
00077 class CDynProg : public CSGObject
00078 {
00079 public:
          /** Constructor; p_num_svms defaults to 8. Body not visible in this header. */
00084     CDynProg(int32_t p_num_svms=8);
          /** Virtual destructor (declared only). */
00085     virtual ~CDynProg();
00086     
00087     // model related functions
          /** Declared only; presumably sets the number of states to N -- confirm. */
00093     void set_num_states(int32_t N);
00094 
          /** Declared only; presumably returns the number of states (see also inline get_N()). */
00096     int32_t get_num_states();
00097 
          /** Declared only; presumably returns m_num_svms -- confirm. */
00099     int32_t get_num_svms();
00100 
          /** Declared only; takes a number of SVMs. Effect not visible here. */
00106     void init_content_svm_value_array(const int32_t p_num_svms);
00107 
          /**
           * Declared only; takes two raw arrays and their element count
           * num_probes. NOTE(review): whether the arrays are copied or
           * adopted is not visible here -- confirm before freeing them.
           */
00115     void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes);
00116 
          /** Declared only; takes an array of CPlif pointers plus an id array of length num_tiling_plifs. */
00123     void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs);  
00124 
          /** Declared only; presumably resizes m_lin_feat -- confirm. */
00129     void resize_lin_feat(int32_t num_new_feat);
          /** Declared only; vector counterpart of the element-wise inline set_p(). */
00134     void set_p_vector(SGVector<float64_t> p);
00135 
          /** Declared only; vector counterpart of the element-wise inline set_q(). */
00140     void set_q_vector(SGVector<float64_t> q);
00141     
          /** Declared only; matrix overload of set_a (an element-wise inline overload is defined further down). */
00146     void set_a(SGMatrix<float64_t> a);
00147     
          /** Declared only; presumably fills m_transition_matrix_a_id -- confirm. */
00152     void set_a_id(SGMatrix<int32_t> a);
00153     
          /** Declared only; takes a transition matrix; expected layout is not visible here. */
00158     void set_a_trans_matrix(SGMatrix<float64_t> a_trans);
00159 
          /** Declared only; presumably fills m_mod_words -- confirm. */
00164     void init_mod_words_array(SGMatrix<int32_t> p_mod_words_array);
00165 
          /** Declared only; returns bool. Which conditions are checked is not visible here. */
00171     bool check_svm_arrays();
00172 
          /** Declared only; takes an N-dimensional float64_t array (m_observation_matrix is a CArray3). */
00177     void set_observation_matrix(SGNDArray<float64_t> seq);
00178 
          /** Declared only; returns an int32_t position count. */
00185     int32_t get_num_positions();
00186 
          /** Declared only; takes a float64_t matrix named seg_path. */
00196     void set_content_type_array(SGMatrix<float64_t> seg_path);
00197 
          /** Declared only; presumably fills m_pos -- confirm. */
00202     void set_pos(SGVector<int32_t> pos);
00203 
          /** Declared only; presumably fills m_orf_info -- confirm. */
00209     void set_orf_info(SGMatrix<int32_t> orf_info);
00210 
          /** Declared only; presumably fills m_genestr -- confirm. */
00215     void set_gene_string(SGVector<char> genestr);
00216 
00217 
          /** Declared only; presumably fills m_dict_weights -- confirm. */
00222     void set_dict_weights(SGMatrix<float64_t> dictionary_weights);
00223 
          /** Declared only; takes a float64_t matrix (m_segment_loss itself is a CArray3). */
00228     void best_path_set_segment_loss(SGMatrix<float64_t> segment_loss);
00229 
          /** Declared only; takes two raw arrays and a count m (presumably their common length -- confirm). */
00236     void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m);
00237 
          /** Declared only; takes two sparse feature objects (cf. m_seq_sparse1 / m_seq_sparse2). */
00239     void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2);
00240 
          /** Declared only; takes a CPlifMatrix pointer (cf. m_plif_matrices). */
00245     void set_plif_matrices(CPlifMatrix* pm);
00246 
00247     // best_path result retrieval functions
          /** Declared only; returns a float64_t vector of scores. */
00252     SGVector<float64_t> get_scores();
00253 
          /** Declared only; returns an int32_t matrix of states. */
00258     SGMatrix<int32_t> get_states();
00259 
          /** Declared only; returns an int32_t matrix of positions. */
00264     SGMatrix<int32_t> get_positions();
00265 
00266 
          /**
           * Declared only; main computation entry point judging by its name.
           * Takes a signal-count limit, three boolean switches and nbest
           * (int16_t). Results are presumably read back through
           * get_scores()/get_states()/get_positions() -- confirm.
           */
00275     void compute_nbest_paths(int32_t max_num_signals,
00276                          bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences);
00277 
00279 
          /**
           * Declared only; takes a state sequence and a position sequence
           * with my_seq_len as a length argument, a read-only float64_t
           * array and a signal-count limit.
           */
00291     void best_path_trans_deriv(
00292             int32_t* my_state_seq, int32_t *my_pos_seq,
00293             int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals);
00294 
00295     // additional best_path_trans_deriv functions
          /** Declared only; presumably fills m_my_state_seq -- confirm. */
00300     void set_my_state_seq(int32_t* my_state_seq);
00301 
          /** Declared only; presumably fills m_my_pos_seq -- confirm. */
00306     void set_my_pos_seq(int32_t* my_pos_seq);
00307 
          /**
           * Declared only; both arguments are pointers, presumably
           * out-parameters. NOTE(review): ownership of *my_scores is not
           * visible here -- confirm who frees it.
           */
00315     void get_path_scores(float64_t** my_scores, int32_t* seq_len);
00316 
          /** Declared only; same signature shape as get_path_scores(), for losses. */
00324     void get_path_losses(float64_t** my_losses, int32_t* seq_len);
00325 
00326 
          /**
           * Returns m_N converted to T_STATES.
           * NOTE(review): m_N is an int32_t while T_STATES is uint8_t (or
           * uint16_t with USE_BIGSTATES), so the value is narrowed here.
           */
00328     inline T_STATES get_N() const
00329     {
00330         return m_N ;
00331     }
00332     
          /** Stores value at index offset of m_end_state_distribution_q. */
00337     inline void set_q(T_STATES offset, float64_t value)
00338     {
00339         m_end_state_distribution_q[offset]=value;
00340     }
00341 
          /** Stores value at index offset of m_initial_state_distribution_p. */
00346     inline void set_p(T_STATES offset, float64_t value)
00347     {
00348         m_initial_state_distribution_p[offset]=value;
00349     }
00350 
          /** Stores value in m_transition_matrix_a at element(line_, column). */
00357     inline void set_a(T_STATES line_, T_STATES column, float64_t value)
00358     {
00359       m_transition_matrix_a.element(line_,column)=value; // look also best_path!
00360     }
00361 
          /** Returns the entry at index offset of m_end_state_distribution_q. */
00367     inline float64_t get_q(T_STATES offset) const
00368     {
00369         return m_end_state_distribution_q[offset];
00370     }
00371 
          /** Returns the entry at index offset of m_end_state_distribution_q_deriv. */
00377     inline float64_t get_q_deriv(T_STATES offset) const
00378     {
00379         return m_end_state_distribution_q_deriv[offset];
00380     }
00381 
          /** Returns the entry at index offset of m_initial_state_distribution_p. */
00387     inline float64_t get_p(T_STATES offset) const
00388     {
00389         return m_initial_state_distribution_p[offset];
00390     }
00391 
          /** Returns the entry at index offset of m_initial_state_distribution_p_deriv. */
00397     inline float64_t get_p_deriv(T_STATES offset) const
00398     {
00399         return m_initial_state_distribution_p_deriv[offset];
00400     }
00401     
          /** Declared only; takes no arguments. Effect not visible here. */
00405     void precompute_content_values();
00406 
          /**
           * Writes the two dimensions of m_lin_feat into dim1 and dim2 (via
           * get_array_size) and returns the pointer from m_lin_feat.get_array().
           * NOTE(review): the returned pointer looks like the array's own
           * storage rather than a copy -- confirm in CArray2.
           */
00413     inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2) 
00414     {
00415         m_lin_feat.get_array_size(dim1, dim2);
00416         return m_lin_feat.get_array();
00417     }
          /**
           * Forwards p_lin_feat with dimensions (p_num_svms, p_seq_len) to
           * m_lin_feat.set_array(). NOTE(review): the meaning of the two
           * trailing `true` flags is defined by CArray2::set_array and is not
           * visible here -- confirm the copy/ownership semantics.
           */
00426     inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len) 
00427     {
00428       m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true);
00429     }
          /** Declared only; presumably builds m_wordstr -- confirm. */
00434     void create_word_string();
00435 
          /** Declared only; presumably fills m_genestr_stop -- confirm. */
00438     void precompute_stop_codons();
00439 
          /** Returns m_transition_matrix_a.element(line_, column). */
00446     inline float64_t get_a(T_STATES line_, T_STATES column) const
00447     {
00448       return m_transition_matrix_a.element(line_, column); // look also best_path()!
00449     }
00450 
          /** Returns m_transition_matrix_a_deriv.element(line_, column). */
00457     inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
00458     {
00459       return m_transition_matrix_a_deriv.element(line_, column); // look also best_path()!
00460     }
00462 
          /** Declared only; takes an intron list and a PLiF count (cf. m_intron_list / m_num_intron_plifs). */
00467     void set_intron_list(CIntronList* intron_list, int32_t num_plifs);
00468 
          /** Returns the stored m_seg_loss_obj pointer unchanged. */
00470     CSegmentLoss* get_segment_loss_object()
00471     {
00472         return m_seg_loss_obj;
00473     }
00474 
          /**
           * Stores use_long_transitions in m_long_transitions and threshold in
           * m_long_transition_threshold. max_len is accepted but ignored: the
           * assignment is commented out and only a debug message is emitted.
           */
00481     void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len)
00482     {
00483         m_long_transitions = use_long_transitions;
00484         m_long_transition_threshold = threshold;
00485         SG_DEBUG("ignoring max_len\n") ;
00486         //m_long_transition_max = max_len;
00487     }
00488 
00489 protected:
00490 
00491     /* helper functions */
00492 
          /** Declared only; takes a state pair, a position pair, a float64_t buffer svm_values and a frame index. */
00502     void lookup_content_svm_values(const int32_t from_state,
00503         const int32_t to_state, const int32_t from_pos, const int32_t to_pos,
00504         float64_t* svm_values, int32_t frame);
00505 
          /** Declared inline without a body in this header; the definition must be provided elsewhere. */
00513     inline void lookup_tiling_plif_values(const int32_t from_state,
00514         const int32_t to_state, const int32_t len, float64_t* svm_values);
00515 
          /** Declared inline without a body in this header; returns an int32_t for the given from_state. */
00520     inline int32_t find_frame(const int32_t from_state);
00521 
          /** Declared inline without a body in this header; takes a position interval, a float64_t buffer and a type selector. */
00530     inline int32_t raw_intensities_interval_query(
00531         const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type);
00532 
00533 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00534 
          /**
           * Nested aggregate of raw pointers and two int32_t scalars. It is not
           * used by any declaration visible in this header; field meanings are
           * inferred from names only -- TODO confirm.
           */
00535     struct svm_values_struct
00536     {
              /** presumably the maximum look-back distance -- confirm */
00538         int32_t maxlookback;
              /** presumably the sequence length -- confirm */
00540         int32_t seqlen;
00541 
              /** raw int32_t array; length and ownership not visible here */
00543         int32_t* start_pos;
              /** two-level float64_t array; dimensions not visible here */
00545         float64_t ** svm_values_unnormalized;
              /** flat float64_t array; length not visible here */
00547         float64_t * svm_values;
              /** three-level bool array; dimensions not visible here */
00549         bool *** word_used;
              /** two-level int32_t array; dimensions not visible here */
00551         int32_t **num_unique_words;
00552     };
00553 #endif // DOXYGEN_SHOULD_SKIP_THIS
00554 
          /** Declared only; returns bool and may update last_pos, which is passed by non-const reference. */
00563     bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to);
00564 
          /** Returns the constant string "DynProg". */
00566     inline virtual const char* get_name() const { return "DynProg"; }
00567 
00568 private:
00569 
          // Raw-pointer transition-list storage. Who allocates and releases
          // it is not visible in this header.
00570     T_STATES trans_list_len;
00571     T_STATES **trans_list_forward;
00572     T_STATES *trans_list_forward_cnt;
00573     float64_t **trans_list_forward_val;
00574     int32_t **trans_list_forward_id;
          // presumably tracks whether the arrays above were allocated -- confirm
00575     bool mem_initialized;
00576 
          // Timers and accumulated timings; compiled in only when
          // DYNPROG_TIMING is defined.
00577 #ifdef DYNPROG_TIMING
00578     CTime MyTime;
00579     CTime MyTime2;
00580     CTime MyTime3;
00581     
00582     float64_t segment_init_time;
00583     float64_t segment_pos_time;
00584     float64_t segment_clean_time;
00585     float64_t segment_extend_time;
00586     float64_t orf_time;
00587     float64_t content_time;
00588     float64_t content_penalty_time;
00589     float64_t content_svm_values_time ;
00590     float64_t content_plifs_time ;  
00591     float64_t svm_init_time;
00592     float64_t svm_pos_time;
00593     float64_t inner_loop_time;
00594     float64_t inner_loop_max_time ; 
00595     float64_t svm_clean_time;
00596     float64_t long_transition_time ;
00597 #endif
00598     
00599 
00600 protected:
00605 
          /** State count as int32_t; get_N() returns it narrowed to T_STATES. */
00606     int32_t m_N;
00607 
          /** int32_t matrix of transition ids (cf. set_a_id()). */
00609     CArray2<int32_t> m_transition_matrix_a_id;
          /** Transition matrix accessed element-wise by the inline set_a()/get_a(). */
00610     CArray2<float64_t> m_transition_matrix_a;
          /** Matrix read element-wise by get_a_deriv(). */
00611     CArray2<float64_t> m_transition_matrix_a_deriv;
00612 
          /** Array accessed element-wise by set_p()/get_p(). */
00614     CArray<float64_t> m_initial_state_distribution_p;
          /** Array read element-wise by get_p_deriv(). */
00615     CArray<float64_t> m_initial_state_distribution_p_deriv;
00616 
          /** Array accessed element-wise by set_q()/get_q(). */
00618     CArray<float64_t> m_end_state_distribution_q;
          /** Array read element-wise by get_q_deriv(). */
00619     CArray<float64_t> m_end_state_distribution_q_deriv;
00620 
00622         
          /** presumably the number of word degrees -- confirm */
00624     int32_t m_num_degrees;
          /** presumably the number of SVMs (cf. constructor argument p_num_svms) -- confirm */
00626     int32_t m_num_svms;
00627 
          // Word-related tables. Each CArray member is paired with a raw
          // pointer of the same base name; NOTE(review): the pointer
          // presumably aliases the CArray's storage -- confirm.
00629     CArray<int32_t> m_word_degree;
00631     CArray<int32_t> m_cum_num_words;
00633     int32_t * m_cum_num_words_array;
00635     CArray<int32_t> m_num_words;
00637     int32_t* m_num_words_array;
00639     CArray2<int32_t> m_mod_words;
00641     int32_t* m_mod_words_array;
00643     CArray<bool> m_sign_words;
00645     bool* m_sign_words_array;
00647     CArray<int32_t> m_string_words;
00649     int32_t* m_string_words_array;
00650 
00652 //  CArray<int32_t> m_svm_pos_start;
          /** int32_t array; meaning inferred from the name only -- confirm */
00654     CArray<int32_t> m_num_unique_words;
          /** boolean flag; presumably the state reported by check_svm_arrays() -- confirm */
00656     bool m_svm_arrays_clean;
          /** presumably the largest id stored in m_transition_matrix_a_id -- confirm */
00658     int32_t m_max_a_id;
00659     
00660     // input arguments
          /** 3-d float64_t array (cf. set_observation_matrix()). */
00662     CArray3<float64_t> m_observation_matrix;
          /** int32_t array of positions (cf. set_pos()). */
00664     CArray<int32_t> m_pos;
          /** presumably the sequence length -- confirm */
00666     int32_t m_seq_len; 
          /** int32_t matrix (cf. set_orf_info()). */
00668     CArray2<int32_t> m_orf_info;
          /** float64_t matrix; meaning inferred from the name only -- confirm */
00670     CArray2<float64_t> m_segment_sum_weights;
          /** array of CPlifBase pointers; ownership not visible here */
00672     CArray<CPlifBase*> m_plif_list;
          /** matrix of CPlifBase pointers; ownership not visible here */
00674     CArray2<CPlifBase*> m_PEN;
          /** matrix of CPlifBase pointers; ownership not visible here */
00676     CArray2<CPlifBase*> m_PEN_state_signals;
          /** char array (cf. set_gene_string()). */
00678     CArray<char> m_genestr;
          /** three-level raw uint16_t array (cf. create_word_string()); dimensions not visible here */
00693     uint16_t*** m_wordstr;
          /** float64_t matrix (cf. set_dict_weights()). */
00695     CArray2<float64_t> m_dict_weights;
          /** 3-d float64_t array (cf. best_path_set_segment_loss()). */
00697     CArray3<float64_t> m_segment_loss;
          /** int32_t array (cf. best_path_set_segment_ids_mask()). */
00699     CArray<int32_t> m_segment_ids;
          /** float64_t array (cf. best_path_set_segment_ids_mask()). */
00701     CArray<float64_t> m_segment_mask;
          /** int32_t array (cf. set_my_state_seq()). */
00703     CArray<int32_t> m_my_state_seq;
          /** int32_t array (cf. set_my_pos_seq()). */
00705     CArray<int32_t> m_my_pos_seq;
          /** float64_t array (cf. get_path_scores()). */
00707     CArray<float64_t> m_my_scores;
          /** float64_t array (cf. get_path_losses()). */
00709     CArray<float64_t> m_my_losses;
00710 
          /** Pointer returned by get_segment_loss_object(); ownership not visible here. */
00713     CSegmentLoss* m_seg_loss_obj;
00714 
00715     // output arguments
          /** float64_t array (cf. get_scores()). */
00717     CArray<float64_t> m_scores;
          /** int32_t matrix (cf. get_states()). */
00719     CArray2<int32_t> m_states;
          /** int32_t matrix (cf. get_positions()). */
00721     CArray2<int32_t> m_positions;
00722 
          /** sparse feature object pointer (cf. set_sparse_features()); ownership not visible here */
00724     CSparseFeatures<float64_t>* m_seq_sparse1;
          /** sparse feature object pointer (cf. set_sparse_features()); ownership not visible here */
00726     CSparseFeatures<float64_t>* m_seq_sparse2;
          /** PLiF matrix pointer (cf. set_plif_matrices()); ownership not visible here */
00728     CPlifMatrix* m_plif_matrices;
00729 
          /** bool array (cf. precompute_stop_codons()). */
00733     CArray<bool> m_genestr_stop;
00734 
          /** intron list pointer (cf. set_intron_list()); ownership not visible here */
00737     CIntronList* m_intron_list;
00738 
          /** int32_t count (cf. num_plifs argument of set_intron_list()). */
00740     int32_t m_num_intron_plifs;
00741 
          /** float64_t matrix accessed by get_lin_feat()/set_lin_feat(). */
00746     CArray2<float64_t> m_lin_feat;
00747 
          // Raw tiling-data arrays (cf. init_tiling_data()); lengths and
          // ownership are not visible in this header.
00749     float64_t *m_raw_intensities;
00751     int32_t* m_probe_pos;
00753     int32_t* m_num_probes_cum;
00755     int32_t* m_num_lin_feat_plifs_cum;
00757     int32_t m_num_raw_data;
00758 
          /** Set from use_long_transitions in long_transition_settings(). */
00760     bool m_long_transitions ;
          /** Set from threshold in long_transition_settings(). */
00763     int32_t m_long_transition_threshold  ;
00768     //int32_t m_long_transition_max ;
00769 
          // Static tables with fixed sizes; their values are defined out of
          // line and are not visible in this header.
00773     static int32_t word_degree_default[4];
00774 
00778     static int32_t cum_num_words_default[5];
00779 
00782     static int32_t frame_plifs[3];
00783 
00786     static int32_t num_words_default[4];
00787 
00789     static int32_t mod_words_default[32];
00790 
00792     static bool sign_words_default[16];
00793 
00795     static int32_t string_words_default[16];
00796 };
00797 }
00798 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation