DynProg.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Gunnar Raetsch
00008  * Written (W) 1999-2009 Soeren Sonnenburg
00009  * Written (W) 2008-2009 Jonas Behr
00010  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00011  */
00012 
00013 #ifndef __CDYNPROG_H__
00014 #define __CDYNPROG_H__
00015 
00016 #include "lib/Mathematics.h"
00017 #include "lib/common.h"
00018 #include "base/SGObject.h"
00019 #include "lib/io.h"
00020 #include "lib/config.h"
00021 #include "structure/PlifMatrix.h"
00022 #include "structure/PlifBase.h"
00023 #include "structure/Plif.h"
00024 #include "structure/IntronList.h"
00025 #include "structure/SegmentLoss.h"
00026 #include "features/StringFeatures.h"
00027 #include "features/SparseFeatures.h"
00028 #include "distributions/Distribution.h"
00029 #include "lib/DynamicArray.h"
00030 #include "lib/Array.h"
00031 #include "lib/Array2.h"
00032 #include "lib/Array3.h"
00033 #include "lib/Time.h"
00034 
00035 #include <stdio.h>
00036 #include <limits.h>
00037 
00038 namespace shogun
00039 {
// Forward declarations of project types that appear in CDynProg's member
// and parameter types further down in this header.
// NOTE(review): headers with matching names (features/SparseFeatures.h,
// structure/IntronList.h, structure/PlifMatrix.h, structure/SegmentLoss.h,
// lib/Array.h) are already included at the top of this file, so these
// declarations look redundant but harmless -- confirm before removing any.
00040     template <class T> class CSparseFeatures;
00041     class CIntronList;
00042     class CPlifMatrix;
00043     class CSegmentLoss;
00044     template <class T> class CArray;
00045 
// Build-time switch (disabled by default): when DYNPROG_TIMING is defined,
// CDynProg additionally declares its CTime stopwatches and the *_time
// accumulator members guarded by #ifdef DYNPROG_TIMING further down.
00046 //#define DYNPROG_TIMING
00047 
// T_STATES is the integer type used for state indices in CDynProg's inline
// accessors (get_N, set_p, set_q, set_a, get_a, ...) and in its transition
// lists. It is 16 bits wide when USE_BIGSTATES is defined and 8 bits wide
// otherwise, so the largest representable value is 65535 or 255 respectively.
00048 #ifdef USE_BIGSTATES
00049 typedef uint16_t T_STATES ;
00050 #else
00051 typedef uint8_t T_STATES ;
00052 #endif
// P_STATES: pointer to T_STATES.
00053 typedef T_STATES* P_STATES ;
00054 
/**
 * Plain aggregate of two scalar fields and three raw pointers.
 *
 * NOTE(review): this struct is not referenced anywhere else in this header,
 * and nothing here shows who allocates or frees the pointed-to arrays.
 * The per-field notes below are derived from the field names only --
 * confirm against the code that fills the struct.
 */
00056 struct segment_loss_struct
00057 {
    /** presumably the maximum look-back distance -- TODO confirm */
00059     int32_t maxlookback;
    /** presumably the sequence length -- TODO confirm */
00061     int32_t seqlen;
    /** int32_t array; presumably per-position "segment changed" markers -- TODO confirm */
00063     int32_t *segments_changed;
    /** float64_t array; presumably counts per segment id -- TODO confirm */
00065     float64_t *num_segment_id;
    /** int32_t array; presumably lengths per segment id -- TODO confirm */
00067     int32_t *length_segment_id ;
00068 };
00069 
/**
 * CDynProg -- CSGObject subclass whose get_name() reports "DynProg"
 * (the name suggests a dynamic-programming decoder).
 *
 * The public interface visible here falls into these groups:
 *  - model setup        (set_num_states, set_p_vector, set_q_vector, set_a, ...)
 *  - input data setup   (set_observation_matrix, set_pos, set_orf_info, ...)
 *  - computation        (compute_nbest_paths, best_path_trans_deriv)
 *  - result retrieval   (get_scores, get_states, get_positions, ...)
 *  - inline element accessors for the p / q / a containers
 *
 * NOTE(review): only the members that have a body below are defined in this
 * header. For every other member, the note attached to it is derived from
 * its name and signature alone and must be confirmed against the
 * implementation file.
 */
00075 class CDynProg : public CSGObject
00076 {
00077 public:
    /**
     * Constructor; p_num_svms defaults to 8.
     * NOTE(review): not declared explicit, so an int32_t converts implicitly
     * to a CDynProg.
     */
00082     CDynProg(int32_t p_num_svms=8);
    /** Virtual destructor. */
00083     virtual ~CDynProg();
00084     
00085     // model related functions
    /** Set the number of states to N (presumably stored in m_N -- confirm). */
00091     void set_num_states(int32_t N);
00092 
    /** Return the number of states. */
00094     int32_t get_num_states();
00095 
    /** Return the number of SVMs (presumably m_num_svms -- confirm). */
00097     int32_t get_num_svms();
00098 
    /** Initialise the content-SVM value array for p_num_svms SVMs (name-derived). */
00104     void init_content_svm_value_array(const int32_t p_num_svms);
00105 
    /**
     * Provide tiling data: probe positions, intensities and their count.
     * Presumably both arrays hold num_probes entries -- confirm.
     */
00113     void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes);
00114 
    /**
     * Precompute values for the tiling plifs selected by tiling_plif_ids
     * (num_tiling_plifs ids) out of the CPlif* array PEN (name-derived).
     */
00121     void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs);  
00122 
    /** Resize the linear-feature storage for num_new_feat features (presumably m_lin_feat -- confirm). */
00127     void resize_lin_feat(int32_t num_new_feat);
    /** Set the p vector from N values at p (presumably fills the container read by get_p() -- confirm). */
00133     void set_p_vector(float64_t* p, int32_t N);
00134 
    /** Set the q vector from N values at q (presumably fills the container read by get_q() -- confirm). */
00140     void set_q_vector(float64_t* q, int32_t N);
00141     
    /**
     * Bulk overload of set_a: takes a float64_t array with dimensions M and N.
     * Storage order of the array is not visible here.
     */
00148     void set_a(float64_t* a, int32_t M, int32_t N);
00149     
    /** Same shape of arguments as the bulk set_a, but with int32_t ids (presumably fills m_transition_matrix_a_id -- confirm). */
00156     void set_a_id(int32_t *a, int32_t M, int32_t N);
00157     
    /** Set transitions from a_trans with num_trans rows and N columns (layout not visible here). */
00164     void set_a_trans_matrix(float64_t *a_trans, int32_t num_trans, int32_t N);
00165 
    /** Initialise the mod-words array from num_elem x num_columns int32_t values (name-derived). */
00172     void init_mod_words_array(int32_t * p_mod_words_array, int32_t num_elem, int32_t num_columns);
00173 
    /** Check the SVM-related arrays; returns a bool (meaning of true/false not visible here). */
00179     bool check_svm_arrays();
00180 
    /**
     * Set the observation matrix from seq; dims presumably holds ndims extents.
     * NOTE(review): m_observation_matrix is a CArray3, so three dimensions
     * are presumably expected -- confirm.
     */
00187     void set_observation_matrix(float64_t* seq, int32_t* dims, int32_t ndims);
00188 
    /** Return the number of positions. */
00195     int32_t get_num_positions();
00196 
    /** Set the content type array from a rows x cols float64_t segment path (name-derived). */
00208     void set_content_type_array(float64_t* seg_path, int32_t rows, int32_t cols);
00209 
    /** Set the position array from seq_len int32_t values (presumably fills m_pos -- confirm). */
00215     void set_pos(int32_t* pos, int32_t seq_len);
00216 
    /** Set ORF info from an m x n int32_t array (presumably fills m_orf_info -- confirm). */
00224     void set_orf_info(int32_t* orf_info, int32_t m, int32_t n);
00225 
    /** Set the gene string from genestr_len chars (presumably fills m_genestr -- confirm). */
00231     void set_gene_string(char* genestr, int32_t genestr_len);
00232 
00233 
    /** Set dictionary weights from a dict_len x n float64_t array (presumably fills m_dict_weights -- confirm). */
00240     void set_dict_weights(float64_t* dictionary_weights, int32_t dict_len, int32_t n);
00241 
    /** Set the segment loss from a num_segment_id1 x num_segment_id2 float64_t array (name-derived). */
00248     void best_path_set_segment_loss(float64_t * segment_loss, int32_t num_segment_id1, int32_t num_segment_id2);
00249 
    /** Set segment ids and segment mask; presumably both arrays hold m entries -- confirm. */
00256     void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m);
00257 
    /** Set the two sparse feature objects (presumably stored in m_seq_sparse1 / m_seq_sparse2 -- confirm). */
00259     void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2);
00260 
    /** Set the plif matrices object (presumably stored in m_plif_matrices -- confirm). */
00265     void set_plif_matrices(CPlifMatrix* pm);
00266 
00267     // best_path result retrieval functions
    /** Output-parameter getter: writes a float64_t* to *scores and a length to *n. Ownership of the buffer is not visible here. */
00273     void get_scores(float64_t **scores, int32_t *n);
00274 
    /** Output-parameter getter: writes an int32_t* to *states and two extents to *m and *n. */
00281     void get_states(int32_t **states, int32_t *m, int32_t *n);
00282 
    /** Output-parameter getter: writes an int32_t* to *positions and two extents to *m and *n. */
00289     void get_positions(int32_t **positions, int32_t *m, int32_t *n);
00290 
00291 
    /**
     * Run the n-best path computation (name-derived). Takes the maximum number
     * of signals, three boolean switches and the number of paths (nbest).
     */
00300     void compute_nbest_paths(int32_t max_num_signals,
00301                          bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences);
00302 
00304 
    /**
     * Derivative computation for a given path (name-derived): takes a state
     * sequence and a position sequence (presumably my_seq_len entries each),
     * a read-only seq_array and the maximum number of signals.
     */
00316     void best_path_trans_deriv(
00317             int32_t* my_state_seq, int32_t *my_pos_seq,
00318             int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals);
00319 
00320     // additional best_path_trans_deriv functions
    /** Set the state sequence (length is not passed; presumably taken from existing state -- confirm). */
00325     void set_my_state_seq(int32_t* my_state_seq);
00326 
    /** Set the position sequence (length is not passed; presumably taken from existing state -- confirm). */
00331     void set_my_pos_seq(int32_t* my_pos_seq);
00332 
    /** Output-parameter getter: writes a float64_t* to *my_scores and a length to *seq_len. */
00340     void get_path_scores(float64_t** my_scores, int32_t* seq_len);
00341 
    /** Output-parameter getter: writes a float64_t* to *my_losses and a length to *seq_len. */
00349     void get_path_losses(float64_t** my_losses, int32_t* seq_len);
00350 
00351 
    /**
     * Return m_N converted to T_STATES.
     * NOTE(review): m_N is declared int32_t, so a value outside the range of
     * T_STATES is silently narrowed by this return.
     */
00353     inline T_STATES get_N() const
00354     {
00355         return m_N ;
00356     }
00357     
    /** Store value in m_end_state_distribution_q at index offset. */
00362     inline void set_q(T_STATES offset, float64_t value)
00363     {
00364         m_end_state_distribution_q[offset]=value;
00365     }
00366 
    /** Store value in m_initial_state_distribution_p at index offset. */
00371     inline void set_p(T_STATES offset, float64_t value)
00372     {
00373         m_initial_state_distribution_p[offset]=value;
00374     }
00375 
    /** Store value in m_transition_matrix_a at element (line_, column). */
00382     inline void set_a(T_STATES line_, T_STATES column, float64_t value)
00383     {
00384       m_transition_matrix_a.element(line_,column)=value; // look also best_path!
00385     }
00386 
    /** Return the entry of m_end_state_distribution_q at index offset. */
00392     inline float64_t get_q(T_STATES offset) const
00393     {
00394         return m_end_state_distribution_q[offset];
00395     }
00396 
    /** Return the entry of m_end_state_distribution_q_deriv at index offset. */
00402     inline float64_t get_q_deriv(T_STATES offset) const
00403     {
00404         return m_end_state_distribution_q_deriv[offset];
00405     }
00406 
    /** Return the entry of m_initial_state_distribution_p at index offset. */
00412     inline float64_t get_p(T_STATES offset) const
00413     {
00414         return m_initial_state_distribution_p[offset];
00415     }
00416 
    /** Return the entry of m_initial_state_distribution_p_deriv at index offset. */
00422     inline float64_t get_p_deriv(T_STATES offset) const
00423     {
00424         return m_initial_state_distribution_p_deriv[offset];
00425     }
00426     
    /** Precompute content values (name-derived; no arguments, no return value). */
00430     void precompute_content_values();
00431 
    /**
     * Query m_lin_feat: its two extents are written to dim1 and dim2 via
     * get_array_size(), and the result of get_array() is returned
     * (presumably a pointer to the container's own buffer -- confirm in CArray2).
     */
00438     inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2) 
00439     {
00440         m_lin_feat.get_array_size(dim1, dim2);
00441         return m_lin_feat.get_array();
00442     }
    /**
     * Hand p_lin_feat with extents (p_num_svms, p_seq_len) to
     * m_lin_feat.set_array(). The meaning of the two trailing `true` flags is
     * defined by CArray2::set_array and is not visible in this header.
     */
00451     inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len) 
00452     {
00453       m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true);
00454     }
    /** Create the word string (name-derived; presumably fills m_wordstr -- confirm). */
00459     void create_word_string();
00460 
    /** Precompute stop codons (name-derived; presumably fills m_genestr_stop -- confirm). */
00463     void precompute_stop_codons();
00464 
    /** Return the element (line_, column) of m_transition_matrix_a. */
00471     inline float64_t get_a(T_STATES line_, T_STATES column) const
00472     {
00473       return m_transition_matrix_a.element(line_, column); // look also best_path()!
00474     }
00475 
    /** Return the element (line_, column) of m_transition_matrix_a_deriv. */
00482     inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
00483     {
00484       return m_transition_matrix_a_deriv.element(line_, column); // look also best_path()!
00485     }
00487 
    /** Set the intron list and the number of plifs (presumably stored in m_intron_list / m_num_intron_plifs -- confirm). */
00492     void set_intron_list(CIntronList* intron_list, int32_t num_plifs);
00493 
    /** Return the m_seg_loss_obj pointer unchanged (this accessor does nothing else). */
00495     CSegmentLoss* get_segment_loss_object()
00496     {
00497         return m_seg_loss_obj;
00498     }
00499 
    /**
     * Store use_long_transitions in m_long_transitions and threshold in
     * m_long_transition_threshold.
     * max_len is accepted but ignored: the assignment to m_long_transition_max
     * is commented out and only a debug message is emitted.
     */
00506     void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len)
00507     {
00508         m_long_transitions = use_long_transitions;
00509         m_long_transition_threshold = threshold;
00510         SG_DEBUG("ignoring max_len\n") ;
00511         //m_long_transition_max = max_len;
00512     }
00513 
00514 protected:
00515 
00516     /* helper functions */
00517 
    /**
     * Look up content SVM values for a transition between two states and two
     * positions in the given frame; results presumably go to svm_values
     * (caller-provided buffer, size not visible here) -- confirm.
     */
00527     void lookup_content_svm_values(const int32_t from_state,
00528         const int32_t to_state, const int32_t from_pos, const int32_t to_pos,
00529         float64_t* svm_values, int32_t frame);
00530 
    /**
     * Look up tiling plif values for a state pair and length len.
     * NOTE(review): declared inline, but no definition is visible in this header.
     */
00538     inline void lookup_tiling_plif_values(const int32_t from_state,
00539         const int32_t to_state, const int32_t len, float64_t* svm_values);
00540 
    /**
     * Return the frame for from_state (name-derived).
     * NOTE(review): declared inline, but no definition is visible in this header.
     */
00545     inline int32_t find_frame(const int32_t from_state);
00546 
    /**
     * Interval query on raw intensities between from_pos and to_pos for the
     * given type; returns an int32_t (presumably a count -- confirm).
     * NOTE(review): declared inline, but no definition is visible in this header.
     */
00555     inline int32_t raw_intensities_interval_query(
00556         const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type);
00557 
    // The nested struct below is excluded from the generated documentation
    // whenever DOXYGEN_SHOULD_SKIP_THIS is defined.
00558 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00559 
    /** Nested aggregate of SVM-value buffers; field meanings are name-derived -- confirm against the implementation. */
00560     struct svm_values_struct
00561     {
        /** presumably the maximum look-back distance */
00563         int32_t maxlookback;
        /** presumably the sequence length */
00565         int32_t seqlen;
00566 
        /** int32_t array of start positions */
00568         int32_t* start_pos;
        /** two-level float64_t array of unnormalized SVM values */
00570         float64_t ** svm_values_unnormalized;
        /** flat float64_t array of SVM values */
00572         float64_t * svm_values;
        /** three-level bool array; presumably marks words already seen */
00574         bool *** word_used;
        /** two-level int32_t array; presumably counts of unique words */
00576         int32_t **num_unique_words;
00577     };
00578 #endif // DOXYGEN_SHOULD_SKIP_THIS
00579 
    /**
     * ORF extension helper (name-derived). last_pos is taken by non-const
     * reference, so it can be updated for the caller; returns a bool whose
     * meaning is not visible here.
     */
00588     bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to);
00589 
    /** Return the object name, the literal "DynProg". Declared in the protected section. */
00591     inline virtual const char* get_name() const { return "DynProg"; }
00592 
00593 private:
00594 
    // Transition-list bookkeeping, stored as raw pointers. Ownership and
    // array extents are not visible in this header; presumably
    // trans_list_len is the number of per-state lists -- confirm.
00595     T_STATES trans_list_len;
00596     T_STATES **trans_list_forward;
00597     T_STATES *trans_list_forward_cnt;
00598     float64_t **trans_list_forward_val;
00599     int32_t **trans_list_forward_id;
    // presumably true once the arrays above have been allocated -- confirm
00600     bool mem_initialized;
00601 
    // Stopwatches and per-phase time accumulators; these members exist only
    // when DYNPROG_TIMING is defined at compile time.
00602 #ifdef DYNPROG_TIMING
00603     CTime MyTime;
00604     CTime MyTime2;
00605     CTime MyTime3;
00606     
00607     float64_t segment_init_time;
00608     float64_t segment_pos_time;
00609     float64_t segment_clean_time;
00610     float64_t segment_extend_time;
00611     float64_t orf_time;
00612     float64_t content_time;
00613     float64_t content_penalty_time;
00614     float64_t content_svm_values_time ;
00615     float64_t content_plifs_time ;  
00616     float64_t svm_init_time;
00617     float64_t svm_pos_time;
00618     float64_t inner_loop_time;
00619     float64_t inner_loop_max_time ; 
00620     float64_t svm_clean_time;
00621     float64_t long_transition_time ;
00622 #endif
00623     
00624 
00625 protected:
00630 
    /** Value returned (converted to T_STATES) by get_N(). */
00631     int32_t m_N;
00632 
    /** Transition id matrix (presumably filled by set_a_id -- confirm). */
00634     CArray2<int32_t> m_transition_matrix_a_id;
    /** Transition matrix accessed element-wise by the inline set_a()/get_a(). */
00635     CArray2<float64_t> m_transition_matrix_a;
    /** Matrix read element-wise by get_a_deriv(). */
00636     CArray2<float64_t> m_transition_matrix_a_deriv;
00637 
    /** Vector accessed by set_p()/get_p(). */
00639     CArray<float64_t> m_initial_state_distribution_p;
    /** Vector read by get_p_deriv(). */
00640     CArray<float64_t> m_initial_state_distribution_p_deriv;
00641 
    /** Vector accessed by set_q()/get_q(). */
00643     CArray<float64_t> m_end_state_distribution_q;
    /** Vector read by get_q_deriv(). */
00644     CArray<float64_t> m_end_state_distribution_q_deriv;
00645 
00647         
    // Word/SVM configuration. Several entries come as a container plus a raw
    // pointer with an "_array" suffix; presumably the pointer refers to the
    // same data as the container -- confirm in the implementation.
00649     int32_t m_num_degrees;
00651     int32_t m_num_svms;
00652 
00654     CArray<int32_t> m_word_degree;
00656     CArray<int32_t> m_cum_num_words;
00658     int32_t * m_cum_num_words_array;
00660     CArray<int32_t> m_num_words;
00662     int32_t* m_num_words_array;
00664     CArray2<int32_t> m_mod_words;
00666     int32_t* m_mod_words_array;
00668     CArray<bool> m_sign_words;
00670     bool* m_sign_words_array;
00672     CArray<int32_t> m_string_words;
00674     int32_t* m_string_words_array;
00675 
00677 //  CArray<int32_t> m_svm_pos_start;
00679     CArray<int32_t> m_num_unique_words;
    /** presumably tracks whether the SVM arrays are in a clean state -- confirm */
00681     bool m_svm_arrays_clean;
    /** presumably the largest id in m_transition_matrix_a_id -- confirm */
00683     int32_t m_max_a_id;
00684     
00685     // input arguments
00687     CArray3<float64_t> m_observation_matrix;
00689     CArray<int32_t> m_pos;
00691     int32_t m_seq_len; 
00693     CArray2<int32_t> m_orf_info;
00695     CArray2<float64_t> m_segment_sum_weights;
00697     CArray<CPlifBase*> m_plif_list;
00699     CArray2<CPlifBase*> m_PEN;
00701     CArray2<CPlifBase*> m_PEN_state_signals;
00703     CArray<char> m_genestr;
    /** three-level uint16_t array; index order and ownership are not visible here */
00718     uint16_t*** m_wordstr;
00720     CArray2<float64_t> m_dict_weights;
00722     CArray3<float64_t> m_segment_loss;
00724     CArray<int32_t> m_segment_ids;
00726     CArray<float64_t> m_segment_mask;
00728     CArray<int32_t> m_my_state_seq;
00730     CArray<int32_t> m_my_pos_seq;
00732     CArray<float64_t> m_my_scores;
00734     CArray<float64_t> m_my_losses;
00735 
    /** Pointer handed out unchanged by get_segment_loss_object(). */
00738     CSegmentLoss* m_seg_loss_obj;
00739 
00740     // output arguments
00742     CArray<float64_t> m_scores;
00744     CArray2<int32_t> m_states;
00746     CArray2<int32_t> m_positions;
00747 
    // Non-owning-looking raw pointers to collaborating objects; whether this
    // class holds a reference on them is not visible here.
00749     CSparseFeatures<float64_t>* m_seq_sparse1;
00751     CSparseFeatures<float64_t>* m_seq_sparse2;
00753     CPlifMatrix* m_plif_matrices;
00754 
00758     CArray<bool> m_genestr_stop;
00759 
00762     CIntronList* m_intron_list;
00763 
00765     int32_t m_num_intron_plifs;
00766 
    /** Two-dimensional container exposed through get_lin_feat()/set_lin_feat(). */
00771     CArray2<float64_t> m_lin_feat;
00772 
    // Raw arrays for tiling/probe data (presumably set up by
    // init_tiling_data -- confirm); ownership is not visible here.
00774     float64_t *m_raw_intensities;
00776     int32_t* m_probe_pos;
00778     int32_t* m_num_probes_cum;
00780     int32_t* m_num_lin_feat_plifs_cum;
00782     int32_t m_num_raw_data;
00783 
    /** Set from use_long_transitions in long_transition_settings(). */
00785     bool m_long_transitions ;
    /** Set from threshold in long_transition_settings(). */
00788     int32_t m_long_transition_threshold  ;
00793     //int32_t m_long_transition_max ;
00794 
    // Static tables with fixed extents; they are only declared here, their
    // definitions (and values) live outside this header.
00798     static int32_t word_degree_default[4];
00799 
00803     static int32_t cum_num_words_default[5];
00804 
00807     static int32_t frame_plifs[3];
00808 
00811     static int32_t num_words_default[4];
00812 
00814     static int32_t mod_words_default[32];
00815 
00817     static bool sign_words_default[16];
00818 
00820     static int32_t string_words_default[16];
00821 };
00822 }
00823 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation