SHOGUN: TwoStateModel.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2012 Fernando José Iglesias García
00008  * Copyright (C) 2012 Fernando José Iglesias García
00009  */
00010 
00011 #include <shogun/structure/TwoStateModel.h>
00012 #include <shogun/mathematics/Math.h>
00013 #include <shogun/features/MatrixFeatures.h>
00014 
00015 using namespace shogun;
00016 
00017 CTwoStateModel::CTwoStateModel() : CStateModel()
00018 {
00019     // The number of states in this state model is equal to four.
00020     // Although parameters are learnt only for two of them, other
00021     // two states (start and stop) are used
00022     m_num_states = 4;
00023     m_num_transmission_params = 4;
00024 
00025     m_state_loss_mat = SGMatrix< float64_t >(m_num_states, m_num_states);
00026     m_state_loss_mat.zero();
00027     for ( int32_t i = 0 ; i < m_num_states-1 ; ++i )
00028     {
00029         m_state_loss_mat(m_num_states-1, i) = 1;
00030         m_state_loss_mat(i, m_num_states-1) = 1;
00031     }
00032 
00033     // Initialize the start and stop states
00034     m_p = SGVector< float64_t >(m_num_states);
00035     m_q = SGVector< float64_t >(m_num_states);
00036     m_p.set_const(-CMath::INFTY);
00037     m_q.set_const(-CMath::INFTY);
00038     m_p[0] = 0; // start state
00039     m_q[1] = 0; // stop  state
00040 }
00041 
00042 CTwoStateModel::~CTwoStateModel()
00043 {
00044 }
00045 
00046 SGMatrix< float64_t > CTwoStateModel::loss_matrix(CSequence* label_seq)
00047 {
00048     SGVector< int32_t > state_seq = labels_to_states(label_seq);
00049     SGMatrix< float64_t > loss_mat(m_num_states, state_seq.vlen);
00050 
00051     for ( int32_t i = 0 ; i < loss_mat.num_cols ; ++i )
00052     {
00053         for ( int32_t s = 0 ; s < loss_mat.num_rows ; ++s )
00054             loss_mat(s,i) = m_state_loss_mat(s, state_seq[i]);
00055     }
00056 
00057     return loss_mat;
00058 }
00059 
00060 float64_t CTwoStateModel::loss(CSequence* label_seq_lhs, CSequence* label_seq_rhs)
00061 {
00062     SGVector< int32_t > state_seq_lhs = labels_to_states(label_seq_lhs);
00063     SGVector< int32_t > state_seq_rhs = labels_to_states(label_seq_rhs);
00064 
00065     ASSERT(state_seq_lhs.vlen == state_seq_rhs.vlen);
00066 
00067     float64_t ret = 0.0;
00068     for ( int32_t i = 0 ; i < state_seq_lhs.vlen ; ++i )
00069         ret += m_state_loss_mat(state_seq_lhs[i], state_seq_rhs[i]);
00070 
00071     return ret;
00072 }
00073 
00074 SGVector< int32_t > CTwoStateModel::labels_to_states(CSequence* label_seq) const
00075 {
00076     // 0 -> start state
00077     // 1 -> stop state
00078     // 2 -> negative state (label == 0)
00079     // 3 -> positive state (label == 1)
00080 
00081     SGVector< int32_t > seq_data = label_seq->get_data();
00082     SGVector< int32_t > state_seq(seq_data.size());
00083     for ( int32_t i = 1 ; i < state_seq.vlen-1 ; ++i )
00084     {
00085         //FIXME make independent of values 0-1 in labels
00086         state_seq[i] = seq_data[i] + 2;
00087     }
00088 
00089     // The first element is always start state
00090     state_seq[0] = 0;
00091     // The last element is always stop state
00092     state_seq[state_seq.vlen-1] = 1;
00093 
00094     return state_seq;
00095 }
00096 
00097 CSequence* CTwoStateModel::states_to_labels(SGVector< int32_t > state_seq) const
00098 {
00099     SGVector< int32_t > label_seq(state_seq.vlen);
00100 
00101     //FIXME make independent of values 0-1 in labels
00102     // Legend for state indices:
00103     // 0 -> start state => label 0
00104     // 1 -> stop state => label 0
00105     // 2 -> negative state (label == 0) => label 0
00106     // 3 -> positive state (label == 1) => label 1
00107     label_seq.zero();
00108     for ( int32_t i = 0 ; i < state_seq.vlen ; ++i )
00109     {
00110         if ( state_seq[i] == 3 )
00111             label_seq[i] = 1;
00112     }
00113 
00114     CSequence* ret = new CSequence(label_seq);
00115     SG_REF(ret);
00116     return ret;
00117 }
00118 
00119 void CTwoStateModel::reshape_emission_params(SGVector< float64_t >& emission_weights,
00120         SGVector< float64_t > w, int32_t num_feats, int32_t num_obs)
00121 {
00122     emission_weights.zero();
00123 
00124     // Legend for state indices:
00125     // 0 -> start state
00126     // 1 -> stop state
00127     // 2 -> negative state (label == 0)
00128     // 3 -> positive state (label == 1)
00129     //
00130     // start and stop states have no emission scores
00131 
00132     index_t em_idx, w_idx = m_num_transmission_params;
00133     for ( int32_t s = 2 ; s < m_num_states ; ++s )
00134     {
00135         for ( int32_t f = 0 ; f < num_feats ; ++f )
00136         {
00137             for ( int32_t o = 0 ; o < num_obs ; ++o )
00138             {
00139                 em_idx = s*num_feats*num_obs + f*num_obs + o;
00140                 emission_weights[em_idx] = w[w_idx++];
00141             }
00142         }
00143     }
00144 }
00145 
00146 void CTwoStateModel::reshape_transmission_params(
00147         SGMatrix< float64_t >& transmission_weights, SGVector< float64_t > w)
00148 {
00149     transmission_weights.set_const(-CMath::INFTY);
00150 
00151     // Legend for state indices:
00152     // 0 -> start state
00153     // 1 -> stop state
00154     // 2 -> negative state (label == 0)
00155     // 3 -> positive state (label == 1)
00156 
00157     // From start
00158     transmission_weights(0,2) = 0;    // to negative
00159     transmission_weights(0,3) = 0;    // to positive
00160     // From negative
00161     transmission_weights(2,1) = 0;    // to stop
00162     transmission_weights(2,2) = w[0]; // to negative
00163     transmission_weights(2,3) = w[1]; // to positive
00164     // From positive
00165     transmission_weights(3,1) = 0;    // to stop
00166     transmission_weights(3,2) = w[3]; // to positive
00167     transmission_weights(3,3) = w[2]; // to negative
00168 }
00169 
00170 void CTwoStateModel::weights_to_vector(SGVector< float64_t >& psi,
00171         SGMatrix< float64_t > transmission_weights,
00172         SGVector< float64_t > emission_weights,
00173         int32_t num_feats, int32_t num_obs) const
00174 {
00175     // Legend for state indices:
00176     // 0 -> start state
00177     // 1 -> stop state
00178     // 2 -> negative state
00179     // 3 -> positive state
00180     psi[0] = transmission_weights(2,2);
00181     psi[1] = transmission_weights(2,3);
00182     psi[2] = transmission_weights(3,3);
00183     psi[3] = transmission_weights(3,2);
00184 
00185     // start and stop states have no emission scores
00186     index_t obs_idx, psi_idx = m_num_transmission_params;
00187     for ( int32_t s = 2 ; s < m_num_states ; ++s )
00188     {
00189         for ( int32_t f = 0 ; f < num_feats ; ++f )
00190         {
00191             for ( int32_t o = 0 ; o < num_obs ; ++o )
00192             {
00193                 obs_idx = s*num_feats*num_obs + f*num_obs + o;
00194                 psi[psi_idx++] = emission_weights[obs_idx];
00195             }
00196         }
00197     }
00198 
00199 }
00200 
00201 SGVector< int32_t > CTwoStateModel::get_monotonicity(int32_t num_free_states,
00202         int32_t num_feats) const
00203 {
00204     REQUIRE(num_free_states == 2, "Using the TwoStateModel only two states are free\n");
00205 
00206     SGVector< int32_t > monotonicity(num_feats*num_free_states);
00207 
00208     for ( int32_t i = 0 ; i < num_feats ; ++i )
00209         monotonicity[i] = -1;
00210     for ( int32_t i = num_feats ; i < 2*num_feats ; ++i )
00211         monotonicity[i] = +1;
00212 
00213     return monotonicity;
00214 }
00215 
00216 CHMSVMModel* CTwoStateModel::simulate_two_state_data()
00217 {
00218     // Number of examples
00219     int32_t num_exm = 1000;
00220     // Length of each example sequence
00221     int32_t exm_len = 250;
00222     // Number of different states
00223     int32_t num_states = 2;
00224     // Total number of features
00225     int32_t num_features = 10;
00226     // Number of features to be pure noise
00227     int32_t num_noise_features = 2;
00228     // Min and max length of positive block
00229     int32_t block_len[] = {10, 100};
00230     // Min and max number of positive blocks per example
00231     int32_t num_blocks[] = {0, 3};
00232 
00233     // Proportion of wrong labels
00234     float64_t prop_distort = 0.2;
00235     // Standard deviation of Gaussian noise
00236     float64_t noise_std = 4;
00237 
00238     // Generate label sequence randomly containing from num_blocks[0] to
00239     // num_blocks[1] blocks of positive labels each of length between
00240     // block_len[0] and block_len[1]
00241 
00242     CHMSVMLabels* labels = new CHMSVMLabels(num_exm, num_states);
00243     SGVector< int32_t > ll(num_exm*exm_len);
00244     ll.zero();
00245     int32_t rnb, rl, rp;
00246 
00247     for ( int32_t i = 0 ; i < num_exm ; ++i)
00248     {
00249         SGVector< int32_t > lab(exm_len);
00250         lab.zero();
00251         rnb = num_blocks[0] + CMath::ceil((num_blocks[1]-num_blocks[0])*
00252             CMath::random(0.0, 1.0)) - 1;
00253 
00254         for ( int32_t j = 0 ; j < rnb ; ++j )
00255         {
00256             rl = block_len[0] + CMath::ceil((block_len[1]-block_len[0])*
00257                 CMath::random(0.0, 1.0)) - 1;
00258             rp = CMath::ceil((exm_len-rl)*CMath::random(0.0, 1.0));
00259 
00260             for ( int32_t idx = rp-1 ; idx < rp+rl ; ++idx )
00261             {
00262                 lab[idx] = 1;
00263                 ll[i*exm_len + idx] = 1;
00264             }
00265         }
00266 
00267         labels->add_label(lab);
00268     }
00269 
00270     // Generate features by
00271     // i) introducing label noise, i.e. flipping a propotion prop_distort
00272     // of labels and
00273     // ii) adding Gaussian noise to the (distorted) label sequence
00274 
00275     SGVector< int32_t >   distort(num_exm*exm_len);
00276     SGVector< int32_t >   d1(CMath::round(distort.vlen*prop_distort));
00277     SGVector< int32_t >   d2(d1.vlen);
00278     SGVector< int32_t >   lf;
00279     SGMatrix< float64_t > signal(num_features, distort.vlen);
00280 
00281     for ( int32_t i = 0 ; i < num_features ; ++i )
00282     {
00283         lf = ll;
00284         distort.randperm();
00285 
00286         for ( int32_t j = 0 ; j < d1.vlen ; ++j )
00287             d1[j] = distort[j];
00288 
00289         for ( int32_t j = 0 ; j < d2.vlen ; ++j )
00290             d2[j] = distort[ distort.vlen-d2.vlen+j ];
00291 
00292         for ( int32_t j = 0 ; j < d1.vlen ; ++j )
00293             lf[ d1[j] ] = lf[ d2[j] ];
00294 
00295         int32_t idx = i*signal.num_cols;
00296         for ( int32_t j = 0 ; j < signal.num_cols ; ++j )
00297             signal[idx++] = lf[j] + noise_std*CMath::normal_random((float64_t)0.0, 1.0);
00298     }
00299 
00300     // Substitute some features by pure noise
00301     SGVector< int32_t > ridx(num_features);
00302     ridx.randperm();
00303     for ( int32_t i = 0 ; i < num_noise_features ; ++i )
00304     {
00305         int32_t idx = i*signal.num_cols;
00306         for ( int32_t j = 0 ; j < signal.num_cols ; ++j )
00307             signal[idx++] = noise_std*CMath::normal_random((float64_t)0.0, 1.0);
00308     }
00309 
00310     CMatrixFeatures< float64_t >* features =
00311         new CMatrixFeatures< float64_t >(signal, exm_len, num_exm);
00312 
00313     return new CHMSVMModel(features, labels, SMT_TWO_STATE, 3);
00314 }