SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DynProg.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 2 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2007 Soeren Sonnenburg
8  * Written (W) 1999-2007 Gunnar Raetsch
9  * Written (W) 2008-2009 Jonas Behr
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  */
12 
15 #include <shogun/io/SGIO.h>
16 #include <shogun/lib/config.h>
19 #include <shogun/structure/Plif.h>
21 
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <time.h>
25 #include <ctype.h>
26 
27 using namespace shogun;
28 
29 //#define USE_TMP_ARRAYCLASS
30 //#define DYNPROG_DEBUG
31 
32 int32_t CDynProg::word_degree_default[4]={3,4,5,6} ;
33 int32_t CDynProg::cum_num_words_default[5]={0,64,320,1344,5440} ;
34 int32_t CDynProg::frame_plifs[3]={4,5,6};
35 int32_t CDynProg::num_words_default[4]= {64,256,1024,4096} ;
36 int32_t CDynProg::mod_words_default[32] = {1,1,1,1,1,1,1,1,
37  1,1,1,1,1,1,1,1,
38  0,0,0,0,0,0,0,0,
39  0,0,0,0,0,0,0,0} ;
40 bool CDynProg::sign_words_default[16] = {true,true,true,true,true,true,true,true,
41  false,false,false,false,false,false,false,false} ; // whether to use counts or signum of counts
42 int32_t CDynProg::string_words_default[16] = {0,0,0,0,0,0,0,0,
43  1,1,1,1,1,1,1,1} ; // which string should be used
44 
45 CDynProg::CDynProg(int32_t num_svms /*= 8 */)
46 : CSGObject(), m_transition_matrix_a_id(1,1), m_transition_matrix_a(1,1),
47  m_transition_matrix_a_deriv(1,1), m_initial_state_distribution_p(1),
48  m_initial_state_distribution_p_deriv(1), m_end_state_distribution_q(1),
49  m_end_state_distribution_q_deriv(1),
50 
51  // multi svm
52  m_num_degrees(4),
53  m_num_svms(num_svms),
54  m_word_degree(word_degree_default, m_num_degrees, true, true),
55  m_cum_num_words(cum_num_words_default, m_num_degrees+1, true, true),
56  m_cum_num_words_array(m_cum_num_words.get_array()),
57  m_num_words(num_words_default, m_num_degrees, true, true),
58  m_num_words_array(m_num_words.get_array()),
59  m_mod_words(mod_words_default, m_num_svms, 2, true, true),
60  m_mod_words_array(m_mod_words.get_array()),
61  m_sign_words(sign_words_default, m_num_svms, true, true),
62  m_sign_words_array(m_sign_words.get_array()),
63  m_string_words(string_words_default, m_num_svms, true, true),
64  m_string_words_array(m_string_words.get_array()),
65  //m_svm_pos_start(m_num_degrees),
66  m_num_unique_words(m_num_degrees),
67  m_svm_arrays_clean(true),
68 
69  m_max_a_id(0), m_observation_matrix(1,1,1),
70  m_pos(1),
71  m_seq_len(0),
72  m_orf_info(1,2),
73  m_plif_list(1),
74  m_PEN(1,1),
75  m_genestr(1), m_wordstr(NULL), m_dict_weights(1,1), m_segment_loss(1,1,2),
76  m_segment_ids(1),
77  m_segment_mask(1),
78  m_my_state_seq(1),
79  m_my_pos_seq(1),
80  m_my_scores(1),
81  m_my_losses(1),
82  m_scores(1),
83  m_states(1,1),
84  m_positions(1,1),
85 
86  m_seq_sparse1(NULL),
87  m_seq_sparse2(NULL),
88  m_plif_matrices(NULL),
89 
90  m_genestr_stop(1),
91  m_intron_list(NULL),
92  m_num_intron_plifs(0),
93  m_lin_feat(1,1), //by Jonas
94  m_raw_intensities(NULL),
95  m_probe_pos(NULL),
96  m_num_probes_cum(NULL),
97  m_num_lin_feat_plifs_cum(NULL),
98  m_num_raw_data(0),
99 
100  m_long_transitions(true),
101  m_long_transition_threshold(1000)
102 {
103  trans_list_forward = NULL ;
104  trans_list_forward_cnt = NULL ;
105  trans_list_forward_val = NULL ;
106  trans_list_forward_id = NULL ;
107  trans_list_len = 0 ;
108 
109  mem_initialized = true ;
110 
111  m_N=1;
112 
113  m_raw_intensities = NULL;
114  m_probe_pos = NULL;
115  m_num_probes_cum = SG_MALLOC(int32_t, 100);
116  m_num_probes_cum[0] = 0;
117  //m_use_tiling=false;
118  m_num_lin_feat_plifs_cum = SG_MALLOC(int32_t, 100);
120  m_num_raw_data = 0;
121 #ifdef ARRAY_STATISTICS
122  m_word_degree.set_array_name("word_degree");
123 #endif
124 
125  m_transition_matrix_a_id.set_array_name("transition_matrix_a_id");
126  m_transition_matrix_a.set_array_name("transition_matrix_a");
127  m_transition_matrix_a_deriv.set_array_name("transition_matrix_a_deriv");
128  m_mod_words.set_array_name("mod_words");
129  m_orf_info.set_array_name("orf_info");
130  m_segment_sum_weights.set_array_name("segment_sum_weights");
131  m_PEN.set_array_name("PEN");
132  m_PEN_state_signals.set_array_name("PEN_state_signals");
133  m_dict_weights.set_array_name("dict_weights");
134  m_states.set_array_name("states");
135  m_positions.set_array_name("positions");
136  m_lin_feat.set_array_name("lin_feat");
137 
138 
139  m_observation_matrix.set_array_name("m_observation_matrix");
140  m_segment_loss.set_array_name("m_segment_loss");
142 }
143 
145 {
146  if (trans_list_forward_cnt)
147  SG_FREE(trans_list_forward_cnt);
148  if (trans_list_forward)
149  {
150  for (int32_t i=0; i<trans_list_len; i++)
151  {
152  if (trans_list_forward[i])
153  SG_FREE(trans_list_forward[i]);
154  }
155  SG_FREE(trans_list_forward);
156  }
157  if (trans_list_forward_val)
158  {
159  for (int32_t i=0; i<trans_list_len; i++)
160  {
161  if (trans_list_forward_val[i])
162  SG_FREE(trans_list_forward_val[i]);
163  }
164  SG_FREE(trans_list_forward_val);
165  }
166  if (trans_list_forward_id)
167  {
168  for (int32_t i=0; i<trans_list_len; i++)
169  {
170  if (trans_list_forward_id[i])
171  SG_FREE(trans_list_forward_id[i]);
172  }
173  SG_FREE(trans_list_forward_id);
174  }
175  if (m_raw_intensities)
177  if (m_probe_pos)
179  if (m_num_probes_cum)
183 
184  delete m_intron_list;
185 
189 }
190 
193 {
194  return m_num_svms;
195 }
196 
198 {
199  int32_t length=m_genestr.get_dim1();
200 
201  m_genestr_stop.resize_array(length) ;
203  m_genestr_stop.set_array_name("genestr_stop") ;
204  {
205  for (int32_t i=0; i<length-2; i++)
206  if ((m_genestr[i]=='t' || m_genestr[i]=='T') &&
207  (((m_genestr[i+1]=='a' || m_genestr[i+1]=='A') &&
208  (m_genestr[i+2]=='a' || m_genestr[i+2]=='g' || m_genestr[i+2]=='A' || m_genestr[i+2]=='G')) ||
209  ((m_genestr[i+1]=='g'||m_genestr[i+1]=='G') && (m_genestr[i+2]=='a' || m_genestr[i+2]=='A') )))
210  {
211  m_genestr_stop.element(i)=true ;
212  }
213  else
214  m_genestr_stop.element(i)=false ;
215  m_genestr_stop.element(length-2)=false ;
216  m_genestr_stop.element(length-1)=false ;
217  }
218 }
219 
220 void CDynProg::set_num_states(int32_t p_N)
221 {
222  m_N=p_N ;
223 
231 
234 }
235 
237 {
238  return m_N;
239 }
240 
242  int32_t* probe_pos, float64_t* intensities, const int32_t num_probes)
243 {
244  m_num_raw_data++;
246 
247  int32_t* tmp_probe_pos = SG_MALLOC(int32_t, m_num_probes_cum[m_num_raw_data]);
248  float64_t* tmp_raw_intensities = SG_MALLOC(float64_t, m_num_probes_cum[m_num_raw_data]);
249 
250 
251  if (m_num_raw_data==1){
252  memcpy(tmp_probe_pos, probe_pos, num_probes*sizeof(int32_t));
253  memcpy(tmp_raw_intensities, intensities, num_probes*sizeof(float64_t));
254  //SG_PRINT("raw_intens:%f \n",*tmp_raw_intensities+2);
255  }else{
256  memcpy(tmp_probe_pos, m_probe_pos, m_num_probes_cum[m_num_raw_data-1]*sizeof(int32_t));
257  memcpy(tmp_raw_intensities, m_raw_intensities, m_num_probes_cum[m_num_raw_data-1]*sizeof(float64_t));
258  memcpy(tmp_probe_pos+m_num_probes_cum[m_num_raw_data-1], probe_pos, num_probes*sizeof(int32_t));
259  memcpy(tmp_raw_intensities+m_num_probes_cum[m_num_raw_data-1], intensities, num_probes*sizeof(float64_t));
260  }
263  m_probe_pos = tmp_probe_pos; //SG_MALLOC(int32_t, num_probes);
264  m_raw_intensities = tmp_raw_intensities;//SG_MALLOC(float64_t, num_probes);
265 
266  //memcpy(m_probe_pos, probe_pos, num_probes*sizeof(int32_t));
267  //memcpy(m_raw_intensities, intensities, num_probes*sizeof(float64_t));
268 
269 }
270 
271 void CDynProg::init_content_svm_value_array(const int32_t p_num_svms)
272 {
273  m_lin_feat.resize_array(p_num_svms, m_seq_len);
274 
275  // initialize array
276  for (int s=0; s<p_num_svms; s++)
277  for (int p=0; p<m_seq_len; p++)
278  m_lin_feat.set_element(0.0, s, p) ;
279 }
280 
281 void CDynProg::resize_lin_feat(const int32_t num_new_feat)
282 {
283  int32_t dim1, dim2;
284  m_lin_feat.get_array_size(dim1, dim2);
286  ASSERT(dim2==m_seq_len); // == number of candidate positions
287 
288 
289 
290  float64_t* arr = m_lin_feat.get_array();
291  float64_t* tmp = SG_MALLOC(float64_t, (dim1+num_new_feat)*dim2);
292  memset(tmp, 0, (dim1+num_new_feat)*dim2*sizeof(float64_t)) ;
293  for(int32_t j=0;j<m_seq_len;j++)
294  for(int32_t k=0;k<m_num_lin_feat_plifs_cum[m_num_raw_data-1];k++)
295  tmp[j*(dim1+num_new_feat)+k] = arr[j*dim1+k];
296 
297  m_lin_feat.set_array(tmp, dim1+num_new_feat,dim2, true, true);// copy array and free it later
298  SG_FREE(tmp);
299 
300  /*for(int32_t j=0;j<5;j++)
301  {
302  for(int32_t k=0;k<m_num_lin_feat_plifs_cum[m_num_raw_data];k++)
303  {
304  SG_PRINT("(%i,%i)%f ",k,j,m_lin_feat.get_element(k,j));
305  }
306  SG_PRINT("\n");
307  }
308  m_lin_feat.get_array_size(dim1,dim2);
309  SG_PRINT("resize_lin_feat: dim1:%i, dim2:%i\n",dim1,dim2);*/
310 
311  //SG_PRINT("resize_lin_feat: done\n");
312 }
313 
315  CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs)
316 {
318  float64_t* tiling_plif = SG_MALLOC(float64_t, num_tiling_plifs);
321  svm_value[i]=0.0;
322  int32_t* tiling_rows = SG_MALLOC(int32_t, num_tiling_plifs);
323  for (int32_t i=0; i<num_tiling_plifs; i++)
324  {
325  tiling_plif[i]=0.0;
326  CPlif * plif = PEN[tiling_plif_ids[i]];
327  tiling_rows[i] = plif->get_use_svm();
328 
329  ASSERT(tiling_rows[i]-1==m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i)
330  }
331  resize_lin_feat(num_tiling_plifs);
332 
333 
334  int32_t* p_tiling_pos = &m_probe_pos[m_num_probes_cum[m_num_raw_data-1]];
335  float64_t* p_tiling_data = &m_raw_intensities[m_num_probes_cum[m_num_raw_data-1]];
336  int32_t num=m_num_probes_cum[m_num_raw_data-1];
337 
338  for (int32_t pos_idx=0;pos_idx<m_seq_len;pos_idx++)
339  {
340  while (num<m_num_probes_cum[m_num_raw_data]&&*p_tiling_pos<m_pos[pos_idx])
341  {
342  for (int32_t i=0; i<num_tiling_plifs; i++)
343  {
344  svm_value[m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i]=*p_tiling_data;
345  CPlif * plif = PEN[tiling_plif_ids[i]];
346  ASSERT(m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i==plif->get_use_svm()-1);
347  plif->set_do_calc(true);
348  tiling_plif[i]+=plif->lookup_penalty(0,svm_value);
349  plif->set_do_calc(false);
350  }
351  p_tiling_data++;
352  p_tiling_pos++;
353  num++;
354  }
355  for (int32_t i=0; i<num_tiling_plifs; i++)
356  m_lin_feat.set_element(tiling_plif[i],tiling_rows[i]-1,pos_idx);
357  }
358  SG_FREE(svm_value);
359  SG_FREE(tiling_plif);
360  SG_FREE(tiling_rows);
361 }
362 
364 {
366  m_wordstr=SG_MALLOC(uint16_t**, 5440);
367  int32_t k=0;
368  int32_t genestr_len=m_genestr.get_dim1();
369 
370  m_wordstr[k]=SG_MALLOC(uint16_t*, m_num_degrees);
371  for (int32_t j=0; j<m_num_degrees; j++)
372  {
373  m_wordstr[k][j]=NULL ;
374  {
375  m_wordstr[k][j]=SG_MALLOC(uint16_t, genestr_len);
376  for (int32_t i=0; i<genestr_len; i++)
377  switch (m_genestr[i])
378  {
379  case 'A':
380  case 'a': m_wordstr[k][j][i]=0 ; break ;
381  case 'C':
382  case 'c': m_wordstr[k][j][i]=1 ; break ;
383  case 'G':
384  case 'g': m_wordstr[k][j][i]=2 ; break ;
385  case 'T':
386  case 't': m_wordstr[k][j][i]=3 ; break ;
387  default: ASSERT(0) ;
388  }
390  }
391  }
392 }
393 
395 {
396  for (int32_t s=0; s<m_num_svms; s++)
397  m_lin_feat.set_element(0.0, s, 0);
398 
399  for (int32_t p=0 ; p<m_seq_len-1 ; p++)
400  {
401  int32_t from_pos = m_pos[p];
402  int32_t to_pos = m_pos[p+1];
403  float64_t* my_svm_values_unnormalized = SG_MALLOC(float64_t, m_num_svms);
404  //SG_PRINT("%i(%i->%i) ",p,from_pos, to_pos);
405 
406  ASSERT(from_pos<=m_genestr.get_dim1());
407  ASSERT(to_pos<=m_genestr.get_dim1());
408 
409  for (int32_t s=0; s<m_num_svms; s++)
410  my_svm_values_unnormalized[s]=0.0;//precomputed_svm_values.element(s,p);
411 
412  for (int32_t i=from_pos; i<to_pos; i++)
413  {
414  for (int32_t j=0; j<m_num_degrees; j++)
415  {
416  uint16_t word = m_wordstr[0][j][i] ;
417  for (int32_t s=0; s<m_num_svms; s++)
418  {
419  // check if this k-mer should be considered for this SVM
420  if (m_mod_words.get_element(s,0)==3 && i%3!=m_mod_words.get_element(s,1))
421  continue;
422  my_svm_values_unnormalized[s] += m_dict_weights[(word+m_cum_num_words_array[j])+s*m_cum_num_words_array[m_num_degrees]] ;
423  }
424  }
425  }
426  for (int32_t s=0; s<m_num_svms; s++)
427  {
428  float64_t prev = m_lin_feat.get_element(s, p);
429  //SG_PRINT("elem (%i, %i, %f)\n", s, p, prev) ;
430  if (prev<-1e20 || prev>1e20)
431  {
432  SG_ERROR("initialization missing (%i, %i, %f)\n", s, p, prev) ;
433  prev=0 ;
434  }
435  m_lin_feat.set_element(prev + my_svm_values_unnormalized[s], s, p+1);
436  }
437  SG_FREE(my_svm_values_unnormalized);
438  }
439  //for (int32_t j=0; j<m_num_degrees; j++)
440  // SG_FREE(m_wordstr[0][j]);
441  //SG_FREE(m_wordstr[0]);
442 }
443 
445 {
446  if (!(p.vlen==m_N))
447  SG_ERROR("length of start prob vector p (%i) is not equal to the number of states (%i), N: %i\n",p.vlen, m_N);
448 
450 }
451 
453 {
454  if (!(q.vlen==m_N))
455  SG_ERROR("length of end prob vector q (%i) is not equal to the number of states (%i), N: %i\n",q.vlen, m_N);
457 }
458 
460 {
461  ASSERT(a.num_cols==m_N);
462  ASSERT(a.num_rows==m_N);
463  m_transition_matrix_a.set_array(a.matrix, m_N, m_N, true, true);
465 }
466 
468 {
469  ASSERT(a.num_cols==m_N);
470  ASSERT(a.num_rows==m_N);
472  m_max_a_id = 0;
473  for (int32_t i=0; i<m_N; i++)
474  {
475  for (int32_t j=0; j<m_N; j++)
477  }
478 }
479 
481 {
482  int32_t num_trans=a_trans.num_rows;
483  int32_t num_cols=a_trans.num_cols;
484 
485  //CMath::display_matrix(a_trans.matrix,num_trans, num_cols,"a_trans");
486 
487  if (!((num_cols==3) || (num_cols==4)))
488  SG_ERROR("!((num_cols==3) || (num_cols==4)), num_cols: %i\n",num_cols);
489 
490  SG_FREE(trans_list_forward);
491  SG_FREE(trans_list_forward_cnt);
492  SG_FREE(trans_list_forward_val);
493  SG_FREE(trans_list_forward_id);
494 
495  trans_list_forward = NULL ;
496  trans_list_forward_cnt = NULL ;
497  trans_list_forward_val = NULL ;
498  trans_list_len = 0 ;
499 
502 
503  mem_initialized = true ;
504 
505  trans_list_forward_cnt=NULL ;
506  trans_list_len = m_N ;
507  trans_list_forward = SG_MALLOC(T_STATES*, m_N);
508  trans_list_forward_cnt = SG_MALLOC(T_STATES, m_N);
509  trans_list_forward_val = SG_MALLOC(float64_t*, m_N);
510  trans_list_forward_id = SG_MALLOC(int32_t*, m_N);
511 
512  int32_t start_idx=0;
513  for (int32_t j=0; j<m_N; j++)
514  {
515  int32_t old_start_idx=start_idx;
516 
517  while (start_idx<num_trans && a_trans.matrix[start_idx+num_trans]==j)
518  {
519  start_idx++;
520 
521  if (start_idx>1 && start_idx<num_trans)
522  ASSERT(a_trans.matrix[start_idx+num_trans-1] <= a_trans.matrix[start_idx+num_trans]);
523  }
524 
525  if (start_idx>1 && start_idx<num_trans)
526  ASSERT(a_trans.matrix[start_idx+num_trans-1] <= a_trans.matrix[start_idx+num_trans]);
527 
528  int32_t len=start_idx-old_start_idx;
529  ASSERT(len>=0);
530 
531  trans_list_forward_cnt[j] = 0 ;
532 
533  if (len>0)
534  {
535  trans_list_forward[j] = SG_MALLOC(T_STATES, len);
536  trans_list_forward_val[j] = SG_MALLOC(float64_t, len);
537  trans_list_forward_id[j] = SG_MALLOC(int32_t, len);
538  }
539  else
540  {
541  trans_list_forward[j] = NULL;
542  trans_list_forward_val[j] = NULL;
543  trans_list_forward_id[j] = NULL;
544  }
545  }
546 
547  for (int32_t i=0; i<num_trans; i++)
548  {
549  int32_t from_state = (int32_t)a_trans.matrix[i] ;
550  int32_t to_state = (int32_t)a_trans.matrix[i+num_trans] ;
551  float64_t val = a_trans.matrix[i+num_trans*2] ;
552  int32_t id = 0 ;
553  if (num_cols==4)
554  id = (int32_t)a_trans.matrix[i+num_trans*3] ;
555  //SG_DEBUG( "id=%i\n", id) ;
556 
557  ASSERT(to_state>=0 && to_state<m_N);
558  ASSERT(from_state>=0 && from_state<m_N);
559 
560  trans_list_forward[to_state][trans_list_forward_cnt[to_state]]=from_state ;
561  trans_list_forward_val[to_state][trans_list_forward_cnt[to_state]]=val ;
562  trans_list_forward_id[to_state][trans_list_forward_cnt[to_state]]=id ;
563  trans_list_forward_cnt[to_state]++ ;
564  m_transition_matrix_a.element(from_state, to_state) = val ;
565  m_transition_matrix_a_id.element(from_state, to_state) = id ;
566  //SG_PRINT("from_state:%i to_state:%i trans_matrix_a_id:%i \n",from_state, to_state,m_transition_matrix_a_id.element(from_state, to_state));
567  } ;
568 
569  m_max_a_id = 0 ;
570  for (int32_t i=0; i<m_N; i++)
571  for (int32_t j=0; j<m_N; j++)
572  {
573  //if (m_transition_matrix_a_id.element(i,j))
574  //SG_DEBUG( "(%i,%i)=%i\n", i,j, m_transition_matrix_a_id.element(i,j)) ;
576  }
577  //SG_DEBUG( "m_max_a_id=%i\n", m_max_a_id) ;
578 }
579 
580 
582 {
583  //for (int32_t i=0; i<mod_words_input.num_cols; i++)
584  //{
585  // for (int32_t j=0; j<mod_words_input.num_rows; j++)
586  // SG_PRINT("%i ",mod_words_input[i*mod_words_input.num_rows+j]);
587  // SG_PRINT("\n");
588  //}
589  m_svm_arrays_clean=false ;
590 
591  ASSERT(m_num_svms==mod_words_input.num_rows);
592  ASSERT(mod_words_input.num_cols==2);
593 
594  m_mod_words.set_array(mod_words_input.matrix, mod_words_input.num_rows, 2, true, true) ;
596 
597  /*SG_DEBUG( "m_mod_words=[") ;
598  for (int32_t i=0; i<mod_words_input.num_rows; i++)
599  SG_DEBUG( "%i, ", p_mod_words_array[i]) ;
600  SG_DEBUG( "]\n") ;*/
601 }
602 
604 {
605  //SG_DEBUG( "wd_dim1=%d, m_cum_num_words=%d, m_num_words=%d, m_svm_pos_start=%d, num_uniq_w=%d, mod_words_dims=(%d,%d), sign_w=%d,string_w=%d\n m_num_degrees=%d, m_num_svms=%d, m_num_strings=%d", m_word_degree.get_dim1(), m_cum_num_words.get_dim1(), m_num_words.get_dim1(), m_svm_pos_start.get_dim1(), m_num_unique_words.get_dim1(), m_mod_words.get_dim1(), m_mod_words.get_dim2(), m_sign_words.get_dim1(), m_string_words.get_dim1(), m_num_degrees, m_num_svms, m_num_strings);
609  //(word_used.get_dim1()==m_num_degrees) &&
610  //(word_used.get_dim2()==m_num_words[m_num_degrees-1]) &&
611  //(word_used.get_dim3()==m_num_strings) &&
612  // (svm_values_unnormalized.get_dim1()==m_num_degrees) &&
613  // (svm_values_unnormalized.get_dim2()==m_num_svms) &&
614  //(m_svm_pos_start.get_dim1()==m_num_degrees) &&
617  (m_mod_words.get_dim2()==2) &&
620  {
621  m_svm_arrays_clean=true ;
622  return true ;
623  }
624  else
625  {
628  (m_mod_words.get_dim2()==2) &&
631  SG_PRINT("OK\n") ;
632  else
633  SG_PRINT("not OK\n") ;
634 
636  SG_WARNING("SVM array: word_degree.get_dim1()!=m_num_degrees") ;
638  SG_WARNING("SVM array: m_cum_num_words.get_dim1()!=m_num_degrees+1") ;
640  SG_WARNING("SVM array: m_num_words.get_dim1()==m_num_degrees") ;
641  //if (!(m_svm_pos_start.get_dim1()==m_num_degrees))
642  // SG_WARNING("SVM array: m_svm_pos_start.get_dim1()!=m_num_degrees") ;
644  SG_WARNING("SVM array: m_num_unique_words.get_dim1()!=m_num_degrees") ;
645  if (!(m_mod_words.get_dim1()==m_num_svms))
646  SG_WARNING("SVM array: m_mod_words.get_dim1()!=num_svms") ;
647  if (!(m_mod_words.get_dim2()==2))
648  SG_WARNING("SVM array: m_mod_words.get_dim2()!=2") ;
649  if (!(m_sign_words.get_dim1()==m_num_svms))
650  SG_WARNING("SVM array: m_sign_words.get_dim1()!=num_svms") ;
652  SG_WARNING("SVM array: m_string_words.get_dim1()!=num_svms") ;
653 
654  m_svm_arrays_clean=false ;
655  return false ;
656  }
657 }
658 
660 {
661  if (seq.num_dims!=3)
662  SG_ERROR("Expected 3-dimensional Matrix\n");
663 
664  int32_t N=seq.dims[0];
665  int32_t cand_pos=seq.dims[1];
666  int32_t max_num_features=seq.dims[2];
667 
668  if (!m_svm_arrays_clean)
669  {
670  SG_ERROR( "SVM arrays not clean") ;
671  return ;
672  } ;
673 
674  ASSERT(N==m_N);
675  ASSERT(cand_pos==m_seq_len);
678 
679  m_observation_matrix.set_array(seq.array, N, m_seq_len, max_num_features, true, true) ;
680 }
682 {
683  return m_seq_len;
684 }
685 
687 {
688  ASSERT(seg_path.num_rows==2);
689  ASSERT(seg_path.num_cols==m_seq_len);
690 
691  if (seg_path.matrix!=NULL)
692  {
693  int32_t *segment_ids = SG_MALLOC(int32_t, m_seq_len);
694  float64_t *segment_mask = SG_MALLOC(float64_t, m_seq_len);
695  for (int32_t i=0; i<m_seq_len; i++)
696  {
697  segment_ids[i] = (int32_t)seg_path.matrix[2*i] ;
698  segment_mask[i] = seg_path.matrix[2*i+1] ;
699  }
700  best_path_set_segment_ids_mask(segment_ids, segment_mask, m_seq_len) ;
701  SG_FREE(segment_ids);
702  SG_FREE(segment_mask);
703  }
704  else
705  {
706  int32_t *izeros = SG_MALLOC(int32_t, m_seq_len);
708  for (int32_t i=0; i<m_seq_len; i++)
709  {
710  izeros[i]=0 ;
711  dzeros[i]=0.0 ;
712  }
713  best_path_set_segment_ids_mask(izeros, dzeros, m_seq_len) ;
714  SG_FREE(izeros);
715  SG_FREE(dzeros);
716  }
717 }
718 
720 {
721  m_pos.set_array(pos.vector, pos.vlen, true, true) ;
722  m_seq_len = pos.vlen;
723 }
724 
726 {
727  if (orf_info.num_cols!=2)
728  SG_ERROR( "orf_info size incorrect %i!=2\n", orf_info.num_cols) ;
729 
730  m_orf_info.set_array(orf_info.matrix, orf_info.num_rows, orf_info.num_cols, true, true) ;
731  m_orf_info.set_array_name("orf_info") ;
732 }
733 
735 {
736  if ((!seq_sparse1 && seq_sparse2) || (seq_sparse1 && !seq_sparse2))
737  SG_ERROR("Sparse features must either both be NULL or both NON-NULL\n");
738 
741 
742  m_seq_sparse1=seq_sparse1;
743  m_seq_sparse2=seq_sparse2;
746 }
747 
749 {
751 
752  m_plif_matrices=pm;
753 
755 }
756 
758 {
759  ASSERT(genestr.vector);
760  ASSERT(genestr.vlen>0);
761 
762  m_genestr.set_array(genestr.vector, genestr.vlen, true, true) ;
763 }
764 
765 void CDynProg::set_my_state_seq(int32_t* my_state_seq)
766 {
767  ASSERT(my_state_seq && m_seq_len>0);
769  for (int32_t i=0; i<m_seq_len; i++)
770  m_my_state_seq[i]=my_state_seq[i];
771 }
772 
773 void CDynProg::set_my_pos_seq(int32_t* my_pos_seq)
774 {
775  ASSERT(my_pos_seq && m_seq_len>0);
777  for (int32_t i=0; i<m_seq_len; i++)
778  m_my_pos_seq[i]=my_pos_seq[i];
779 }
780 
782 {
783  if (m_num_svms!=dictionary_weights.num_cols)
784  {
785  SG_ERROR( "m_dict_weights array does not match num_svms=%i!=%i\n",
786  m_num_svms, dictionary_weights.num_cols) ;
787  }
788 
789  m_dict_weights.set_array(dictionary_weights.matrix, dictionary_weights.num_rows, m_num_svms, true, true) ;
790 
791  // initialize, so it does not bother when not used
798 }
799 
801 {
802  int32_t m=segment_loss.num_rows;
803  int32_t n=segment_loss.num_cols;
804  // here we need two matrices. Store it in one: 2N x N
805  if (2*m!=n)
806  SG_ERROR( "segment_loss should be 2 x quadratic matrix: %i!=%i\n", 2*m, n) ;
807 
808  if (m!=m_max_a_id+1)
809  SG_ERROR( "segment_loss size should match m_max_a_id: %i!=%i\n", m, m_max_a_id+1) ;
810 
811  m_segment_loss.set_array(segment_loss.matrix, m, n/2, 2, true, true) ;
812  /*for (int32_t i=0; i<n; i++)
813  for (int32_t j=0; j<n; j++)
814  SG_DEBUG( "loss(%i,%i)=%f\n", i,j, m_segment_loss.element(0,i,j)) ;*/
815 }
816 
818  int32_t* segment_ids, float64_t* segment_mask, int32_t m)
819 {
820 
822  SG_ERROR("size of segment_ids or segment_mask (%i) does not match the size of the feature matrix (%i)", m, m_observation_matrix.get_dim2());
823  int32_t max_id = 0;
824  for (int32_t i=1;i<m;i++)
825  max_id = CMath::max(max_id,segment_ids[i]);
826  //SG_PRINT("max_id: %i, m:%i\n",max_id, m);
827  m_segment_ids.set_array(segment_ids, m, true, true) ;
828  m_segment_ids.set_array_name("m_segment_ids");
829  m_segment_mask.set_array(segment_mask, m, true, true) ;
830  m_segment_mask.set_array_name("m_segment_mask");
831 
835 }
836 
838 {
840  memcpy(scores.vector,m_scores.get_array(), sizeof(float64_t)*(m_scores.get_dim1()));
841 
842  return scores;
843 }
844 
846 {
848 
849  int32_t sz = sizeof(int32_t)*( m_states.get_dim1() * m_states.get_dim2() );
850  memcpy(states.matrix ,m_states.get_array(),sz);
851 
852  return states;
853 }
854 
856 {
858 
859  int32_t sz = sizeof(int32_t)*(m_positions.get_dim1()*m_positions.get_dim2());
860  memcpy(positions.matrix, m_positions.get_array(),sz);
861 
862  return positions;
863 }
864 
865 void CDynProg::get_path_scores(float64_t** scores, int32_t* seq_len)
866 {
867  ASSERT(scores && seq_len);
868 
869  *seq_len=m_my_scores.get_dim1();
870 
871  int32_t sz = sizeof(float64_t)*(*seq_len);
872 
873  *scores = SG_MALLOC(float64_t, *seq_len);
874  ASSERT(*scores);
875 
876  memcpy(*scores,m_my_scores.get_array(),sz);
877 }
878 
879 void CDynProg::get_path_losses(float64_t** losses, int32_t* seq_len)
880 {
881  ASSERT(losses && seq_len);
882 
883  *seq_len=m_my_losses.get_dim1();
884 
885  int32_t sz = sizeof(float64_t)*(*seq_len);
886 
887  *losses = SG_MALLOC(float64_t, *seq_len);
888  ASSERT(*losses);
889 
890  memcpy(*losses,m_my_losses.get_array(),sz);
891 }
892 
894 
896  int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos,
897  int32_t to)
898 {
899 #ifdef DYNPROG_TIMING_DETAIL
900  MyTime.start() ;
901 #endif
902 
903  if (start<0)
904  start=0 ;
905  if (to<0)
906  to=0 ;
907 
908  int32_t orf_target = orf_to-orf_from ;
909  if (orf_target<0) orf_target+=3 ;
910 
911  int32_t pos ;
912  if (last_pos==to)
913  pos = to-orf_to-3 ;
914  else
915  pos=last_pos ;
916 
917  if (pos<0)
918  {
919 #ifdef DYNPROG_TIMING_DETAIL
920  MyTime.stop() ;
921  orf_time += MyTime.time_diff_sec() ;
922 #endif
923  return true ;
924  }
925 
926  for (; pos>=start; pos-=3)
927  if (m_genestr_stop[pos])
928  {
929 #ifdef DYNPROG_TIMING_DETAIL
930  MyTime.stop() ;
931  orf_time += MyTime.time_diff_sec() ;
932 #endif
933  return false ;
934  }
935 
936 
937  last_pos = CMath::min(pos+3,to-orf_to-3) ;
938 
939 #ifdef DYNPROG_TIMING_DETAIL
940  MyTime.stop() ;
941  orf_time += MyTime.time_diff_sec() ;
942 #endif
943  return true ;
944 }
945 
946 void CDynProg::compute_nbest_paths(int32_t max_num_signals, bool use_orf,
947  int16_t nbest, bool with_loss, bool with_multiple_sequences)
948  {
949 
950  //FIXME we need checks here if all the fields are of right size
951  //SG_PRINT("m_seq_len: %i\n", m_seq_len);
952  //SG_PRINT("m_pos[0]: %i\n", m_pos[0]);
953  //SG_PRINT("\n");
954 
955  //FIXME these variables can go away when compute_nbest_paths uses them
956  //instead of the local pointers below
957  const float64_t* seq_array = m_observation_matrix.get_array();
958  m_scores.resize_array(nbest) ;
961 
962  for (int32_t i=0; i<nbest; i++)
963  {
964  m_scores[i]=-1;
965  for (int32_t j=0; j<m_observation_matrix.get_dim2(); j++)
966  {
967  m_states.element(i,j)=-1;
968  m_positions.element(i,j)=-1;
969  }
970  }
971  float64_t* prob_nbest=m_scores.get_array();
972  int32_t* my_state_seq=m_states.get_array();
973  int32_t* my_pos_seq=m_positions.get_array();
974 
975  CPlifBase** Plif_matrix=m_plif_matrices->get_plif_matrix();
976  CPlifBase** Plif_state_signals=m_plif_matrices->get_state_signals();
977  //END FIXME
978 
979 
980 #ifdef DYNPROG_TIMING
981  segment_init_time = 0.0 ;
982  segment_pos_time = 0.0 ;
983  segment_extend_time = 0.0 ;
984  segment_clean_time = 0.0 ;
985  orf_time = 0.0 ;
986  svm_init_time = 0.0 ;
987  svm_pos_time = 0.0 ;
988  svm_clean_time = 0.0 ;
989  inner_loop_time = 0.0 ;
990  content_svm_values_time = 0.0 ;
991  content_plifs_time = 0.0 ;
992  inner_loop_max_time = 0.0 ;
993  long_transition_time = 0.0 ;
994 
995  MyTime2.start() ;
996 #endif
997 
998  if (!m_svm_arrays_clean)
999  {
1000  SG_ERROR( "SVM arrays not clean") ;
1001  return ;
1002  }
1003 
1004 #ifdef DYNPROG_DEBUG
1005  m_transition_matrix_a.set_array_name("transition_matrix");
1010  //SG_PRINT("use_orf = %i\n", use_orf) ;
1011 #endif
1012 
1013  int32_t max_look_back = 1000 ;
1014  bool use_svm = false ;
1015 
1016  SG_DEBUG("m_N:%i, m_seq_len:%i, max_num_signals:%i\n",m_N, m_seq_len, max_num_signals) ;
1017 
1018  //for (int32_t i=0;i<m_N*m_seq_len*max_num_signals;i++)
1019  // SG_PRINT("(%i)%0.2f ",i,seq_array[i]);
1020 
1021  CDynamicObjectArray PEN((CSGObject**) Plif_matrix, m_N, m_N, false, false) ; // 2d, CPlifBase*
1022  PEN.set_array_name("PEN");
1023 
1024  CDynamicObjectArray PEN_state_signals((CSGObject**) Plif_state_signals, m_N, max_num_signals, false, false) ; // 2d, CPlifBase*
1025  PEN_state_signals.set_array_name("state_signals");
1026 
1027  CDynamicArray<float64_t> seq(m_N, m_seq_len) ; // 2d
1028  seq.set_array_name("seq") ;
1029  seq.set_const(0) ;
1030 
1031 #ifdef DYNPROG_DEBUG
1032  SG_PRINT("m_num_raw_data: %i\n",m_num_raw_data);
1033  SG_PRINT("m_num_intron_plifs: %i\n", m_num_intron_plifs);
1034  SG_PRINT("m_num_svms: %i\n", m_num_svms);
1035  SG_PRINT("m_num_lin_feat_plifs_cum: ");
1036  for (int i=0; i<=m_num_raw_data; i++)
1038  SG_PRINT("\n");
1039 #endif
1040 
1042  { // initialize svm_svalue
1044  svm_value[s]=0 ;
1045  }
1046 
1047  { // convert seq_input to seq
1048  // this is independent of the svm values
1049 
1050  CDynamicArray<float64_t> *seq_input=NULL ; // 3d
1051  if (seq_array!=NULL)
1052  {
1053  //SG_PRINT("using dense seq_array\n") ;
1054 
1055  seq_input=new CDynamicArray<float64_t>(seq_array, m_N, m_seq_len, max_num_signals) ;
1056  seq_input->set_array_name("seq_input") ;
1057  //seq_input.display_array() ;
1058 
1059  ASSERT(m_seq_sparse1==NULL) ;
1060  ASSERT(m_seq_sparse2==NULL) ;
1061  } else
1062  {
1063  SG_PRINT("using sparse seq_array\n") ;
1064 
1065  ASSERT(m_seq_sparse1!=NULL) ;
1066  ASSERT(m_seq_sparse2!=NULL) ;
1067  ASSERT(max_num_signals==2) ;
1068  }
1069 
1070  for (int32_t i=0; i<m_N; i++)
1071  for (int32_t j=0; j<m_seq_len; j++)
1072  seq.element(i,j) = 0 ;
1073 
1074  for (int32_t i=0; i<m_N; i++)
1075  for (int32_t j=0; j<m_seq_len; j++)
1076  for (int32_t k=0; k<max_num_signals; k++)
1077  {
1078  if ((PEN_state_signals.element(i,k)==NULL) && (k==0))
1079  {
1080  // no plif
1081  if (seq_input!=NULL)
1082  seq.element(i,j) = seq_input->element(i,j,k) ;
1083  else
1084  {
1085  if (k==0)
1086  seq.element(i,j) = m_seq_sparse1->get_feature(i,j) ;
1087  if (k==1)
1088  seq.element(i,j) = m_seq_sparse2->get_feature(i,j) ;
1089  }
1090  break ;
1091  }
1092  if (PEN_state_signals.element(i,k)!=NULL)
1093  {
1094  if (seq_input!=NULL)
1095  {
1096  // just one plif
1097  if (CMath::is_finite(seq_input->element(i,j,k)))
1098  seq.element(i,j) += ((CPlifBase*) PEN_state_signals.element(i,k))->lookup_penalty(seq_input->element(i,j,k), svm_value) ;
1099  else
1100  // keep infinity values
1101  seq.element(i,j) = seq_input->element(i, j, k) ;
1102  }
1103  else
1104  {
1105  if (k==0)
1106  {
1107  // just one plif
1109  seq.element(i,j) += ((CPlifBase*) PEN_state_signals.element(i,k))->lookup_penalty(m_seq_sparse1->get_feature(i,j), svm_value) ;
1110  else
1111  // keep infinity values
1112  seq.element(i,j) = m_seq_sparse1->get_feature(i, j) ;
1113  }
1114  if (k==1)
1115  {
1116  // just one plif
1118  seq.element(i,j) += ((CPlifBase*) PEN_state_signals.element(i,k))->lookup_penalty(m_seq_sparse2->get_feature(i,j), svm_value) ;
1119  else
1120  // keep infinity values
1121  seq.element(i,j) = m_seq_sparse2->get_feature(i, j) ;
1122  }
1123  }
1124  }
1125  else
1126  break ;
1127  }
1128  delete seq_input;
1129  SG_FREE(svm_value);
1130  }
1131 
1132  // allow longer transitions than look_back
1133  bool long_transitions = m_long_transitions ;
1134  CDynamicArray<int32_t> long_transition_content_start_position(m_N,m_N) ; // 2d
1135  long_transition_content_start_position.set_array_name("long_transition_content_start_position");
1136 #ifdef DYNPROG_DEBUG
1137  CDynamicArray<int32_t> long_transition_content_end_position(m_N,m_N) ; // 2d
1138  long_transition_content_end_position.set_array_name("long_transition_content_end_position");
1139 #endif
1140  CDynamicArray<int32_t> long_transition_content_start(m_N,m_N) ; // 2d
1141  long_transition_content_start.set_array_name("long_transition_content_start");
1142 
1143  CDynamicArray<float64_t> long_transition_content_scores(m_N,m_N) ; // 2d
1144  long_transition_content_scores.set_array_name("long_transition_content_scores");
1145 #ifdef DYNPROG_DEBUG
1146 
1147  CDynamicArray<float64_t> long_transition_content_scores_pen(m_N,m_N) ; // 2d
1148  long_transition_content_scores_pen.set_array_name("long_transition_content_scores_pen");
1149 
1150  CDynamicArray<float64_t> long_transition_content_scores_prev(m_N,m_N) ; // 2d
1151  long_transition_content_scores_prev.set_array_name("long_transition_content_scores_prev");
1152 
1153  CDynamicArray<float64_t> long_transition_content_scores_elem(m_N,m_N) ; // 2d
1154  long_transition_content_scores_elem.set_array_name("long_transition_content_scores_elem");
1155 #endif
1156  CDynamicArray<float64_t> long_transition_content_scores_loss(m_N,m_N) ; // 2d
1157  long_transition_content_scores_loss.set_array_name("long_transition_content_scores_loss");
1158 
1159  if (nbest!=1)
1160  {
1161  SG_ERROR("Long transitions are not supported for nbest!=1") ;
1162  long_transitions = false ;
1163  }
1164  long_transition_content_scores.set_const(-CMath::INFTY);
1165 #ifdef DYNPROG_DEBUG
1166  long_transition_content_scores_pen.set_const(0) ;
1167  long_transition_content_scores_elem.set_const(0) ;
1168  long_transition_content_scores_prev.set_const(0) ;
1169 #endif
1170  if (with_loss)
1171  long_transition_content_scores_loss.set_const(0) ;
1172  long_transition_content_start.set_const(0) ;
1173  long_transition_content_start_position.set_const(0) ;
1174 #ifdef DYNPROG_DEBUG
1175  long_transition_content_end_position.set_const(0) ;
1176 #endif
1177 
1179  { // initialize svm_svalue
1181  svm_value[s]=0 ;
1182  }
1183 
1184  CDynamicArray<int32_t> look_back(m_N,m_N) ; // 2d
1185  look_back.set_array_name("look_back");
1186  //CDynamicArray<int32_t> look_back_orig(m_N,m_N) ;
1187  //look_back.set_array_name("look_back_orig");
1188 
1189 
1190  { // determine maximal length of look-back
1191  for (int32_t i=0; i<m_N; i++)
1192  for (int32_t j=0; j<m_N; j++)
1193  {
1194  look_back.set_element(INT_MAX, i, j) ;
1195  //look_back_orig.set_element(INT_MAX, i, j) ;
1196  }
1197 
1198  for (int32_t j=0; j<m_N; j++)
1199  {
1200  // only consider transitions that are actually allowed
1201  const T_STATES num_elem = trans_list_forward_cnt[j] ;
1202  const T_STATES *elem_list = trans_list_forward[j] ;
1203 
1204  for (int32_t i=0; i<num_elem; i++)
1205  {
1206  T_STATES ii = elem_list[i] ;
1207 
1208  CPlifBase *penij=(CPlifBase*) PEN.element(j, ii) ;
1209  if (penij==NULL)
1210  {
1211  if (long_transitions)
1212  {
1213  look_back.set_element(m_long_transition_threshold, j, ii) ;
1214  //look_back_orig.set_element(m_long_transition_max, j, ii) ;
1215  }
1216  continue ;
1217  }
1218 
1219  /* if the transition is an ORF or we do computation with loss, we have to disable long transitions */
1220  if ((m_orf_info.element(ii,0)!=-1) || (m_orf_info.element(j,1)!=-1) || (!long_transitions))
1221  {
1222  look_back.set_element(CMath::ceil(penij->get_max_value()), j, ii) ;
1223  //look_back_orig.set_element(CMath::ceil(penij->get_max_value()), j, ii) ;
1224  if (CMath::ceil(penij->get_max_value()) > max_look_back)
1225  {
1226  SG_DEBUG( "%d %d -> value: %f\n", ii,j,penij->get_max_value());
1227  max_look_back = (int32_t) (CMath::ceil(penij->get_max_value()));
1228  }
1229  }
1230  else
1231  {
1232  look_back.set_element(CMath::min( (int32_t)CMath::ceil(penij->get_max_value()), m_long_transition_threshold ), j, ii) ;
1233  //look_back_orig.set_element( (int32_t)CMath::ceil(penij->get_max_value()), j, ii) ;
1234  }
1235 
1236  if (penij->uses_svm_values())
1237  use_svm=true ;
1238  }
1239  }
1240  /* make sure max_look_back is at least as long as a long transition */
1241  if (long_transitions)
1242  max_look_back = CMath::max(m_long_transition_threshold, max_look_back) ;
1243 
1244  /* make sure max_look_back is not longer than the whole string */
1245  max_look_back = CMath::min(m_genestr.get_dim1(), max_look_back) ;
1246 
1247  int32_t num_long_transitions = 0 ;
1248  for (int32_t i=0; i<m_N; i++)
1249  for (int32_t j=0; j<m_N; j++)
1250  {
1251  if (look_back.get_element(i,j)==m_long_transition_threshold)
1252  num_long_transitions++ ;
1253  if (look_back.get_element(i,j)==INT_MAX)
1254  {
1255  if (long_transitions)
1256  {
1257  look_back.set_element(m_long_transition_threshold, i, j) ;
1258  //look_back_orig.set_element(m_long_transition_max, i, j) ;
1259  }
1260  else
1261  {
1262  look_back.set_element(max_look_back, i, j) ;
1263  //look_back_orig.set_element(m_long_transition_max, i, j) ;
1264  }
1265  }
1266  }
1267  SG_DEBUG("Using %i long transitions\n", num_long_transitions) ;
1268  }
1269  //SG_PRINT("max_look_back: %i \n", max_look_back) ;
1270 
1271  //SG_PRINT("use_svm=%i, genestr_len: \n", use_svm, m_genestr.get_dim1()) ;
1272  SG_DEBUG("use_svm=%i\n", use_svm) ;
1273 
1274  SG_DEBUG("maxlook: %d m_N: %d nbest: %d \n", max_look_back, m_N, nbest);
1275  const int32_t look_back_buflen = (max_look_back*m_N+1)*nbest ;
1276  SG_DEBUG("look_back_buflen=%i\n", look_back_buflen) ;
1277  /*const float64_t mem_use = (float64_t)(m_seq_len*m_N*nbest*(sizeof(T_STATES)+sizeof(int16_t)+sizeof(int32_t))+
1278  look_back_buflen*(2*sizeof(float64_t)+sizeof(int32_t))+
1279  m_seq_len*(sizeof(T_STATES)+sizeof(int32_t))+
1280  m_genestr.get_dim1()*sizeof(bool))/(1024*1024);*/
1281 
1282  //bool is_big = (mem_use>200) || (m_seq_len>5000) ;
1283 
1284  /*if (is_big)
1285  {
1286  SG_DEBUG("calling compute_nbest_paths: m_seq_len=%i, m_N=%i, lookback=%i nbest=%i\n",
1287  m_seq_len, m_N, max_look_back, nbest) ;
1288  SG_DEBUG("allocating %1.2fMB of memory\n",
1289  mem_use) ;
1290  }*/
1291  ASSERT(nbest<32000) ;
1292 
1293  CDynamicArray<float64_t> delta(m_seq_len, m_N, nbest) ; // 3d
1294  delta.set_array_name("delta");
1295  float64_t* delta_array = delta.get_array() ;
1296  //delta.set_const(0) ;
1297 
1298  CDynamicArray<T_STATES> psi(m_seq_len, m_N, nbest) ; // 3d
1299  psi.set_array_name("psi");
1300  //psi.set_const(0) ;
1301 
1302  CDynamicArray<int16_t> ktable(m_seq_len, m_N, nbest) ; // 3d
1303  ktable.set_array_name("ktable");
1304  //ktable.set_const(0) ;
1305 
1306  CDynamicArray<int32_t> ptable(m_seq_len, m_N, nbest) ; // 3d
1307  ptable.set_array_name("ptable");
1308  //ptable.set_const(0) ;
1309 
1310  CDynamicArray<float64_t> delta_end(nbest) ;
1311  delta_end.set_array_name("delta_end");
1312  //delta_end.set_const(0) ;
1313 
1314  CDynamicArray<T_STATES> path_ends(nbest) ;
1315  path_ends.set_array_name("path_ends");
1316  //path_ends.set_const(0) ;
1317 
1318  CDynamicArray<int16_t> ktable_end(nbest) ;
1319  ktable_end.set_array_name("ktable_end");
1320  //ktable_end.set_const(0) ;
1321 
1322  float64_t * fixedtempvv=SG_MALLOC(float64_t, look_back_buflen);
1323  memset(fixedtempvv, 0, look_back_buflen*sizeof(float64_t)) ;
1324  int32_t * fixedtempii=SG_MALLOC(int32_t, look_back_buflen);
1325  memset(fixedtempii, 0, look_back_buflen*sizeof(int32_t)) ;
1326 
1327  CDynamicArray<float64_t> oldtempvv(look_back_buflen) ;
1328  oldtempvv.set_array_name("oldtempvv");
1329 
1330  CDynamicArray<float64_t> oldtempvv2(look_back_buflen) ;
1331  oldtempvv2.set_array_name("oldtempvv2");
1332  //oldtempvv.set_const(0) ;
1333  //oldtempvv.display_size() ;
1334 
1335  CDynamicArray<int32_t> oldtempii(look_back_buflen) ;
1336  oldtempii.set_array_name("oldtempii");
1337 
1338  CDynamicArray<int32_t> oldtempii2(look_back_buflen) ;
1339  oldtempii2.set_array_name("oldtempii2");
1340  //oldtempii.set_const(0) ;
1341 
1342  CDynamicArray<T_STATES> state_seq(m_seq_len) ;
1343  state_seq.set_array_name("state_seq");
1344  //state_seq.set_const(0) ;
1345 
1347  pos_seq.set_array_name("pos_seq");
1348  //pos_seq.set_const(0) ;
1349 
1350 
1351  m_dict_weights.set_array_name("dict_weights") ;
1352  m_word_degree.set_array_name("word_degree") ;
1353  m_cum_num_words.set_array_name("cum_num_words") ;
1354  m_num_words.set_array_name("num_words") ;
1355  //word_used.set_array_name("word_used") ;
1356  //svm_values_unnormalized.set_array_name("svm_values_unnormalized") ;
1357  //m_svm_pos_start.set_array_name("svm_pos_start") ;
1358  m_num_unique_words.set_array_name("num_unique_words") ;
1359 
1360  PEN.set_array_name("PEN") ;
1361  seq.set_array_name("seq") ;
1362 
1363  delta.set_array_name("delta") ;
1364  psi.set_array_name("psi") ;
1365  ktable.set_array_name("ktable") ;
1366  ptable.set_array_name("ptable") ;
1367  delta_end.set_array_name("delta_end") ;
1368  path_ends.set_array_name("path_ends") ;
1369  ktable_end.set_array_name("ktable_end") ;
1370 
1371 #ifdef USE_TMP_ARRAYCLASS
1372  fixedtempvv.set_array_name("fixedtempvv") ;
1373  fixedtempii.set_array_name("fixedtempvv") ;
1374 #endif
1375 
1376  oldtempvv.set_array_name("oldtempvv") ;
1377  oldtempvv2.set_array_name("oldtempvv2") ;
1378  oldtempii.set_array_name("oldtempii") ;
1379  oldtempii2.set_array_name("oldtempii2") ;
1380 
1381 
1383 
1384 #ifdef DYNPROG_DEBUG
1385  state_seq.display_size() ;
1386  pos_seq.display_size() ;
1387 
1392  //word_used.display_size() ;
1393  //svm_values_unnormalized.display_size() ;
1394  //m_svm_pos_start.display_array() ;
1396 
1397  PEN.display_size() ;
1398  PEN_state_signals.display_size() ;
1399  seq.display_size() ;
1401 
1402  //m_genestr_stop.display_size() ;
1403  delta.display_size() ;
1404  psi.display_size() ;
1405  ktable.display_size() ;
1406  ptable.display_size() ;
1407  delta_end.display_size() ;
1408  path_ends.display_size() ;
1409  ktable_end.display_size() ;
1410 
1411 #ifdef USE_TMP_ARRAYCLASS
1412  fixedtempvv.display_size() ;
1413  fixedtempii.display_size() ;
1414 #endif
1415 
1416  //oldtempvv.display_size() ;
1417  //oldtempii.display_size() ;
1418 
1419  state_seq.display_size() ;
1420  pos_seq.display_size() ;
1421 
1422  //seq.set_const(0) ;
1423 
1424 #endif //DYNPROG_DEBUG
1425 
1427 
1428 
1429 
1430  {
1431  for (int32_t s=0; s<m_num_svms; s++)
1433  }
1434 
1435 
1436  //CDynamicArray<int32_t*> trans_matrix_svms(m_N,m_N); // 2d
1437  //CDynamicArray<int32_t> trans_matrix_num_svms(m_N,m_N); // 2d
1438 
1439  { // initialization
1440 
1441  for (T_STATES i=0; i<m_N; i++)
1442  {
1443  //delta.element(0, i, 0) = get_p(i) + seq.element(i,0) ; // get_p defined in HMM.h to be equiv to initial_state_distribution
1444  delta.element(delta_array, 0, i, 0, m_seq_len, m_N) = get_p(i) + seq.element(i,0) ; // get_p defined in HMM.h to be equiv to initial_state_distribution
1445  psi.element(0,i,0) = 0 ;
1446  if (nbest>1)
1447  ktable.element(0,i,0) = 0 ;
1448  ptable.element(0,i,0) = 0 ;
1449  for (int16_t k=1; k<nbest; k++)
1450  {
1451  int32_t dim1, dim2, dim3 ;
1452  delta.get_array_size(dim1, dim2, dim3) ;
1453  //SG_DEBUG("i=%i, k=%i -- %i, %i, %i\n", i, k, dim1, dim2, dim3) ;
1454  //delta.element(0, i, k) = -CMath::INFTY ;
1455  delta.element(delta_array, 0, i, k, m_seq_len, m_N) = -CMath::INFTY ;
1456  psi.element(0,i,0) = 0 ; // <--- what's this for?
1457  if (nbest>1)
1458  ktable.element(0,i,k) = 0 ;
1459  ptable.element(0,i,k) = 0 ;
1460  }
1461  /*
1462  for (T_STATES j=0; j<m_N; j++)
1463  {
1464  CPlifBase * penalty = PEN.element(i,j) ;
1465  int32_t num_current_svms=0;
1466  int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
1467  if (penalty)
1468  {
1469  SG_PRINT("trans %i -> %i \n",i,j);
1470  penalty->get_used_svms(&num_current_svms, svm_ids);
1471  trans_matrix_svms.set_element(svm_ids,i,j);
1472  for (int32_t l=0;l<num_current_svms;l++)
1473  SG_PRINT("svm_ids[%i]: %i \n",l,svm_ids[l]);
1474  trans_matrix_num_svms.set_element(num_current_svms,i,j);
1475  }
1476  }
1477  */
1478 
1479  }
1480  }
1481 
1482  SG_DEBUG("START_RECURSION \n\n");
1483 
1484  // recursion
1485  for (int32_t t=1; t<m_seq_len; t++)
1486  {
1487  //if (is_big && t%(1+(m_seq_len/1000))==1)
1488  // SG_PROGRESS(t, 0, m_seq_len);
1489  //SG_PRINT("%i\n", t) ;
1490 
1491  for (T_STATES j=0; j<m_N; j++)
1492  {
1493  if (seq.element(j,t)<=-1e20)
1494  { // if we cannot observe the symbol here, then we can omit the rest
1495  for (int16_t k=0; k<nbest; k++)
1496  {
1497  delta.element(delta_array, t, j, k, m_seq_len, m_N) = seq.element(j,t) ;
1498  psi.element(t,j,k) = 0 ;
1499  if (nbest>1)
1500  ktable.element(t,j,k) = 0 ;
1501  ptable.element(t,j,k) = 0 ;
1502  }
1503  }
1504  else
1505  {
1506  const T_STATES num_elem = trans_list_forward_cnt[j] ;
1507  const T_STATES *elem_list = trans_list_forward[j] ;
1508  const float64_t *elem_val = trans_list_forward_val[j] ;
1509  const int32_t *elem_id = trans_list_forward_id[j] ;
1510 
1511  int32_t fixed_list_len = 0 ;
1512  float64_t fixedtempvv_ = CMath::INFTY ;
1513  int32_t fixedtempii_ = 0 ;
1514  bool fixedtemplong = false ;
1515 
1516  for (int32_t i=0; i<num_elem; i++)
1517  {
1518  T_STATES ii = elem_list[i] ;
1519 
1520  const CPlifBase* penalty = (CPlifBase*) PEN.element(j,ii) ;
1521 
1522  /*int32_t look_back = max_look_back ;
1523  if (0)
1524  { // find lookback length
1525  CPlifBase *pen = (CPlifBase*) penalty ;
1526  if (pen!=NULL)
1527  look_back=(int32_t) (CMath::ceil(pen->get_max_value()));
1528  if (look_back>=1e6)
1529  SG_PRINT("%i,%i -> %d from %ld\n", j, ii, look_back, (long)pen) ;
1530  ASSERT(look_back<1e6);
1531  } */
1532 
1533  int32_t look_back_ = look_back.element(j, ii) ;
1534 
1535  int32_t orf_from = m_orf_info.element(ii,0) ;
1536  int32_t orf_to = m_orf_info.element(j,1) ;
1537  if((orf_from!=-1)!=(orf_to!=-1))
1538  SG_DEBUG("j=%i ii=%i orf_from=%i orf_to=%i p=%1.2f\n", j, ii, orf_from, orf_to, elem_val[i]) ;
1539  ASSERT((orf_from!=-1)==(orf_to!=-1)) ;
1540 
1541  int32_t orf_target = -1 ;
1542  if (orf_from!=-1)
1543  {
1544  orf_target=orf_to-orf_from ;
1545  if (orf_target<0)
1546  orf_target+=3 ;
1547  ASSERT(orf_target>=0 && orf_target<3) ;
1548  }
1549 
1550  int32_t orf_last_pos = m_pos[t] ;
1551 #ifdef DYNPROG_TIMING
1552  MyTime3.start() ;
1553 #endif
1554  int32_t num_ok_pos = 0 ;
1555 
1556  for (int32_t ts=t-1; ts>=0 && m_pos[t]-m_pos[ts]<=look_back_; ts--)
1557  {
1558  bool ok ;
1559  //int32_t plen=t-ts;
1560 
1561  /*for (int32_t s=0; s<m_num_svms; s++)
1562  if ((fabs(svs.svm_values[s*svs.seqlen+plen]-svs2.svm_values[s*svs.seqlen+plen])>1e-6) ||
1563  (fabs(svs.svm_values[s*svs.seqlen+plen]-svs3.svm_values[s*svs.seqlen+plen])>1e-6))
1564  {
1565  SG_DEBUG( "s=%i, t=%i, ts=%i, %1.5e, %1.5e, %1.5e\n", s, t, ts, svs.svm_values[s*svs.seqlen+plen], svs2.svm_values[s*svs.seqlen+plen], svs3.svm_values[s*svs.seqlen+plen]);
1566  }*/
1567 
1568  if (orf_target==-1)
1569  ok=true ;
1570  else if (m_pos[ts]!=-1 && (m_pos[t]-m_pos[ts])%3==orf_target)
1571  ok=(!use_orf) || extend_orf(orf_from, orf_to, m_pos[ts], orf_last_pos, m_pos[t]) ;
1572  else
1573  ok=false ;
1574 
1575  if (ok)
1576  {
1577 
1578  float64_t segment_loss = 0.0 ;
1579  if (with_loss)
1580  {
1581  segment_loss = m_seg_loss_obj->get_segment_loss(ts, t, elem_id[i]);
1582  //if (segment_loss!=segment_loss2)
1583  //SG_PRINT("segment_loss:%f segment_loss2:%f\n", segment_loss, segment_loss2);
1584  }
1586  // BEST_PATH_TRANS
1588 
1589  int32_t frame = orf_from;//m_orf_info.element(ii,0);
1590  lookup_content_svm_values(ts, t, m_pos[ts], m_pos[t], svm_value, frame);
1591 
1592  float64_t pen_val = 0.0 ;
1593  if (penalty)
1594  {
1595 #ifdef DYNPROG_TIMING_DETAIL
1596  MyTime.start() ;
1597 #endif
1598  pen_val = penalty->lookup_penalty(m_pos[t]-m_pos[ts], svm_value) ;
1599 
1600 #ifdef DYNPROG_TIMING_DETAIL
1601  MyTime.stop() ;
1602  content_plifs_time += MyTime.time_diff_sec() ;
1603 #endif
1604  }
1605 
1606 #ifdef DYNPROG_TIMING_DETAIL
1607  MyTime.start() ;
1608 #endif
1609  num_ok_pos++ ;
1610 
1611  if (nbest==1)
1612  {
1613  float64_t val = elem_val[i] + pen_val ;
1614  if (with_loss)
1615  val += segment_loss ;
1616 
1617  float64_t mval = -(val + delta.element(delta_array, ts, ii, 0, m_seq_len, m_N)) ;
1618 
1619  if (mval<fixedtempvv_)
1620  {
1621  fixedtempvv_ = mval ;
1622  fixedtempii_ = ii + ts*m_N;
1623  fixed_list_len = 1 ;
1624  fixedtemplong = false ;
1625  }
1626  }
1627  else
1628  {
1629  for (int16_t diff=0; diff<nbest; diff++)
1630  {
1631  float64_t val = elem_val[i] ;
1632  val += pen_val ;
1633  if (with_loss)
1634  val += segment_loss ;
1635 
1636  float64_t mval = -(val + delta.element(delta_array, ts, ii, diff, m_seq_len, m_N)) ;
1637 
1638  /* only place -val in fixedtempvv if it is one of the nbest lowest values in there */
1639  /* fixedtempvv[i], i=0:nbest-1, is sorted so that fixedtempvv[0] <= fixedtempvv[1] <= ...*/
1640  /* fixed_list_len has the number of elements in fixedtempvv */
1641 
1642  if ((fixed_list_len < nbest) || ((0==fixed_list_len) || (mval < fixedtempvv[fixed_list_len-1])))
1643  {
1644  if ( (fixed_list_len<nbest) && ((0==fixed_list_len) || (mval>fixedtempvv[fixed_list_len-1])) )
1645  {
1646  fixedtempvv[fixed_list_len] = mval ;
1647  fixedtempii[fixed_list_len] = ii + diff*m_N + ts*m_N*nbest;
1648  fixed_list_len++ ;
1649  }
1650  else // must have mval < fixedtempvv[fixed_list_len-1]
1651  {
1652  int32_t addhere = fixed_list_len;
1653  while ((addhere > 0) && (mval < fixedtempvv[addhere-1]))
1654  addhere--;
1655 
1656  // move everything from addhere+1 one forward
1657  for (int32_t jj=fixed_list_len-1; jj>addhere; jj--)
1658  {
1659  fixedtempvv[jj] = fixedtempvv[jj-1];
1660  fixedtempii[jj] = fixedtempii[jj-1];
1661  }
1662 
1663  fixedtempvv[addhere] = mval;
1664  fixedtempii[addhere] = ii + diff*m_N + ts*m_N*nbest;
1665 
1666  if (fixed_list_len < nbest)
1667  fixed_list_len++;
1668  }
1669  }
1670  }
1671  }
1672 #ifdef DYNPROG_TIMING_DETAIL
1673  MyTime.stop() ;
1674  inner_loop_max_time += MyTime.time_diff_sec() ;
1675 #endif
1676  }
1677  }
1678 #ifdef DYNPROG_TIMING
1679  MyTime3.stop() ;
1680  inner_loop_time += MyTime3.time_diff_sec() ;
1681 #endif
1682  }
1683  for (int32_t i=0; i<num_elem; i++)
1684  {
1685  T_STATES ii = elem_list[i] ;
1686 
1687  const CPlifBase* penalty = (CPlifBase*) PEN.element(j,ii) ;
1688 
1689  /*int32_t look_back = max_look_back ;
1690  if (0)
1691  { // find lookback length
1692  CPlifBase *pen = (CPlifBase*) penalty ;
1693  if (pen!=NULL)
1694  look_back=(int32_t) (CMath::ceil(pen->get_max_value()));
1695  if (look_back>=1e6)
1696  SG_PRINT("%i,%i -> %d from %ld\n", j, ii, look_back, (long)pen) ;
1697  ASSERT(look_back<1e6);
1698  } */
1699 
1700  int32_t look_back_ = look_back.element(j, ii) ;
1701  //int32_t look_back_orig_ = look_back_orig.element(j, ii) ;
1702 
1703  int32_t orf_from = m_orf_info.element(ii,0) ;
1704  int32_t orf_to = m_orf_info.element(j,1) ;
1705  if((orf_from!=-1)!=(orf_to!=-1))
1706  SG_DEBUG("j=%i ii=%i orf_from=%i orf_to=%i p=%1.2f\n", j, ii, orf_from, orf_to, elem_val[i]) ;
1707  ASSERT((orf_from!=-1)==(orf_to!=-1)) ;
1708 
1709  int32_t orf_target = -1 ;
1710  if (orf_from!=-1)
1711  {
1712  orf_target=orf_to-orf_from ;
1713  if (orf_target<0)
1714  orf_target+=3 ;
1715  ASSERT(orf_target>=0 && orf_target<3) ;
1716  }
1717 
1718  //int32_t loss_last_pos = t ;
1719  //float64_t last_loss = 0.0 ;
1720 
1721 #ifdef DYNPROG_TIMING
1722  MyTime3.start() ;
1723 #endif
1724 
1725  /* long transition stuff */
1726  /* only do this, if
1727  * this feature is enabled
1728  * this is not a transition with ORF restrictions
1729  * the loss is switched off
1730  * nbest=1
1731  */
1732 #ifdef DYNPROG_TIMING
1733  MyTime3.start() ;
1734 #endif
1735  // long transitions, only when not considering ORFs
1736  if ( long_transitions && orf_target==-1 && look_back_ == m_long_transition_threshold )
1737  {
1738 
1739  // update table for 5' part of the long segment
1740 
1741  int32_t start = long_transition_content_start.get_element(ii, j) ;
1742  int32_t end_5p_part = start ;
1743  for (int32_t start_5p_part=start; m_pos[t]-m_pos[start_5p_part] > m_long_transition_threshold ; start_5p_part++)
1744  {
1745  // find end_5p_part, which is greater than start_5p_part and at least m_long_transition_threshold away
1746  while (end_5p_part<=t && m_pos[end_5p_part+1]-m_pos[start_5p_part]<=m_long_transition_threshold)
1747  end_5p_part++ ;
1748 
1749  ASSERT(m_pos[end_5p_part+1]-m_pos[start_5p_part] > m_long_transition_threshold || end_5p_part==t) ;
1750  ASSERT(m_pos[end_5p_part]-m_pos[start_5p_part] <= m_long_transition_threshold) ;
1751 
1752  float64_t pen_val = 0.0;
1753  /* recompute penalty, if necessary */
1754  if (penalty)
1755  {
1756  int32_t frame = m_orf_info.element(ii,0);
1757  lookup_content_svm_values(start_5p_part, end_5p_part, m_pos[start_5p_part], m_pos[end_5p_part], svm_value, frame); // * t -> end_5p_part
1758  pen_val = penalty->lookup_penalty(m_pos[end_5p_part]-m_pos[start_5p_part], svm_value) ;
1759  }
1760 
1761  /*if (m_pos[start_5p_part]==1003)
1762  {
1763  SG_PRINT("Part1: %i - %i vs %i - %i\n", m_pos[t], m_pos[ts], m_pos[end_5p_part], m_pos[start_5p_part]) ;
1764  SG_PRINT("Part1: ts=%i t=%i start_5p_part=%i m_seq_len=%i\n", m_pos[ts], m_pos[t], m_pos[start_5p_part], m_seq_len) ;
1765  }*/
1766 
1767  float64_t mval_trans = -( elem_val[i] + pen_val*0.5 + delta.element(delta_array, start_5p_part, ii, 0, m_seq_len, m_N) ) ;
1768  //float64_t mval_trans = -( elem_val[i] + delta.element(delta_array, ts, ii, 0, m_seq_len, m_N) ) ; // enable this for the incomplete extra check
1769 
1770  float64_t segment_loss_part1=0.0 ;
1771  if (with_loss)
1772  { // this is the loss from the start of the long segment (5' part + middle section)
1773 
1774  segment_loss_part1 = m_seg_loss_obj->get_segment_loss(start_5p_part /*long_transition_content_start_position.get_element(ii,j)*/, end_5p_part, elem_id[i]); // * unsure
1775 
1776  mval_trans -= segment_loss_part1 ;
1777  }
1778 
1779 
1780  if (0)//m_pos[end_5p_part] - m_pos[long_transition_content_start_position.get_element(ii, j)] > look_back_orig_/*m_long_transition_max*/)
1781  {
1782  // this restricts the maximal length of segments,
1783  // but the current implementation is not valid since the
1784  // long transition is discarded without loocking if there
1785  // is a second best long transition in between
1786  long_transition_content_scores.set_element(-CMath::INFTY, ii, j) ;
1787  long_transition_content_start_position.set_element(0, ii, j) ;
1788  if (with_loss)
1789  long_transition_content_scores_loss.set_element(0.0, ii, j) ;
1790 #ifdef DYNPROG_DEBUG
1791  long_transition_content_scores_pen.set_element(0.0, ii, j) ;
1792  long_transition_content_scores_elem.set_element(0.0, ii, j) ;
1793  long_transition_content_scores_prev.set_element(0.0, ii, j) ;
1794  long_transition_content_end_position.set_element(0, ii, j) ;
1795 #endif
1796  }
1797  if (with_loss)
1798  {
1799  float64_t old_loss = long_transition_content_scores_loss.get_element(ii, j) ;
1800  float64_t new_loss = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), end_5p_part, elem_id[i]);
1801  float64_t score = long_transition_content_scores.get_element(ii, j) - old_loss + new_loss ;
1802  long_transition_content_scores.set_element(score, ii, j) ;
1803  long_transition_content_scores_loss.set_element(new_loss, ii, j) ;
1804 #ifdef DYNPROG_DEBUG
1805  long_transition_content_end_position.set_element(end_5p_part, ii, j) ;
1806 #endif
1807 
1808  }
1809  if (-long_transition_content_scores.get_element(ii, j) > mval_trans )
1810  {
1811  /* then the old long transition is either too far away or worse than the current one */
1812  long_transition_content_scores.set_element(-mval_trans, ii, j) ;
1813  long_transition_content_start_position.set_element(start_5p_part, ii, j) ;
1814  if (with_loss)
1815  long_transition_content_scores_loss.set_element(segment_loss_part1, ii, j) ;
1816 #ifdef DYNPROG_DEBUG
1817  long_transition_content_scores_pen.set_element(pen_val*0.5, ii, j) ;
1818  long_transition_content_scores_elem.set_element(elem_val[i], ii, j) ;
1819  long_transition_content_scores_prev.set_element(delta.element(delta_array, start_5p_part, ii, 0, m_seq_len, m_N), ii, j) ;
1820  /*ASSERT(fabs(long_transition_content_scores.get_element(ii, j)-(long_transition_content_scores_pen.get_element(ii, j) +
1821  long_transition_content_scores_elem.get_element(ii, j) +
1822  long_transition_content_scores_prev.get_element(ii, j)))<1e-6) ;*/
1823  long_transition_content_end_position.set_element(end_5p_part, ii, j) ;
1824 #endif
1825  }
1826  //
1827  // this sets the position where the search for better 5'parts is started the next time
1828  // whithout this the prediction takes ages
1829  //
1830  long_transition_content_start.set_element(start_5p_part, ii, j) ;
1831  }
1832 
1833  // consider the 3' part at the end of the long segment:
1834  // * with length = m_long_transition_threshold
1835  // * content prediction and loss only for this part
1836 
1837  // find ts > 0 with distance from m_pos[t] greater m_long_transition_threshold
1838  // precompute: only depends on t
1839  int ts = t;
1840  while (ts>0 && m_pos[t]-m_pos[ts-1] <= m_long_transition_threshold)
1841  ts-- ;
1842 
1843  if (ts>0)
1844  {
1845  ASSERT((m_pos[t]-m_pos[ts-1] > m_long_transition_threshold) && (m_pos[t]-m_pos[ts] <= m_long_transition_threshold)) ;
1846 
1847 
1848  /* only consider this transition, if the right position was found */
1849  float pen_val_3p = 0.0 ;
1850  if (penalty)
1851  {
1852  int32_t frame = orf_from ; //m_orf_info.element(ii, 0);
1853  lookup_content_svm_values(ts, t, m_pos[ts], m_pos[t], svm_value, frame);
1854  pen_val_3p = penalty->lookup_penalty(m_pos[t]-m_pos[ts], svm_value) ;
1855  }
1856 
1857  float64_t mval = -(long_transition_content_scores.get_element(ii, j) + pen_val_3p*0.5) ;
1858 
1859  {
1860 #ifdef DYNPROG_DEBUG
1861  float64_t segment_loss_part2=0.0 ;
1862  float64_t segment_loss_part1=0.0 ;
1863 #endif
1864  float64_t segment_loss_total=0.0 ;
1865 
1866  if (with_loss)
1867  { // this is the loss for the 3' end fragment of the segment
1868  // (the 5' end and the middle section loss is already contained in mval)
1869 
1870 #ifdef DYNPROG_DEBUG
1871  // this is an alternative, which should be identical, if the loss is additive
1872  segment_loss_part2 = m_seg_loss_obj->get_segment_loss_extend(long_transition_content_end_position.get_element(ii,j), t, elem_id[i]);
1873  //mval -= segment_loss_part2 ;
1874  segment_loss_part1 = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), long_transition_content_end_position.get_element(ii,j), elem_id[i]);
1875 #endif
1876  segment_loss_total = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), t, elem_id[i]);
1877  mval -= (segment_loss_total-long_transition_content_scores_loss.get_element(ii, j)) ;
1878  }
1879 
1880 #ifdef DYNPROG_DEBUG
1881  if (m_pos[t]==10108 ||m_pos[t]==12802 ||m_pos[t]== 12561)
1882  {
1883  SG_PRINT("Part2: %i,%i,%i: val=%1.6f pen_val_3p*0.5=%1.6f (t=%i, ts=%i, ts-1=%i, ts+1=%i); scores=%1.6f (pen=%1.6f,prev=%1.6f,elem=%1.6f,loss=%1.1f), positions=%i,%i,%i, loss=%1.1f/%1.1f (%i,%i)\n",
1884  m_pos[t], j, ii, -mval, 0.5*pen_val_3p, m_pos[t], m_pos[ts], m_pos[ts-1], m_pos[ts+1],
1885  long_transition_content_scores.get_element(ii, j),
1886  long_transition_content_scores_pen.get_element(ii, j),
1887  long_transition_content_scores_prev.get_element(ii, j),
1888  long_transition_content_scores_elem.get_element(ii, j),
1889  long_transition_content_scores_loss.get_element(ii, j),
1890  m_pos[long_transition_content_start_position.get_element(ii,j)],
1891  m_pos[long_transition_content_end_position.get_element(ii,j)],
1892  m_pos[long_transition_content_start.get_element(ii,j)], segment_loss_part2, segment_loss_total, long_transition_content_start_position.get_element(ii,j), t) ;
1893  SG_PRINT("fixedtempvv_: %1.6f, from_state:%i from_pos:%i\n ",-fixedtempvv_, (fixedtempii_%m_N), m_pos[(fixedtempii_-(fixedtempii_%(m_N*nbest)))/(m_N*nbest)] );
1894  }
1895 
1896  if (fabs(segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j) - segment_loss_total)>1e-3)
1897  {
1898  SG_ERROR("LOSS: total=%1.1f (%i-%i) part1=%1.1f/%1.1f (%i-%i) part2=%1.1f (%i-%i) sum=%1.1f diff=%1.1f\n",
1899  segment_loss_total, m_pos[long_transition_content_start_position.get_element(ii,j)], m_pos[t],
1900  long_transition_content_scores_loss.get_element(ii, j), segment_loss_part1, m_pos[long_transition_content_start_position.get_element(ii,j)], m_pos[long_transition_content_end_position.get_element(ii,j)],
1901  segment_loss_part2, m_pos[long_transition_content_end_position.get_element(ii,j)], m_pos[t],
1902  segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j),
1903  segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j) - segment_loss_total) ;
1904  }
1905 #endif
1906  }
1907 
1908  // prefer simpler version to guarantee optimality
1909  //
1910  // original:
1911  /* if ((mval < fixedtempvv_) &&
1912  (m_pos[t] - m_pos[long_transition_content_start_position.get_element(ii, j)])<=look_back_orig_) */
1913  if (mval < fixedtempvv_)
1914  {
1915  /* then the long transition is better than the short one => replace it */
1916  int32_t fromtjk = fixedtempii_ ;
1917  /*SG_PRINT("%i,%i: Long transition (%1.5f=-(%1.5f+%1.5f+%1.5f+%1.5f), %i) to m_pos %i better than short transition (%1.5f,%i) to m_pos %i \n",
1918  m_pos[t], j,
1919  mval, pen_val_3p*0.5, long_transition_content_scores_pen.get_element(ii, j), long_transition_content_scores_elem.get_element(ii, j), long_transition_content_scores_prev.get_element(ii, j), ii,
1920  m_pos[long_transition_content_position.get_element(ii, j)],
1921  fixedtempvv_, (fromtjk%m_N), m_pos[(fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)]) ;*/
1922  ASSERT((fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)==0 || m_pos[(fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)]>=m_pos[long_transition_content_start_position.get_element(ii, j)] || fixedtemplong) ;
1923 
1924  fixedtempvv_ = mval ;
1925  fixedtempii_ = ii + m_N*long_transition_content_start_position.get_element(ii, j) ;
1926  fixed_list_len = 1 ;
1927  fixedtemplong = true ;
1928  }
1929  }
1930  }
1931  }
1932 #ifdef DYNPROG_TIMING
1933  MyTime3.stop() ;
1934  long_transition_time += MyTime3.time_diff_sec() ;
1935 #endif
1936 
1937 
1938  int32_t numEnt = fixed_list_len;
1939 
1940  float64_t minusscore;
1941  int64_t fromtjk;
1942 
1943  for (int16_t k=0; k<nbest; k++)
1944  {
1945  if (k<numEnt)
1946  {
1947  if (nbest==1)
1948  {
1949  minusscore = fixedtempvv_ ;
1950  fromtjk = fixedtempii_ ;
1951  }
1952  else
1953  {
1954  minusscore = fixedtempvv[k];
1955  fromtjk = fixedtempii[k];
1956  }
1957 
1958  delta.element(delta_array, t, j, k, m_seq_len, m_N) = -minusscore + seq.element(j,t);
1959  psi.element(t,j,k) = (fromtjk%m_N) ;
1960  if (nbest>1)
1961  ktable.element(t,j,k) = (fromtjk%(m_N*nbest)-psi.element(t,j,k))/m_N ;
1962  ptable.element(t,j,k) = (fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest) ;
1963  }
1964  else
1965  {
1966  delta.element(delta_array, t, j, k, m_seq_len, m_N) = -CMath::INFTY ;
1967  psi.element(t,j,k) = 0 ;
1968  if (nbest>1)
1969  ktable.element(t,j,k) = 0 ;
1970  ptable.element(t,j,k) = 0 ;
1971  }
1972  }
1973  }
1974  }
1975  }
1976  { //termination
1977  int32_t list_len = 0 ;
1978  for (int16_t diff=0; diff<nbest; diff++)
1979  {
1980  for (T_STATES i=0; i<m_N; i++)
1981  {
1982  oldtempvv[list_len] = -(delta.element(delta_array, (m_seq_len-1), i, diff, m_seq_len, m_N)+get_q(i)) ;
1983  oldtempii[list_len] = i + diff*m_N ;
1984  list_len++ ;
1985  }
1986  }
1987 
1988  CMath::nmin(oldtempvv.get_array(), oldtempii.get_array(), list_len, nbest) ;
1989 
1990  for (int16_t k=0; k<nbest; k++)
1991  {
1992  delta_end.element(k) = -oldtempvv[k] ;
1993  path_ends.element(k) = (oldtempii[k]%m_N) ;
1994  if (nbest>1)
1995  ktable_end.element(k) = (oldtempii[k]-path_ends.element(k))/m_N ;
1996  }
1997 
1998 
1999  }
2000 
2001  { //state sequence backtracking
2002  for (int16_t k=0; k<nbest; k++)
2003  {
2004  prob_nbest[k]= delta_end.element(k) ;
2005 
2006  int32_t i = 0 ;
2007  state_seq[i] = path_ends.element(k) ;
2008  int16_t q = 0 ;
2009  if (nbest>1)
2010  q=ktable_end.element(k) ;
2011  pos_seq[i] = m_seq_len-1 ;
2012 
2013  while (pos_seq[i]>0)
2014  {
2015  ASSERT(i+1<m_seq_len);
2016  //SG_DEBUG("s=%i p=%i q=%i\n", state_seq[i], pos_seq[i], q) ;
2017  state_seq[i+1] = psi.element(pos_seq[i], state_seq[i], q);
2018  pos_seq[i+1] = ptable.element(pos_seq[i], state_seq[i], q) ;
2019  if (nbest>1)
2020  q = ktable.element(pos_seq[i], state_seq[i], q) ;
2021  i++ ;
2022  }
2023  //SG_DEBUG("s=%i p=%i q=%i\n", state_seq[i], pos_seq[i], q) ;
2024  int32_t num_states = i+1 ;
2025  for (i=0; i<num_states;i++)
2026  {
2027  my_state_seq[i+k*m_seq_len] = state_seq[num_states-i-1] ;
2028  my_pos_seq[i+k*m_seq_len] = pos_seq[num_states-i-1] ;
2029  }
2030  if (num_states<m_seq_len)
2031  {
2032  my_state_seq[num_states+k*m_seq_len]=-1 ;
2033  my_pos_seq[num_states+k*m_seq_len]=-1 ;
2034  }
2035  }
2036  }
2037 
2038  //if (is_big)
2039  // SG_PRINT( "DONE. \n") ;
2040 
2041 
2042 #ifdef DYNPROG_TIMING
2043  MyTime2.stop() ;
2044 
2045  //if (is_big)
2046  SG_PRINT("Timing: orf=%1.2f s \n Segment_init=%1.2f s Segment_pos=%1.2f s Segment_extend=%1.2f s Segment_clean=%1.2f s\nsvm_init=%1.2f s svm_pos=%1.2f svm_clean=%1.2f\n content_svm_values_time=%1.2f content_plifs_time=%1.2f\ninner_loop_max_time=%1.2f inner_loop=%1.2f long_transition_time=%1.2f\n total=%1.2f\n", orf_time, segment_init_time, segment_pos_time, segment_extend_time, segment_clean_time, svm_init_time, svm_pos_time, svm_clean_time, content_svm_values_time, content_plifs_time, inner_loop_max_time, inner_loop_time, long_transition_time, MyTime2.time_diff_sec()) ;
2047 #endif
2048 
2049  SG_FREE(fixedtempvv);
2050  SG_FREE(fixedtempii);
2051  }
2052 
2053 
2055  int32_t *my_state_seq, int32_t *my_pos_seq,
2056  int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals)
2057 {
2061  //m_my_scores.resize_array(m_my_state_seq.get_array_size()) ;
2062  //m_my_losses.resize_array(m_my_state_seq.get_array_size()) ;
2063  m_my_scores.resize_array(my_seq_len);
2064  m_my_losses.resize_array(my_seq_len);
2065  float64_t* my_scores=m_my_scores.get_array();
2066  float64_t* my_losses=m_my_losses.get_array();
2067  CPlifBase** Plif_matrix=m_plif_matrices->get_plif_matrix();
2068  CPlifBase** Plif_state_signals=m_plif_matrices->get_state_signals();
2069 
2070  if (!m_svm_arrays_clean)
2071  {
2072  SG_ERROR( "SVM arrays not clean") ;
2073  return ;
2074  } ;
2075  //SG_PRINT( "genestr_len=%i, genestr_num=%i\n", genestr_len, genestr_num) ;
2076  //m_mod_words.display() ;
2077  //m_sign_words.display() ;
2078  //m_string_words.display() ;
2079 
2080  bool use_svm = false ;
2081 
2082  CDynamicObjectArray PEN((CSGObject**) Plif_matrix, m_N, m_N, false, false) ; // 2d, CPlifBase*
2083  PEN.set_array_name("PEN");
2084 
2085  CDynamicObjectArray PEN_state_signals((CSGObject**) Plif_state_signals, m_N, max_num_signals, false, false) ; // 2d, CPlifBase*
2086  PEN_state_signals.set_array_name("PEN_state_signals");
2087 
2088  CDynamicArray<float64_t> seq_input(seq_array, m_N, m_seq_len, max_num_signals) ;
2089  seq_input.set_array_name("seq_input");
2090 
2091  { // determine whether to use svm outputs and clear derivatives
2092  for (int32_t i=0; i<m_N; i++)
2093  for (int32_t j=0; j<m_N; j++)
2094  {
2095  CPlifBase* penij=(CPlifBase*) PEN.element(i,j) ;
2096  if (penij==NULL)
2097  continue ;
2098 
2099  if (penij->uses_svm_values())
2100  use_svm=true ;
2101  penij->penalty_clear_derivative() ;
2102  }
2103  for (int32_t i=0; i<m_N; i++)
2104  for (int32_t j=0; j<max_num_signals; j++)
2105  {
2106  CPlifBase* penij=(CPlifBase*) PEN_state_signals.element(i,j) ;
2107  if (penij==NULL)
2108  continue ;
2109  if (penij->uses_svm_values())
2110  use_svm=true ;
2111  penij->penalty_clear_derivative() ;
2112  }
2113  }
2114 
2115  { // set derivatives of p, q and a to zero
2116 
2117  for (int32_t i=0; i<m_N; i++)
2118  {
2121  for (int32_t j=0; j<m_N; j++)
2123  }
2124  }
2125 
2126  { // clear score vector
2127  for (int32_t i=0; i<my_seq_len; i++)
2128  {
2129  my_scores[i]=0.0 ;
2130  my_losses[i]=0.0 ;
2131  }
2132  }
2133 
2134  //int32_t total_len = 0 ;
2135 
2136  //m_transition_matrix_a.display_array() ;
2137  //m_transition_matrix_a_id.display_array() ;
2138 
2139  // compute derivatives for given path
2144  {
2145  svm_value[s]=0 ;
2146  svm_value_part1[s]=0 ;
2147  svm_value_part2[s]=0 ;
2148  }
2149 
2150  //#ifdef DYNPROG_DEBUG
2151  float64_t total_score = 0.0 ;
2152  float64_t total_loss = 0.0 ;
2153  //#endif
2154 
2155  ASSERT(my_state_seq[0]>=0) ;
2156  m_initial_state_distribution_p_deriv.element(my_state_seq[0])++ ;
2157  my_scores[0] += m_initial_state_distribution_p.element(my_state_seq[0]) ;
2158 
2159  ASSERT(my_state_seq[my_seq_len-1]>=0) ;
2160  m_end_state_distribution_q_deriv.element(my_state_seq[my_seq_len-1])++ ;
2161  my_scores[my_seq_len-1] += m_end_state_distribution_q.element(my_state_seq[my_seq_len-1]);
2162 
2163  //#ifdef DYNPROG_DEBUG
2164  total_score += my_scores[0] + my_scores[my_seq_len-1] ;
2165  //#endif
2166 
2167  SG_DEBUG( "m_seq_len=%i\n", my_seq_len) ;
2168  for (int32_t i=0; i<my_seq_len-1; i++)
2169  {
2170  if (my_state_seq[i+1]==-1)
2171  break ;
2172  int32_t from_state = my_state_seq[i] ;
2173  int32_t to_state = my_state_seq[i+1] ;
2174  int32_t from_pos = my_pos_seq[i] ;
2175  int32_t to_pos = my_pos_seq[i+1] ;
2176 
2177  int32_t elem_id = m_transition_matrix_a_id.element(from_state, to_state) ;
2178  my_losses[i] = m_seg_loss_obj->get_segment_loss(from_pos, to_pos, elem_id);
2179 
2180 #ifdef DYNPROG_DEBUG
2181 
2182 
2183  if (i>0)// test if segment loss is additive
2184  {
2185  float32_t loss1 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i-1], my_pos_seq[i], elem_id);
2186  float32_t loss2 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i], my_pos_seq[i+1], elem_id);
2187  float32_t loss3 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i-1], my_pos_seq[i+1], elem_id);
2188  SG_PRINT("loss1:%f loss2:%f loss3:%f, diff:%f\n", loss1, loss2, loss3, loss1+loss2-loss3);
2189  if (CMath::abs(loss1+loss2-loss3)>0)
2190  {
2191  SG_PRINT( "%i. segment loss %f (id=%i): from=%i(%i), to=%i(%i)\n", i, my_losses[i], elem_id, from_pos, from_state, to_pos, to_state) ;
2192  }
2193  }
2194  io->set_loglevel(M_DEBUG) ;
2195  SG_DEBUG( "%i. segment loss %f (id=%i): from=%i(%i), to=%i(%i)\n", i, my_losses[i], elem_id, from_pos, from_state, to_pos, to_state) ;
2196 #endif
2197  // increase usage of this transition
2198  m_transition_matrix_a_deriv.element(from_state, to_state)++ ;
2199  my_scores[i] += m_transition_matrix_a.element(from_state, to_state) ;
2200  //SG_PRINT("m_transition_matrix_a.element(%i, %i),%f \n",from_state, to_state, m_transition_matrix_a.element(from_state, to_state));
2201 #ifdef DYNPROG_DEBUG
2202  SG_DEBUG( "%i. scores[i]=%f\n", i, my_scores[i]) ;
2203 #endif
2204 
2205  /*int32_t last_svm_pos[m_num_degrees] ;
2206  for (int32_t qq=0; qq<m_num_degrees; qq++)
2207  last_svm_pos[qq]=-1 ;*/
2208 
2209  bool is_long_transition = false ;
2210  if (m_long_transitions)
2211  {
2212  if (m_pos[to_pos]-m_pos[from_pos]>m_long_transition_threshold)
2213  is_long_transition = true ;
2214  if (m_orf_info.element(from_state,0)!=-1)
2215  is_long_transition = false ;
2216  }
2217 
2218  int32_t from_pos_thresh = from_pos ;
2219  int32_t to_pos_thresh = to_pos ;
2220 
2221  if (use_svm)
2222  {
2223  if (is_long_transition)
2224  {
2225 
2226  while (from_pos_thresh<to_pos && m_pos[from_pos_thresh+1] - m_pos[from_pos] <= m_long_transition_threshold) // *
2227  from_pos_thresh++ ;
2228  ASSERT(from_pos_thresh<to_pos) ;
2229  ASSERT(m_pos[from_pos_thresh] - m_pos[from_pos] <= m_long_transition_threshold); // *
2230  ASSERT(m_pos[from_pos_thresh+1] - m_pos[from_pos] > m_long_transition_threshold);// *
2231 
2232  int32_t frame = m_orf_info.element(from_state,0);
2233  lookup_content_svm_values(from_pos, from_pos_thresh, m_pos[from_pos], m_pos[from_pos_thresh], svm_value_part1, frame);
2234 
2235 #ifdef DYNPROG_DEBUG
2236  SG_PRINT("part1: pos1: %i pos2: %i pos3: %i \nsvm_value_part1: ", m_pos[from_pos], m_pos[from_pos_thresh], m_pos[from_pos_thresh+1]) ;
2238  SG_PRINT("%1.4f ", svm_value_part1[s]);
2239  SG_PRINT("\n");
2240 #endif
2241 
2242  while (to_pos_thresh>0 && m_pos[to_pos] - m_pos[to_pos_thresh-1] <= m_long_transition_threshold) // *
2243  to_pos_thresh-- ;
2244  ASSERT(to_pos_thresh>0) ;
2245  ASSERT(m_pos[to_pos] - m_pos[to_pos_thresh] <= m_long_transition_threshold) ; // *
2246  ASSERT(m_pos[to_pos] - m_pos[to_pos_thresh-1] > m_long_transition_threshold) ; // *
2247 
2248  lookup_content_svm_values(to_pos_thresh, to_pos, m_pos[to_pos_thresh], m_pos[to_pos], svm_value_part2, frame);
2249 
2250 #ifdef DYNPROG_DEBUG
2251  SG_PRINT("part2: pos1: %i pos2: %i pos3: %i \nsvm_value_part2: ", m_pos[to_pos], m_pos[to_pos_thresh], m_pos[to_pos_thresh+1]) ;
2253  SG_PRINT("%1.4f ", svm_value_part2[s]);
2254  SG_PRINT("\n");
2255 #endif
2256  }
2257  else
2258  {
2259  /* normal case */
2260 
2261  //SG_PRINT("from_pos: %i; to_pos: %i; m_pos[to_pos]-m_pos[from_pos]: %i \n",from_pos, to_pos, m_pos[to_pos]-m_pos[from_pos]);
2262  int32_t frame = m_orf_info.element(from_state,0);
2263  if (false)//(frame>=0)
2264  {
2265  int32_t num_current_svms=0;
2266  int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
2267  SG_PRINT("penalties(%i, %i), frame:%i ", from_state, to_state, frame);
2268  ((CPlifBase*) PEN.element(to_state, from_state))->get_used_svms(&num_current_svms, svm_ids);
2269  SG_PRINT("\n");
2270  }
2271 
2272  lookup_content_svm_values(from_pos, to_pos, m_pos[from_pos],m_pos[to_pos], svm_value, frame);
2273 #ifdef DYNPROG_DEBUG
2274  SG_PRINT("part2: pos1: %i pos2: %i \nsvm_values: ", m_pos[from_pos], m_pos[to_pos]) ;
2276  SG_PRINT("%1.4f ", svm_value[s]);
2277  SG_PRINT("\n");
2278 #endif
2279  }
2280  }
2281 
2282  if (PEN.element(to_state, from_state)!=NULL)
2283  {
2284  float64_t nscore = 0 ;
2285  if (is_long_transition)
2286  {
2287  float64_t pen_value_part1 = ((CPlifBase*) PEN.element(to_state, from_state))->lookup_penalty(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1) ;
2288  float64_t pen_value_part2 = ((CPlifBase*) PEN.element(to_state, from_state))->lookup_penalty(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2) ;
2289  nscore= 0.5*pen_value_part1 + 0.5*pen_value_part2 ;
2290  }
2291  else
2292  nscore = ((CPlifBase*) PEN.element(to_state, from_state))->lookup_penalty(m_pos[to_pos]-m_pos[from_pos], svm_value) ;
2293 
2294  if (false)//(nscore<-1e9)
2295  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), to_pos=%i (%i)=> %1.5f\n",
2296  is_long_transition, m_pos[from_pos], from_state, m_pos[to_pos], to_state, nscore) ;
2297 
2298  my_scores[i] += nscore ;
2299 
2300  for (int32_t s=m_num_svms;s<m_num_lin_feat_plifs_cum[m_num_raw_data]; s++)/*set tiling plif values to neutral values (that do not influence derivative calculation)*/
2301  {
2302  svm_value[s]=-CMath::INFTY;
2303  svm_value_part1[s]=-CMath::INFTY;
2304  svm_value_part2[s]=-CMath::INFTY;
2305  }
2306 
2307 #ifdef DYNPROG_DEBUG
2308  //SG_DEBUG( "%i. transition penalty: from_state=%i to_state=%i from_pos=%i [%i] to_pos=%i [%i] value=%i\n", i, from_state, to_state, from_pos, m_pos[from_pos], to_pos, m_pos[to_pos], m_pos[to_pos]-m_pos[from_pos]) ;
2309 #endif
2310  if (is_long_transition)
2311  {
2312 #ifdef DYNPROG_DEBUG
2313  float64_t sum_score = 0.0 ;
2314 
2315  for (int kk=0; kk<i; kk++)
2316  sum_score += my_scores[i] ;
2317 
2318  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), to_pos=%i (%i)=> %1.5f, %1.5f --- 1: %1.6f (%i-%i) 2: %1.6f (%i-%i) \n",
2319  is_long_transition, m_pos[from_pos], from_state, m_pos[to_pos], to_state,
2320  nscore, sum_score,
2321  PEN.element(to_state, from_state)->lookup_penalty(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1)*0.5, m_pos[from_pos], m_pos[from_pos_thresh],
2322  PEN.element(to_state, from_state)->lookup_penalty(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2)*0.5, m_pos[to_pos_thresh], m_pos[to_pos]) ;
2323 #endif
2324  }
2325 
2326  if (is_long_transition)
2327  {
2328  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1, 0.5) ;
2329  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2, 0.5) ;
2330  }
2331  else
2332  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(m_pos[to_pos]-m_pos[from_pos], svm_value, 1) ;
2333 
2334  //SG_PRINT("m_num_raw_data = %i \n", m_num_raw_data) ;
2335 
2336  // for tiling array and rna-seq data every single measurement must be added to the derivative
2337  // in contrast to the content svm predictions where we have a single value per transition;
2338  // content svm predictions have already been added to the derivative, thus we start with d=1
2339  // instead of d=0
2340  if (is_long_transition)
2341  {
2342  for (int32_t d=1; d<=m_num_raw_data; d++)
2343  {
2344  for (int32_t s=0;s<m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;s++)
2345  svm_value[s]=-CMath::INFTY;
2346  float64_t* intensities = SG_MALLOC(float64_t, m_num_probes_cum[d]);
2347  int32_t num_intensities = raw_intensities_interval_query(m_pos[from_pos], m_pos[from_pos_thresh],intensities, d);
2348  for (int32_t k=0;k<num_intensities;k++)
2349  {
2350  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2351  svm_value[j]=intensities[k];
2352 
2353  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(-CMath::INFTY, svm_value, 0.5) ;
2354 
2355  }
2356  num_intensities = raw_intensities_interval_query(m_pos[to_pos_thresh], m_pos[to_pos],intensities, d);
2357  for (int32_t k=0;k<num_intensities;k++)
2358  {
2359  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2360  svm_value[j]=intensities[k];
2361 
2362  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(-CMath::INFTY, svm_value, 0.5) ;
2363 
2364  }
2365  SG_FREE(intensities);
2366 
2367  }
2368  }
2369  else
2370  {
2371  for (int32_t d=1; d<=m_num_raw_data; d++)
2372  {
2373  for (int32_t s=0;s<m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;s++)
2374  svm_value[s]=-CMath::INFTY;
2375  float64_t* intensities = SG_MALLOC(float64_t, m_num_probes_cum[d]);
2376  int32_t num_intensities = raw_intensities_interval_query(m_pos[from_pos], m_pos[to_pos],intensities, d);
2377  //SG_PRINT("m_pos[from_pos]:%i, m_pos[to_pos]:%i, num_intensities:%i\n",m_pos[from_pos],m_pos[to_pos], num_intensities);
2378  for (int32_t k=0;k<num_intensities;k++)
2379  {
2380  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2381  svm_value[j]=intensities[k];
2382 
2383  ((CPlifBase*) PEN.element(to_state, from_state))->penalty_add_derivative(-CMath::INFTY, svm_value, 1) ;
2384 
2385  }
2386  SG_FREE(intensities);
2387  }
2388  }
2389 
2390  }
2391 #ifdef DYNPROG_DEBUG
2392  SG_DEBUG( "%i. scores[i]=%f\n", i, my_scores[i]) ;
2393 #endif
2394 
2395  //SG_DEBUG( "emmission penalty skipped: to_state=%i to_pos=%i value=%1.2f score=%1.2f\n", to_state, to_pos, seq_input.element(to_state, to_pos), 0.0) ;
2396  for (int32_t k=0; k<max_num_signals; k++)
2397  {
2398  if ((PEN_state_signals.element(to_state,k)==NULL)&&(k==0))
2399  {
2400 #ifdef DYNPROG_DEBUG
2401  SG_DEBUG( "%i. emmission penalty: to_state=%i to_pos=%i score=%1.2f (no signal plif)\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k)) ;
2402 #endif
2403  my_scores[i] += seq_input.element(to_state, to_pos, k) ;
2404  //if (seq_input.element(to_state, to_pos, k) !=0)
2405  // SG_PRINT("features(%i,%i): %f\n",to_state,to_pos,seq_input.element(to_state, to_pos, k));
2406  break ;
2407  }
2408  if (PEN_state_signals.element(to_state, k)!=NULL)
2409  {
2410  float64_t nscore = ((CPlifBase*) PEN_state_signals.element(to_state,k))->lookup_penalty(seq_input.element(to_state, to_pos, k), svm_value) ; // this should be ok for long_transitions (svm_value does not matter)
2411  my_scores[i] += nscore ;
2412 #ifdef DYNPROG_DEBUG
2413  if (false)//(nscore<-1e9)
2414  {
2415  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), from_state=%i, to_pos=%i (%i) to_state=%i=> %1.5f, dim3:%i, seq_input.element(to_state, to_pos, k): %1.4f\n",
2416  is_long_transition, m_pos[from_pos], from_pos, from_state, m_pos[to_pos], to_pos, to_state, nscore, k, seq_input.element(to_state, to_pos, k)) ;
2417  for (int x=0; x<23; x++)
2418  {
2419  for (int i=-10; i<10; i++)
2420  SG_PRINT("%1.4f\t", seq_input.element(x, to_pos+i, k));
2421  SG_PRINT("\n");
2422  }
2423 
2424  }
2425 #endif
2426  //break ;
2427  //int32_t num_current_svms=0;
2428  //int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
2429  //SG_PRINT("PEN_state_signals->id: ");
2430  //PEN_state_signals.element(to_state, k)->get_used_svms(&num_current_svms, svm_ids);
2431  //SG_PRINT("\n");
2432  //if (nscore != 0)
2433  //SG_PRINT( "%i. emmission penalty: to_state=%i to_pos=%i value=%1.2f score=%1.2f k=%i\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k), nscore, k) ;
2434 #ifdef DYNPROG_DEBUG
2435  SG_DEBUG( "%i. emmission penalty: to_state=%i to_pos=%i value=%1.2f score=%1.2f k=%i\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k), nscore, k) ;
2436 #endif
2437  ((CPlifBase*) PEN_state_signals.element(to_state,k))->penalty_add_derivative(seq_input.element(to_state, to_pos, k), svm_value, 1) ; // this should be ok for long_transitions (svm_value does not matter)
2438  } else
2439  break ;
2440  }
2441 
2442  //#ifdef DYNPROG_DEBUG
2443  //SG_PRINT( "scores[%i]=%f (final) \n", i, my_scores[i]) ;
2444  //SG_PRINT( "losses[%i]=%f (final) , total_loss: %f \n", i, my_losses[i], total_loss) ;
2445  total_score += my_scores[i] ;
2446  total_loss += my_losses[i] ;
2447  //#endif
2448  }
2449  //#ifdef DYNPROG_DEBUG
2450  //SG_PRINT( "total score = %f \n", total_score) ;
2451  //SG_PRINT( "total loss = %f \n", total_loss) ;
2452  //#endif
2453  SG_FREE(svm_value);
2454  SG_FREE(svm_value_part1);
2455  SG_FREE(svm_value_part2);
2456 }
2457 
2458 int32_t CDynProg::raw_intensities_interval_query(const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type)
2459 {
2460  ASSERT(from_pos<to_pos);
2461  int32_t num_intensities = 0;
2462  int32_t* p_tiling_pos = &m_probe_pos[m_num_probes_cum[type-1]];
2463  float64_t* p_tiling_data = &m_raw_intensities[m_num_probes_cum[type-1]];
2464  int32_t last_pos;
2465  int32_t num = m_num_probes_cum[type-1];
2466  while (*p_tiling_pos<to_pos)
2467  {
2468  if (*p_tiling_pos>=from_pos)
2469  {
2470  intensities[num_intensities] = *p_tiling_data;
2471  num_intensities++;
2472  }
2473  num++;
2474  if (num>=m_num_probes_cum[type])
2475  break;
2476  last_pos = *p_tiling_pos;
2477  p_tiling_pos++;
2478  p_tiling_data++;
2479  ASSERT(last_pos<*p_tiling_pos);
2480  }
2481  return num_intensities;
2482 }
2483 
2484 void CDynProg::lookup_content_svm_values(const int32_t from_state, const int32_t to_state, const int32_t from_pos, const int32_t to_pos, float64_t* svm_values, int32_t frame)
2485 {
2486 #ifdef DYNPROG_TIMING_DETAIL
2487  MyTime.start() ;
2488 #endif
2489 // ASSERT(from_state<to_state);
2490 // if (!(from_pos<to_pos))
2491 // SG_ERROR("from_pos!<to_pos, from_pos: %i to_pos: %i \n",from_pos,to_pos);
2492  for (int32_t i=0;i<m_num_svms;i++)
2493  {
2494  float64_t to_val = m_lin_feat.get_element(i, to_state);
2495  float64_t from_val = m_lin_feat.get_element(i, from_state);
2496  svm_values[i] = (to_val-from_val)/(to_pos-from_pos);
2497  }
2498  for (int32_t i=m_num_svms;i<m_num_lin_feat_plifs_cum[m_num_raw_data];i++)
2499  {
2500  float64_t to_val = m_lin_feat.get_element(i, to_state);
2501  float64_t from_val = m_lin_feat.get_element(i, from_state);
2502  svm_values[i] = to_val-from_val ;
2503  }
2504  if (m_intron_list)
2505  {
2506  int32_t* support = SG_MALLOC(int32_t, m_num_intron_plifs);
2507  m_intron_list->get_intron_support(support, from_state, to_state);
2508  int32_t intron_list_start = m_num_lin_feat_plifs_cum[m_num_raw_data];
2509  int32_t intron_list_end = m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;
2510  int32_t cnt = 0;
2511  for (int32_t i=intron_list_start; i<intron_list_end;i++)
2512  {
2513  svm_values[i] = (float64_t) (support[cnt]);
2514  cnt++;
2515  }
2516  //if (to_pos>3990 && to_pos<4010)
2517  // SG_PRINT("from_state:%i to_state:%i support[0]:%i support[1]:%i\n",from_state, to_state, support[0], support[1]);
2518  SG_FREE(support);
2519  }
2520  // find the correct row with precomputed frame predictions
2521  if (frame!=-1)
2522  {
2523  svm_values[frame_plifs[0]] = 1e10;
2524  svm_values[frame_plifs[1]] = 1e10;
2525  svm_values[frame_plifs[2]] = 1e10;
2526  int32_t global_frame = from_pos%3;
2527  int32_t row = ((global_frame+frame)%3)+4;
2528  float64_t to_val = m_lin_feat.get_element(row, to_state);
2529  float64_t from_val = m_lin_feat.get_element(row, from_state);
2530  svm_values[frame+frame_plifs[0]] = (to_val-from_val)/(to_pos-from_pos);
2531  }
2532 #ifdef DYNPROG_TIMING_DETAIL
2533  MyTime.stop() ;
2534  content_svm_values_time += MyTime.time_diff_sec() ;
2535 #endif
2536 }
2537 void CDynProg::set_intron_list(CIntronList* intron_list, int32_t num_plifs)
2538 {
2539  m_intron_list = intron_list;
2540  m_num_intron_plifs = num_plifs;
2541 }
2542 

SHOGUN Machine Learning Toolbox - Documentation