00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017
00018 #include "kernel/WeightedDegreePositionStringKernel.h"
00019 #include "kernel/SqrtDiagKernelNormalizer.h"
00020 #include "features/Features.h"
00021 #include "features/StringFeatures.h"
00022
00023 #include "classifier/svm/SVM.h"
00024
00025 #ifndef WIN32
00026 #include <pthread.h>
00027 #endif
00028
00029 using namespace shogun;
00030
00031 #define TRIES(X) ((use_poim_tries) ? (poim_tries.X) : (tries.X))
00032
00033 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00034 template <class Trie> struct S_THREAD_PARAM
00035 {
00036 int32_t* vec;
00037 float64_t* result;
00038 float64_t* weights;
00039 CWeightedDegreePositionStringKernel* kernel;
00040 CTrie<Trie>* tries;
00041 float64_t factor;
00042 int32_t j;
00043 int32_t start;
00044 int32_t end;
00045 int32_t length;
00046 int32_t max_shift;
00047 int32_t* shift;
00048 int32_t* vec_idx;
00049 };
00050 #endif // DOXYGEN_SHOULD_SKIP_THIS
00051
00052 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00053 void)
00054 : CStringKernel<char>()
00055 {
00056 init();
00057 }
00058
00059 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00060 int32_t size, int32_t d, int32_t mm, int32_t mkls)
00061 : CStringKernel<char>(size)
00062 {
00063 init();
00064
00065 mkl_stepsize=mkls;
00066 degree=d;
00067 max_mismatch=mm;
00068
00069 tries=CTrie<DNATrie>(d);
00070 poim_tries=CTrie<POIMTrie>(d);
00071
00072 set_wd_weights();
00073 ASSERT(weights);
00074 }
00075
00076 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00077 int32_t size, float64_t* w, int32_t d, int32_t mm, int32_t* s, int32_t sl,
00078 int32_t mkls)
00079 : CStringKernel<char>(size)
00080 {
00081 init();
00082
00083 mkl_stepsize=mkls;
00084 degree=d;
00085 max_mismatch=mm;
00086
00087 tries=CTrie<DNATrie>(d);
00088 poim_tries=CTrie<POIMTrie>(d);
00089
00090 weights=new float64_t[d*(1+max_mismatch)];
00091 weights_degree=degree;
00092 weights_length=(1+max_mismatch);
00093
00094 for (int32_t i=0; i<d*(1+max_mismatch); i++)
00095 weights[i]=w[i];
00096
00097 set_shifts(s, sl);
00098 }
00099
00100 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00101 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t d)
00102 : CStringKernel<char>()
00103 {
00104 init();
00105
00106 mkl_stepsize=1;
00107 degree=d;
00108
00109 tries=CTrie<DNATrie>(d);
00110 poim_tries=CTrie<POIMTrie>(d);
00111
00112 set_wd_weights();
00113 ASSERT(weights);
00114
00115 init(l, r);
00116 }
00117
00118
00119 CWeightedDegreePositionStringKernel::~CWeightedDegreePositionStringKernel()
00120 {
00121 cleanup();
00122 cleanup_POIM2();
00123
00124 delete[] shift;
00125 shift=NULL;
00126
00127 delete[] weights;
00128 weights=NULL;
00129 weights_degree=0;
00130 weights_length=0;
00131
00132 delete[] block_weights;
00133 block_weights=NULL;
00134
00135 delete[] position_weights;
00136 position_weights=NULL;
00137
00138 delete[] position_weights_lhs;
00139 position_weights_lhs=NULL;
00140
00141 delete[] position_weights_rhs;
00142 position_weights_rhs=NULL;
00143
00144 delete[] weights_buffer;
00145 weights_buffer=NULL;
00146 }
00147
00148 void CWeightedDegreePositionStringKernel::remove_lhs()
00149 {
00150 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00151 delete_optimization();
00152
00153 tries.destroy();
00154 poim_tries.destroy();
00155
00156 CKernel::remove_lhs();
00157 }
00158
00159 void CWeightedDegreePositionStringKernel::create_empty_tries()
00160 {
00161 ASSERT(lhs);
00162 seq_length = ((CStringFeatures<char>*) lhs)->get_max_vector_length();
00163
00164 if (opt_type==SLOWBUTMEMEFFICIENT)
00165 {
00166 tries.create(seq_length, true);
00167 poim_tries.create(seq_length, true);
00168 }
00169 else if (opt_type==FASTBUTMEMHUNGRY)
00170 {
00171 tries.create(seq_length, false);
00172 poim_tries.create(seq_length, false);
00173 }
00174 else
00175 SG_ERROR( "unknown optimization type\n");
00176 }
00177
00178 bool CWeightedDegreePositionStringKernel::init(CFeatures* l, CFeatures* r)
00179 {
00180 int32_t lhs_changed = (lhs!=l) ;
00181 int32_t rhs_changed = (rhs!=r) ;
00182
00183 CStringKernel<char>::init(l,r);
00184
00185 SG_DEBUG( "lhs_changed: %i\n", lhs_changed) ;
00186 SG_DEBUG( "rhs_changed: %i\n", rhs_changed) ;
00187
00188 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00189 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00190
00191
00192 if (shift_len==0) {
00193 shift_len=sf_l->get_vector_length(0);
00194 int32_t *shifts=new int32_t[shift_len];
00195 for (int32_t i=0; i<shift_len; i++) {
00196 shifts[i]=1;
00197 }
00198 set_shifts(shifts, shift_len);
00199 delete[] shifts;
00200 }
00201
00202
00203 int32_t len=sf_l->get_max_vector_length();
00204 if (lhs_changed && !sf_l->have_same_length(len))
00205 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00206
00207 if (rhs_changed && !sf_r->have_same_length(len))
00208 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00209
00210 SG_UNREF(alphabet);
00211 alphabet= sf_l->get_alphabet();
00212 CAlphabet* ralphabet=sf_r->get_alphabet();
00213
00214 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00215 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00216
00217 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00218 SG_UNREF(ralphabet);
00219
00220
00221 create_empty_tries();
00222 init_block_weights();
00223
00224 return init_normalizer();
00225 }
00226
00227 void CWeightedDegreePositionStringKernel::cleanup()
00228 {
00229 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00230 delete_optimization();
00231
00232 delete[] block_weights;
00233 block_weights=NULL;
00234
00235 tries.destroy();
00236 poim_tries.destroy();
00237
00238 seq_length = 0;
00239 tree_initialized = false;
00240
00241 SG_UNREF(alphabet);
00242 alphabet=NULL;
00243
00244 CKernel::cleanup();
00245 }
00246
00247 bool CWeightedDegreePositionStringKernel::init_optimization(
00248 int32_t p_count, int32_t * IDX, float64_t * alphas, int32_t tree_num,
00249 int32_t upto_tree)
00250 {
00251 ASSERT(position_weights_lhs==NULL);
00252 ASSERT(position_weights_rhs==NULL);
00253
00254 if (upto_tree<0)
00255 upto_tree=tree_num;
00256
00257 if (max_mismatch!=0)
00258 {
00259 SG_ERROR( "CWeightedDegreePositionStringKernel optimization not implemented for mismatch!=0\n");
00260 return false ;
00261 }
00262
00263 if (tree_num<0)
00264 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00265
00266 delete_optimization();
00267
00268 if (tree_num<0)
00269 SG_DEBUG( "initializing CWeightedDegreePositionStringKernel optimization\n") ;
00270
00271 for (int32_t i=0; i<p_count; i++)
00272 {
00273 if (tree_num<0)
00274 {
00275 if ( (i % (p_count/10+1)) == 0)
00276 SG_PROGRESS(i,0,p_count);
00277 add_example_to_tree(IDX[i], alphas[i]);
00278 }
00279 else
00280 {
00281 for (int32_t t=tree_num; t<=upto_tree; t++)
00282 add_example_to_single_tree(IDX[i], alphas[i], t);
00283 }
00284 }
00285
00286 if (tree_num<0)
00287 SG_DONE();
00288
00289 set_is_initialized(true) ;
00290 return true ;
00291 }
00292
00293 bool CWeightedDegreePositionStringKernel::delete_optimization()
00294 {
00295 if ((opt_type==FASTBUTMEMHUNGRY) && (tries.get_use_compact_terminal_nodes()))
00296 {
00297 tries.set_use_compact_terminal_nodes(false) ;
00298 SG_DEBUG( "disabling compact trie nodes with FASTBUTMEMHUNGRY\n") ;
00299 }
00300
00301 if (get_is_initialized())
00302 {
00303 if (opt_type==SLOWBUTMEMEFFICIENT)
00304 tries.delete_trees(true);
00305 else if (opt_type==FASTBUTMEMHUNGRY)
00306 tries.delete_trees(false);
00307 else {
00308 SG_ERROR( "unknown optimization type\n");
00309 }
00310 set_is_initialized(false);
00311
00312 return true;
00313 }
00314
00315 return false;
00316 }
00317
00318 float64_t CWeightedDegreePositionStringKernel::compute_with_mismatch(
00319 char* avec, int32_t alen, char* bvec, int32_t blen)
00320 {
00321 float64_t* max_shift_vec= new float64_t[max_shift];
00322 float64_t sum0=0 ;
00323 for (int32_t i=0; i<max_shift; i++)
00324 max_shift_vec[i]=0 ;
00325
00326
00327 for (int32_t i=0; i<alen; i++)
00328 {
00329 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00330 continue ;
00331
00332 int32_t mismatches=0;
00333 float64_t sumi = 0.0 ;
00334 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00335 {
00336 if (avec[i+j]!=bvec[i+j])
00337 {
00338 mismatches++ ;
00339 if (mismatches>max_mismatch)
00340 break ;
00341 } ;
00342 sumi += weights[j+degree*mismatches];
00343 }
00344 if (position_weights!=NULL)
00345 sum0 += position_weights[i]*sumi ;
00346 else
00347 sum0 += sumi ;
00348 } ;
00349
00350 for (int32_t i=0; i<alen; i++)
00351 {
00352 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00353 {
00354 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00355 continue ;
00356
00357 float64_t sumi1 = 0.0 ;
00358
00359 int32_t mismatches=0;
00360 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00361 {
00362 if (avec[i+j+k]!=bvec[i+j])
00363 {
00364 mismatches++ ;
00365 if (mismatches>max_mismatch)
00366 break ;
00367 } ;
00368 sumi1 += weights[j+degree*mismatches];
00369 }
00370 float64_t sumi2 = 0.0 ;
00371
00372 mismatches=0;
00373 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00374 {
00375 if (avec[i+j]!=bvec[i+j+k])
00376 {
00377 mismatches++ ;
00378 if (mismatches>max_mismatch)
00379 break ;
00380 } ;
00381 sumi2 += weights[j+degree*mismatches];
00382 }
00383 if (position_weights!=NULL)
00384 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00385 else
00386 max_shift_vec[k-1] += sumi1 + sumi2 ;
00387 } ;
00388 }
00389
00390 float64_t result = sum0 ;
00391 for (int32_t i=0; i<max_shift; i++)
00392 result += max_shift_vec[i]/(2*(i+1)) ;
00393
00394 delete[] max_shift_vec;
00395 return result ;
00396 }
00397
00398 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch(
00399 char* avec, int32_t alen, char* bvec, int32_t blen)
00400 {
00401 float64_t* max_shift_vec = new float64_t[max_shift];
00402 float64_t sum0=0 ;
00403 for (int32_t i=0; i<max_shift; i++)
00404 max_shift_vec[i]=0 ;
00405
00406
00407 for (int32_t i=0; i<alen; i++)
00408 {
00409 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00410 continue ;
00411
00412 float64_t sumi = 0.0 ;
00413 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00414 {
00415 if (avec[i+j]!=bvec[i+j])
00416 break ;
00417 sumi += weights[j];
00418 }
00419 if (position_weights!=NULL)
00420 sum0 += position_weights[i]*sumi ;
00421 else
00422 sum0 += sumi ;
00423 } ;
00424
00425 for (int32_t i=0; i<alen; i++)
00426 {
00427 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00428 {
00429 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00430 continue ;
00431
00432 float64_t sumi1 = 0.0 ;
00433
00434 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00435 {
00436 if (avec[i+j+k]!=bvec[i+j])
00437 break ;
00438 sumi1 += weights[j];
00439 }
00440 float64_t sumi2 = 0.0 ;
00441
00442 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00443 {
00444 if (avec[i+j]!=bvec[i+j+k])
00445 break ;
00446 sumi2 += weights[j];
00447 }
00448 if (position_weights!=NULL)
00449 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00450 else
00451 max_shift_vec[k-1] += sumi1 + sumi2 ;
00452 } ;
00453 }
00454
00455 float64_t result = sum0 ;
00456 for (int32_t i=0; i<max_shift; i++)
00457 result += max_shift_vec[i]/(2*(i+1)) ;
00458
00459 delete[] max_shift_vec;
00460
00461 return result ;
00462 }
00463
00464 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch_matrix(
00465 char* avec, int32_t alen, char* bvec, int32_t blen)
00466 {
00467 float64_t* max_shift_vec = new float64_t[max_shift];
00468 float64_t sum0=0 ;
00469 for (int32_t i=0; i<max_shift; i++)
00470 max_shift_vec[i]=0 ;
00471
00472
00473 for (int32_t i=0; i<alen; i++)
00474 {
00475 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00476 continue ;
00477 float64_t sumi = 0.0 ;
00478 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00479 {
00480 if (avec[i+j]!=bvec[i+j])
00481 break ;
00482 sumi += weights[i*degree+j];
00483 }
00484 if (position_weights!=NULL)
00485 sum0 += position_weights[i]*sumi ;
00486 else
00487 sum0 += sumi ;
00488 } ;
00489
00490 for (int32_t i=0; i<alen; i++)
00491 {
00492 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00493 {
00494 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00495 continue ;
00496
00497 float64_t sumi1 = 0.0 ;
00498
00499 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00500 {
00501 if (avec[i+j+k]!=bvec[i+j])
00502 break ;
00503 sumi1 += weights[i*degree+j];
00504 }
00505 float64_t sumi2 = 0.0 ;
00506
00507 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00508 {
00509 if (avec[i+j]!=bvec[i+j+k])
00510 break ;
00511 sumi2 += weights[i*degree+j];
00512 }
00513 if (position_weights!=NULL)
00514 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00515 else
00516 max_shift_vec[k-1] += sumi1 + sumi2 ;
00517 } ;
00518 }
00519
00520 float64_t result = sum0 ;
00521 for (int32_t i=0; i<max_shift; i++)
00522 result += max_shift_vec[i]/(2*(i+1)) ;
00523
00524 delete[] max_shift_vec;
00525 return result ;
00526 }
00527
00528 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch_position_weights(
00529 char* avec, float64_t* pos_weights_lhs, int32_t alen, char* bvec,
00530 float64_t* pos_weights_rhs, int32_t blen)
00531 {
00532 float64_t* max_shift_vec = new float64_t[max_shift];
00533 float64_t sum0=0 ;
00534 for (int32_t i=0; i<max_shift; i++)
00535 max_shift_vec[i]=0 ;
00536
00537
00538 for (int32_t i=0; i<alen; i++)
00539 {
00540 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00541 continue ;
00542
00543 float64_t sumi = 0.0 ;
00544 float64_t posweight_lhs = 0.0 ;
00545 float64_t posweight_rhs = 0.0 ;
00546 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00547 {
00548 posweight_lhs += pos_weights_lhs[i+j] ;
00549 posweight_rhs += pos_weights_rhs[i+j] ;
00550
00551 if (avec[i+j]!=bvec[i+j])
00552 break ;
00553 sumi += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00554 }
00555 if (position_weights!=NULL)
00556 sum0 += position_weights[i]*sumi ;
00557 else
00558 sum0 += sumi ;
00559 } ;
00560
00561 for (int32_t i=0; i<alen; i++)
00562 {
00563 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00564 {
00565 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00566 continue ;
00567
00568
00569 float64_t sumi1 = 0.0 ;
00570 float64_t posweight_lhs = 0.0 ;
00571 float64_t posweight_rhs = 0.0 ;
00572 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00573 {
00574 posweight_lhs += pos_weights_lhs[i+j+k] ;
00575 posweight_rhs += pos_weights_rhs[i+j] ;
00576 if (avec[i+j+k]!=bvec[i+j])
00577 break ;
00578 sumi1 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00579 }
00580
00581 float64_t sumi2 = 0.0 ;
00582 posweight_lhs = 0.0 ;
00583 posweight_rhs = 0.0 ;
00584 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00585 {
00586 posweight_lhs += pos_weights_lhs[i+j] ;
00587 posweight_rhs += pos_weights_rhs[i+j+k] ;
00588 if (avec[i+j]!=bvec[i+j+k])
00589 break ;
00590 sumi2 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00591 }
00592 if (position_weights!=NULL)
00593 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00594 else
00595 max_shift_vec[k-1] += sumi1 + sumi2 ;
00596 } ;
00597 }
00598
00599 float64_t result = sum0 ;
00600 for (int32_t i=0; i<max_shift; i++)
00601 result += max_shift_vec[i]/(2*(i+1)) ;
00602
00603 delete[] max_shift_vec;
00604 return result ;
00605 }
00606
00607
00608 float64_t CWeightedDegreePositionStringKernel::compute(
00609 int32_t idx_a, int32_t idx_b)
00610 {
00611 int32_t alen, blen;
00612 bool free_avec, free_bvec;
00613
00614 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00615 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00616
00617 ASSERT(alen==blen);
00618 ASSERT(shift_len==alen);
00619
00620 float64_t result = 0 ;
00621 if (position_weights_lhs!=NULL || position_weights_rhs!=NULL)
00622 {
00623 ASSERT(max_mismatch==0);
00624 float64_t* position_weights_rhs_ = position_weights_rhs ;
00625 if (lhs==rhs)
00626 position_weights_rhs_ = position_weights_lhs ;
00627
00628 result = compute_without_mismatch_position_weights(avec, &position_weights_lhs[idx_a*alen], alen, bvec, &position_weights_rhs_[idx_b*blen], blen) ;
00629 }
00630 else if (max_mismatch > 0)
00631 result = compute_with_mismatch(avec, alen, bvec, blen) ;
00632 else if (length==0)
00633 result = compute_without_mismatch(avec, alen, bvec, blen) ;
00634 else
00635 result = compute_without_mismatch_matrix(avec, alen, bvec, blen) ;
00636
00637 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00638 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00639
00640 return result ;
00641 }
00642
00643
00644 void CWeightedDegreePositionStringKernel::add_example_to_tree(
00645 int32_t idx, float64_t alpha)
00646 {
00647 ASSERT(position_weights_lhs==NULL);
00648 ASSERT(position_weights_rhs==NULL);
00649 ASSERT(alphabet);
00650 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00651
00652 int32_t len=0;
00653 bool free_vec;
00654 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00655 ASSERT(max_mismatch==0);
00656 int32_t *vec = new int32_t[len] ;
00657
00658 for (int32_t i=0; i<len; i++)
00659 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00660 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00661
00662 if (opt_type==FASTBUTMEMHUNGRY)
00663 {
00664
00665 ASSERT(!TRIES(get_use_compact_terminal_nodes()));
00666 }
00667
00668 for (int32_t i=0; i<len; i++)
00669 {
00670 int32_t max_s=-1;
00671
00672 if (opt_type==SLOWBUTMEMEFFICIENT)
00673 max_s=0;
00674 else if (opt_type==FASTBUTMEMHUNGRY)
00675 max_s=shift[i];
00676 else {
00677 SG_ERROR( "unknown optimization type\n");
00678 }
00679
00680 for (int32_t s=max_s; s>=0; s--)
00681 {
00682 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00683 TRIES(add_to_trie(i, s, vec, alpha_pw, weights, (length!=0))) ;
00684 if ((s==0) || (i+s>=len))
00685 continue;
00686
00687 TRIES(add_to_trie(i+s, -s, vec, alpha_pw, weights, (length!=0))) ;
00688 }
00689 }
00690
00691 delete[] vec ;
00692 tree_initialized=true ;
00693 }
00694
00695 void CWeightedDegreePositionStringKernel::add_example_to_single_tree(
00696 int32_t idx, float64_t alpha, int32_t tree_num)
00697 {
00698 ASSERT(position_weights_lhs==NULL);
00699 ASSERT(position_weights_rhs==NULL);
00700 ASSERT(alphabet);
00701 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00702
00703 int32_t len=0;
00704 bool free_vec;
00705 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00706 ASSERT(max_mismatch==0);
00707 int32_t *vec=new int32_t[len];
00708 int32_t max_s=-1;
00709
00710 if (opt_type==SLOWBUTMEMEFFICIENT)
00711 max_s=0;
00712 else if (opt_type==FASTBUTMEMHUNGRY)
00713 {
00714 ASSERT(!tries.get_use_compact_terminal_nodes());
00715 max_s=shift[tree_num];
00716 }
00717 else {
00718 SG_ERROR( "unknown optimization type\n");
00719 }
00720 for (int32_t i=CMath::max(0,tree_num-max_shift);
00721 i<CMath::min(len,tree_num+degree+max_shift); i++)
00722 {
00723 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00724 }
00725 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00726
00727 for (int32_t s=max_s; s>=0; s--)
00728 {
00729 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00730 tries.add_to_trie(tree_num, s, vec, alpha_pw, weights, (length!=0)) ;
00731 }
00732
00733 if (opt_type==FASTBUTMEMHUNGRY)
00734 {
00735 for (int32_t i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+max_shift+1); i++)
00736 {
00737 int32_t s=tree_num-i;
00738 if ((i+s<len) && (s>=1) && (s<=shift[i]))
00739 {
00740 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00741 tries.add_to_trie(tree_num, -s, vec, alpha_pw, weights, (length!=0)) ;
00742 }
00743 }
00744 }
00745 delete[] vec ;
00746 tree_initialized=true ;
00747 }
00748
00749 float64_t CWeightedDegreePositionStringKernel::compute_by_tree(int32_t idx)
00750 {
00751 ASSERT(position_weights_lhs==NULL);
00752 ASSERT(position_weights_rhs==NULL);
00753 ASSERT(alphabet);
00754 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00755
00756 float64_t sum=0;
00757 int32_t len=0;
00758 bool free_vec;
00759 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00760 ASSERT(max_mismatch==0);
00761 int32_t *vec=new int32_t[len];
00762
00763 for (int32_t i=0; i<len; i++)
00764 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00765
00766 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00767
00768 for (int32_t i=0; i<len; i++)
00769 sum += tries.compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)) ;
00770
00771 if (opt_type==SLOWBUTMEMEFFICIENT)
00772 {
00773 for (int32_t i=0; i<len; i++)
00774 {
00775 for (int32_t s=1; (s<=shift[i]) && (i+s<len); s++)
00776 {
00777 sum+=tries.compute_by_tree_helper(vec, len, i, i+s, i, weights, (length!=0))/(2*s) ;
00778 sum+=tries.compute_by_tree_helper(vec, len, i+s, i, i+s, weights, (length!=0))/(2*s) ;
00779 }
00780 }
00781 }
00782
00783 delete[] vec ;
00784
00785 return normalizer->normalize_rhs(sum, idx);
00786 }
00787
00788 void CWeightedDegreePositionStringKernel::compute_by_tree(
00789 int32_t idx, float64_t* LevelContrib)
00790 {
00791 ASSERT(position_weights_lhs==NULL);
00792 ASSERT(position_weights_rhs==NULL);
00793 ASSERT(alphabet);
00794 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00795
00796 int32_t len=0;
00797 bool free_vec;
00798 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00799 ASSERT(max_mismatch==0);
00800 int32_t *vec=new int32_t[len];
00801
00802 for (int32_t i=0; i<len; i++)
00803 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00804
00805 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00806
00807 for (int32_t i=0; i<len; i++)
00808 {
00809 tries.compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00810 normalizer->normalize_rhs(1.0, idx), mkl_stepsize, weights,
00811 (length!=0));
00812 }
00813
00814 if (opt_type==SLOWBUTMEMEFFICIENT)
00815 {
00816 for (int32_t i=0; i<len; i++)
00817 for (int32_t k=1; (k<=shift[i]) && (i+k<len); k++)
00818 {
00819 tries.compute_by_tree_helper(vec, len, i, i+k, i, LevelContrib,
00820 normalizer->normalize_rhs(1.0/(2*k), idx), mkl_stepsize,
00821 weights, (length!=0)) ;
00822 tries.compute_by_tree_helper(vec, len, i+k, i, i+k,
00823 LevelContrib, normalizer->normalize_rhs(1.0/(2*k), idx),
00824 mkl_stepsize, weights, (length!=0)) ;
00825 }
00826 }
00827
00828 delete[] vec ;
00829 }
00830
00831 float64_t* CWeightedDegreePositionStringKernel::compute_abs_weights(
00832 int32_t &len)
00833 {
00834 return tries.compute_abs_weights(len);
00835 }
00836
00837 bool CWeightedDegreePositionStringKernel::set_shifts(
00838 int32_t* shift_, int32_t shift_len_)
00839 {
00840 delete[] shift;
00841
00842 shift_len = shift_len_ ;
00843 shift = new int32_t[shift_len] ;
00844
00845 if (shift)
00846 {
00847 max_shift = 0 ;
00848
00849 for (int32_t i=0; i<shift_len; i++)
00850 {
00851 shift[i] = shift_[i] ;
00852 max_shift = CMath::max(shift[i], max_shift);
00853 }
00854
00855 ASSERT(max_shift>=0 && max_shift<=shift_len);
00856 }
00857
00858 return false;
00859 }
00860
00861 bool CWeightedDegreePositionStringKernel::set_wd_weights()
00862 {
00863 ASSERT(degree>0);
00864
00865 delete[] weights;
00866 weights=new float64_t[degree];
00867 weights_degree=degree;
00868 weights_length=1;
00869
00870 if (weights)
00871 {
00872 int32_t i;
00873 float64_t sum=0;
00874 for (i=0; i<degree; i++)
00875 {
00876 weights[i]=degree-i;
00877 sum+=weights[i];
00878 }
00879 for (i=0; i<degree; i++)
00880 weights[i]/=sum;
00881
00882 for (i=0; i<degree; i++)
00883 {
00884 for (int32_t j=1; j<=max_mismatch; j++)
00885 {
00886 if (j<i+1)
00887 {
00888 int32_t nk=CMath::nchoosek(i+1, j);
00889 weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00890 }
00891 else
00892 weights[i+j*degree]= 0;
00893 }
00894 }
00895
00896 return true;
00897 }
00898 else
00899 return false;
00900 }
00901
00902 bool CWeightedDegreePositionStringKernel::set_weights(
00903 float64_t* ws, int32_t d, int32_t len)
00904 {
00905 if (d!=degree || len<0)
00906 SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree);
00907
00908 degree=d;
00909 length=len;
00910
00911 if (len <= 0)
00912 len=1;
00913
00914 weights_degree=degree;
00915 weights_length=len+max_mismatch;
00916
00917 SG_DEBUG("Creating weights of size %dx%d\n", weights_degree, weights_length);
00918 int32_t num_weights=weights_degree*weights_length;
00919 delete[] weights;
00920 weights=new float64_t[num_weights];
00921
00922 for (int32_t i=0; i<degree*len; i++)
00923 weights[i]=ws[i];
00924
00925 return true;
00926 }
00927
00928 bool CWeightedDegreePositionStringKernel::set_position_weights(
00929 float64_t* pws, int32_t len)
00930 {
00931 if (seq_length==0)
00932 seq_length=len;
00933
00934 if (seq_length!=len)
00935 {
00936 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00937 return false;
00938 }
00939 delete[] position_weights;
00940 position_weights=new float64_t[len];
00941 position_weights_len=len;
00942 tries.set_position_weights(position_weights);
00943
00944 if (position_weights)
00945 {
00946 for (int32_t i=0; i<len; i++)
00947 position_weights[i]=pws[i];
00948 return true;
00949 }
00950 else
00951 return false;
00952 }
00953
00954 bool CWeightedDegreePositionStringKernel::set_position_weights_lhs(float64_t* pws, int32_t len, int32_t num)
00955 {
00956 if (position_weights_rhs==position_weights_lhs)
00957 position_weights_rhs=NULL;
00958 else
00959 delete_position_weights_rhs();
00960
00961 if (len==0)
00962 {
00963 return delete_position_weights_lhs();
00964 }
00965
00966 if (seq_length!=len)
00967 {
00968 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00969 return false;
00970 }
00971
00972 delete[] position_weights_lhs;
00973 position_weights_lhs=new float64_t[len*num];
00974 position_weights_lhs_len=len*num;
00975
00976 for (int32_t i=0; i<len*num; i++)
00977 position_weights_lhs[i]=pws[i];
00978
00979 return true;
00980 }
00981
00982 bool CWeightedDegreePositionStringKernel::set_position_weights_rhs(
00983 float64_t* pws, int32_t len, int32_t num)
00984 {
00985 if (len==0)
00986 {
00987 if (position_weights_rhs==position_weights_lhs)
00988 {
00989 position_weights_rhs=NULL;
00990 return true;
00991 }
00992 return delete_position_weights_rhs();
00993 }
00994
00995 if (seq_length!=len)
00996 {
00997 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00998 return false;
00999 }
01000
01001 delete[] position_weights_rhs;
01002 position_weights_rhs=new float64_t[len*num];
01003 position_weights_rhs_len=len*num;
01004
01005 for (int32_t i=0; i<len*num; i++)
01006 position_weights_rhs[i]=pws[i];
01007
01008 return true;
01009 }
01010
01011 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd()
01012 {
01013 delete[] block_weights;
01014 block_weights=new float64_t[CMath::max(seq_length,degree)];
01015
01016 if (block_weights)
01017 {
01018 int32_t k;
01019 float64_t d=degree;
01020
01021 for (k=0; k<degree; k++)
01022 block_weights[k]=
01023 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
01024 for (k=degree; k<seq_length; k++)
01025 block_weights[k]=(-d+3*k+4)/3;
01026 }
01027
01028 return (block_weights!=NULL);
01029 }
01030
01031 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd_external()
01032 {
01033 ASSERT(weights);
01034 delete[] block_weights;
01035 block_weights=new float64_t[CMath::max(seq_length,degree)];
01036
01037 if (block_weights)
01038 {
01039 int32_t i=0;
01040 block_weights[0]=weights[0];
01041 for (i=1; i<CMath::max(seq_length,degree); i++)
01042 block_weights[i]=0;
01043
01044 for (i=1; i<CMath::max(seq_length,degree); i++)
01045 {
01046 block_weights[i]=block_weights[i-1];
01047
01048 float64_t contrib=0;
01049 for (int32_t j=0; j<CMath::min(degree,i+1); j++)
01050 contrib+=weights[j];
01051
01052 block_weights[i]+=contrib;
01053 }
01054 }
01055
01056 return (block_weights!=NULL);
01057 }
01058
01059 bool CWeightedDegreePositionStringKernel::init_block_weights_const()
01060 {
01061 delete[] block_weights;
01062 block_weights=new float64_t[seq_length];
01063
01064 if (block_weights)
01065 {
01066 for (int32_t i=1; i<seq_length+1 ; i++)
01067 block_weights[i-1]=1.0/seq_length;
01068 }
01069
01070 return (block_weights!=NULL);
01071 }
01072
01073 bool CWeightedDegreePositionStringKernel::init_block_weights_linear()
01074 {
01075 delete[] block_weights;
01076 block_weights=new float64_t[seq_length];
01077
01078 if (block_weights)
01079 {
01080 for (int32_t i=1; i<seq_length+1 ; i++)
01081 block_weights[i-1]=degree*i;
01082 }
01083
01084 return (block_weights!=NULL);
01085 }
01086
01087 bool CWeightedDegreePositionStringKernel::init_block_weights_sqpoly()
01088 {
01089 delete[] block_weights;
01090 block_weights=new float64_t[seq_length];
01091
01092 if (block_weights)
01093 {
01094 for (int32_t i=1; i<degree+1 ; i++)
01095 block_weights[i-1]=((float64_t) i)*i;
01096
01097 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01098 block_weights[i-1]=i;
01099 }
01100
01101 return (block_weights!=NULL);
01102 }
01103
01104 bool CWeightedDegreePositionStringKernel::init_block_weights_cubicpoly()
01105 {
01106 delete[] block_weights;
01107 block_weights=new float64_t[seq_length];
01108
01109 if (block_weights)
01110 {
01111 for (int32_t i=1; i<degree+1 ; i++)
01112 block_weights[i-1]=((float64_t) i)*i*i;
01113
01114 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01115 block_weights[i-1]=i;
01116 }
01117
01118 return (block_weights!=NULL);
01119 }
01120
01121 bool CWeightedDegreePositionStringKernel::init_block_weights_exp()
01122 {
01123 delete[] block_weights;
01124 block_weights=new float64_t[seq_length];
01125
01126 if (block_weights)
01127 {
01128 for (int32_t i=1; i<degree+1 ; i++)
01129 block_weights[i-1]=exp(((float64_t) i/10.0));
01130
01131 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01132 block_weights[i-1]=i;
01133 }
01134
01135 return (block_weights!=NULL);
01136 }
01137
01138 bool CWeightedDegreePositionStringKernel::init_block_weights_log()
01139 {
01140 delete[] block_weights;
01141 block_weights=new float64_t[seq_length];
01142
01143 if (block_weights)
01144 {
01145 for (int32_t i=1; i<degree+1 ; i++)
01146 block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
01147
01148 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01149 block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
01150 }
01151
01152 return (block_weights!=NULL);
01153 }
01154
01155 bool CWeightedDegreePositionStringKernel::init_block_weights()
01156 {
01157 switch (type)
01158 {
01159 case E_WD:
01160 return init_block_weights_from_wd();
01161 case E_EXTERNAL:
01162 return init_block_weights_from_wd_external();
01163 case E_BLOCK_CONST:
01164 return init_block_weights_const();
01165 case E_BLOCK_LINEAR:
01166 return init_block_weights_linear();
01167 case E_BLOCK_SQPOLY:
01168 return init_block_weights_sqpoly();
01169 case E_BLOCK_CUBICPOLY:
01170 return init_block_weights_cubicpoly();
01171 case E_BLOCK_EXP:
01172 return init_block_weights_exp();
01173 case E_BLOCK_LOG:
01174 return init_block_weights_log();
01175 };
01176 return false;
01177 }
01178
01179
01180
01181 void* CWeightedDegreePositionStringKernel::compute_batch_helper(void* p)
01182 {
01183 S_THREAD_PARAM<DNATrie>* params = (S_THREAD_PARAM<DNATrie>*) p;
01184 int32_t j=params->j;
01185 CWeightedDegreePositionStringKernel* wd=params->kernel;
01186 CTrie<DNATrie>* tries=params->tries;
01187 float64_t* weights=params->weights;
01188 int32_t length=params->length;
01189 int32_t max_shift=params->max_shift;
01190 int32_t* vec=params->vec;
01191 float64_t* result=params->result;
01192 float64_t factor=params->factor;
01193 int32_t* shift=params->shift;
01194 int32_t* vec_idx=params->vec_idx;
01195
01196 for (int32_t i=params->start; i<params->end; i++)
01197 {
01198 int32_t len=0;
01199 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
01200 CAlphabet* alpha=wd->alphabet;
01201
01202 bool free_vec;
01203 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
01204 for (int32_t k=CMath::max(0,j-max_shift); k<CMath::min(len,j+wd->get_degree()+max_shift); k++)
01205 vec[k]=alpha->remap_to_bin(char_vec[k]);
01206 rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
01207
01208 SG_UNREF(rhs_feat);
01209
01210 result[i] += factor*wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
01211
01212 if (wd->get_optimization_type()==SLOWBUTMEMEFFICIENT)
01213 {
01214 for (int32_t q=CMath::max(0,j-max_shift); q<CMath::min(len,j+max_shift+1); q++)
01215 {
01216 int32_t s=j-q ;
01217 if ((s>=1) && (s<=shift[q]) && (q+s<len))
01218 {
01219 result[i] +=
01220 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec,
01221 len, q, q+s, q, weights, (length!=0)),
01222 vec_idx[i])/(2.0*s);
01223 }
01224 }
01225
01226 for (int32_t s=1; (s<=shift[j]) && (j+s<len); s++)
01227 {
01228 result[i] +=
01229 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec,
01230 len, j+s, j, j+s, weights, (length!=0)),
01231 vec_idx[i])/(2.0*s);
01232 }
01233 }
01234 }
01235
01236 return NULL;
01237 }
01238
01239 void CWeightedDegreePositionStringKernel::compute_batch(
01240 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
01241 int32_t* IDX, float64_t* alphas, float64_t factor)
01242 {
01243 ASSERT(alphabet);
01244 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01245 ASSERT(position_weights_lhs==NULL);
01246 ASSERT(position_weights_rhs==NULL);
01247 ASSERT(rhs);
01248 ASSERT(num_vec<=rhs->get_num_vectors());
01249 ASSERT(num_vec>0);
01250 ASSERT(vec_idx);
01251 ASSERT(result);
01252 create_empty_tries();
01253
01254 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01255 ASSERT(num_feat>0);
01256 int32_t num_threads=parallel->get_num_threads();
01257 ASSERT(num_threads>0);
01258 int32_t* vec=new int32_t[num_threads*num_feat];
01259
01260 if (num_threads < 2)
01261 {
01262 #ifdef WIN32
01263 for (int32_t j=0; j<num_feat; j++)
01264 #else
01265 CSignal::clear_cancel();
01266 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01267 #endif
01268 {
01269 init_optimization(num_suppvec, IDX, alphas, j);
01270 S_THREAD_PARAM<DNATrie> params;
01271 params.vec=vec;
01272 params.result=result;
01273 params.weights=weights;
01274 params.kernel=this;
01275 params.tries=&tries;
01276 params.factor=factor;
01277 params.j=j;
01278 params.start=0;
01279 params.end=num_vec;
01280 params.length=length;
01281 params.max_shift=max_shift;
01282 params.shift=shift;
01283 params.vec_idx=vec_idx;
01284 compute_batch_helper((void*) ¶ms);
01285
01286 SG_PROGRESS(j,0,num_feat);
01287 }
01288 }
01289 #ifndef WIN32
01290 else
01291 {
01292
01293 CSignal::clear_cancel();
01294 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01295 {
01296 init_optimization(num_suppvec, IDX, alphas, j);
01297 pthread_t* threads = new pthread_t[num_threads-1];
01298 S_THREAD_PARAM<DNATrie>* params = new S_THREAD_PARAM<DNATrie>[num_threads];
01299 int32_t step= num_vec/num_threads;
01300 int32_t t;
01301
01302 for (t=0; t<num_threads-1; t++)
01303 {
01304 params[t].vec=&vec[num_feat*t];
01305 params[t].result=result;
01306 params[t].weights=weights;
01307 params[t].kernel=this;
01308 params[t].tries=&tries;
01309 params[t].factor=factor;
01310 params[t].j=j;
01311 params[t].start = t*step;
01312 params[t].end = (t+1)*step;
01313 params[t].length=length;
01314 params[t].max_shift=max_shift;
01315 params[t].shift=shift;
01316 params[t].vec_idx=vec_idx;
01317 pthread_create(&threads[t], NULL, CWeightedDegreePositionStringKernel::compute_batch_helper, (void*)¶ms[t]);
01318 }
01319
01320 params[t].vec=&vec[num_feat*t];
01321 params[t].result=result;
01322 params[t].weights=weights;
01323 params[t].kernel=this;
01324 params[t].tries=&tries;
01325 params[t].factor=factor;
01326 params[t].j=j;
01327 params[t].start=t*step;
01328 params[t].end=num_vec;
01329 params[t].length=length;
01330 params[t].max_shift=max_shift;
01331 params[t].shift=shift;
01332 params[t].vec_idx=vec_idx;
01333 compute_batch_helper((void*) ¶ms[t]);
01334
01335 for (t=0; t<num_threads-1; t++)
01336 pthread_join(threads[t], NULL);
01337 SG_PROGRESS(j,0,num_feat);
01338
01339 delete[] params;
01340 delete[] threads;
01341 }
01342 }
01343 #endif
01344
01345 delete[] vec;
01346
01347
01348
01349 create_empty_tries();
01350 }
01351
01352 float64_t* CWeightedDegreePositionStringKernel::compute_scoring(
01353 int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* result,
01354 int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01355 {
01356 ASSERT(position_weights_lhs==NULL);
01357 ASSERT(position_weights_rhs==NULL);
01358
01359 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01360 ASSERT(num_feat>0);
01361 ASSERT(alphabet);
01362 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01363
01364 num_sym=4;
01365
01366 ASSERT(max_degree>0);
01367
01368
01369 int32_t* nofsKmers=new int32_t[max_degree];
01370 float64_t** C=new float64_t*[max_degree];
01371 float64_t** L=new float64_t*[max_degree];
01372 float64_t** R=new float64_t*[max_degree];
01373
01374 int32_t i;
01375 int32_t k;
01376
01377
01378 int32_t bigtabSize=0;
01379 for (k=0; k<max_degree; ++k )
01380 {
01381 nofsKmers[k]=(int32_t) CMath::pow(num_sym, k+1);
01382 const int32_t tabSize=nofsKmers[k]*num_feat;
01383 bigtabSize+=tabSize;
01384 }
01385 result=new float64_t[bigtabSize];
01386
01387
01388 int32_t tabOffs=0;
01389 for( k = 0; k < max_degree; ++k )
01390 {
01391 const int32_t tabSize = nofsKmers[k] * num_feat;
01392 C[k] = &result[tabOffs];
01393 L[k] = new float64_t[ tabSize ];
01394 R[k] = new float64_t[ tabSize ];
01395 tabOffs+=tabSize;
01396 for(i = 0; i < tabSize; i++ )
01397 {
01398 C[k][i] = 0.0;
01399 L[k][i] = 0.0;
01400 R[k][i] = 0.0;
01401 }
01402 }
01403
01404
01405 float64_t* margFactors=new float64_t[degree];
01406
01407 int32_t* x = new int32_t[ degree+1 ];
01408 int32_t* substrs = new int32_t[ degree+1 ];
01409
01410 margFactors[0] = 1.0;
01411 substrs[0] = 0;
01412 for( k=1; k < degree; ++k ) {
01413 margFactors[k] = 0.25 * margFactors[k-1];
01414 substrs[k] = -1;
01415 }
01416 substrs[degree] = -1;
01417
01418 struct TreeParseInfo info;
01419 info.num_sym = num_sym;
01420 info.num_feat = num_feat;
01421 info.p = -1;
01422 info.k = -1;
01423 info.nofsKmers = nofsKmers;
01424 info.margFactors = margFactors;
01425 info.x = x;
01426 info.substrs = substrs;
01427 info.y0 = 0;
01428 info.C_k = NULL;
01429 info.L_k = NULL;
01430 info.R_k = NULL;
01431
01432
01433 i = 0;
01434 for( k = 0; k < max_degree; ++k )
01435 {
01436 const int32_t nofKmers = nofsKmers[ k ];
01437 info.C_k = C[k];
01438 info.L_k = L[k];
01439 info.R_k = R[k];
01440
01441
01442 for(int32_t p = 0; p < num_feat; ++p )
01443 {
01444 init_optimization( num_suppvec, IDX, alphas, p );
01445 int32_t tree = p ;
01446 for(int32_t j = 0; j < degree+1; j++ ) {
01447 x[j] = -1;
01448 }
01449 tries.traverse( tree, p, info, 0, x, k );
01450 SG_PROGRESS(i++,0,num_feat*max_degree);
01451 }
01452
01453
01454 if( k > 0 ) {
01455 const int32_t j = k - 1;
01456 const int32_t nofJmers = (int32_t) CMath::pow( num_sym, j+1 );
01457 for(int32_t p = 0; p < num_feat; ++p ) {
01458 const int32_t offsetJ = nofJmers * p;
01459 const int32_t offsetJ1 = nofJmers * (p+1);
01460 const int32_t offsetK = nofKmers * p;
01461 int32_t y;
01462 int32_t sym;
01463 for( y = 0; y < nofJmers; ++y ) {
01464 for( sym = 0; sym < num_sym; ++sym ) {
01465 const int32_t y_sym = num_sym*y + sym;
01466 const int32_t sym_y = nofJmers*sym + y;
01467 ASSERT(0<=y_sym && y_sym<nofKmers);
01468 ASSERT(0<=sym_y && sym_y<nofKmers);
01469 C[k][ y_sym + offsetK ] += L[j][ y + offsetJ ];
01470 if( p < num_feat-1 ) {
01471 C[k][ sym_y + offsetK ] += R[j][ y + offsetJ1 ];
01472 }
01473 }
01474 }
01475 }
01476 }
01477
01478
01479
01480
01481
01482
01483
01484
01485
01486
01487
01488 }
01489
01490
01491 num_feat=1;
01492 num_sym = bigtabSize;
01493
01494 delete[] nofsKmers;
01495 delete[] margFactors;
01496 delete[] substrs;
01497 delete[] x;
01498 delete[] C;
01499 for( k = 0; k < max_degree; ++k ) {
01500 delete[] L[k];
01501 delete[] R[k];
01502 }
01503 delete[] L;
01504 delete[] R;
01505 return result;
01506 }
01507
01508 char* CWeightedDegreePositionStringKernel::compute_consensus(
01509 int32_t &num_feat, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01510 {
01511 ASSERT(position_weights_lhs==NULL);
01512 ASSERT(position_weights_rhs==NULL);
01513
01514 ASSERT(degree<=32);
01515 ASSERT(!tries.get_use_compact_terminal_nodes());
01516 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01517 ASSERT(num_feat>0);
01518 ASSERT(alphabet);
01519 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01520
01521
01522 char* result=new char[num_feat];
01523
01524
01525 int32_t num_tables=CMath::max(1,num_feat-degree+1);
01526 DynArray<ConsensusEntry>** table=new DynArray<ConsensusEntry>*[num_tables];
01527
01528 for (int32_t i=0; i<num_tables; i++)
01529 table[i]=new DynArray<ConsensusEntry>(num_suppvec/10);
01530
01531
01532 for (int32_t i=0; i<num_tables; i++)
01533 {
01534 bool cumulative=false;
01535
01536 if (i<num_tables-1)
01537 init_optimization(num_suppvec, IDX, alphas, i);
01538 else
01539 {
01540 init_optimization(num_suppvec, IDX, alphas, i, num_feat-1);
01541 cumulative=true;
01542 }
01543
01544 if (i==0)
01545 tries.fill_backtracking_table(i, NULL, table[i], cumulative, weights);
01546 else
01547 tries.fill_backtracking_table(i, table[i-1], table[i], cumulative, weights);
01548
01549 SG_PROGRESS(i,0,num_feat);
01550 }
01551
01552
01553
01554
01555
01556
01557
01558
01559
01560
01561
01562
01563
01564
01565
01566
01567
01568
01569
01570
01571
01572
01573
01574 const char* acgt="ACGT";
01575
01576
01577 int32_t max_idx=-1;
01578 float32_t max_score=0;
01579 int32_t num_elements=table[num_tables-1]->get_num_elements();
01580
01581 for (int32_t i=0; i<num_elements; i++)
01582 {
01583 float64_t sc=table[num_tables-1]->get_element(i).score;
01584 if (sc>max_score || max_idx==-1)
01585 {
01586 max_idx=i;
01587 max_score=sc;
01588 }
01589 }
01590 uint64_t endstr=table[num_tables-1]->get_element(max_idx).string;
01591
01592 SG_INFO("max_idx:%d num_el:%d num_feat:%d num_tables:%d max_score:%f\n", max_idx, num_elements, num_feat, num_tables, max_score);
01593
01594 for (int32_t i=0; i<degree; i++)
01595 result[num_feat-1-i]=acgt[(endstr >> (2*i)) & 3];
01596
01597 if (num_tables>1)
01598 {
01599 for (int32_t i=num_tables-1; i>=0; i--)
01600 {
01601
01602 result[i]=acgt[table[i]->get_element(max_idx).string >> (2*(degree-1)) & 3];
01603 max_idx=table[i]->get_element(max_idx).bt;
01604 }
01605 }
01606
01607
01608
01609
01610
01611
01612
01613
01614
01615
01616
01617 for (int32_t i=0; i<num_tables; i++)
01618 delete table[i];
01619
01620 delete[] table;
01621 return result;
01622 }
01623
01624
01625 float64_t* CWeightedDegreePositionStringKernel::extract_w(
01626 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
01627 float64_t* w_result, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01628 {
01629 delete_optimization();
01630 use_poim_tries=true;
01631 poim_tries.delete_trees(false);
01632
01633
01634 ASSERT(position_weights_lhs==NULL);
01635 ASSERT(position_weights_rhs==NULL);
01636 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01637 ASSERT(num_feat>0);
01638 ASSERT(alphabet->get_alphabet()==DNA);
01639 ASSERT(max_degree>0);
01640
01641
01642 static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
01643 const int32_t seqLen = num_feat;
01644 float64_t** subs;
01645 int32_t i;
01646 int32_t k;
01647
01648
01649
01650
01651 int32_t* offsets;
01652 int32_t offset;
01653 offsets = new int32_t[ max_degree ];
01654 offset = 0;
01655 for( k = 0; k < max_degree; ++k ) {
01656 offsets[k] = offset;
01657 const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
01658 const int32_t tabSize = nofsKmers * seqLen;
01659 offset += tabSize;
01660 }
01661
01662 const int32_t bigTabSize = offset;
01663 w_result=new float64_t[bigTabSize];
01664 for (i=0; i<bigTabSize; ++i)
01665 w_result[i]=0;
01666
01667
01668 subs = new float64_t*[ max_degree ];
01669 ASSERT( subs != NULL );
01670 for( k = 0; k < max_degree; ++k ) {
01671 subs[k] = &w_result[ offsets[k] ];
01672 }
01673 delete[] offsets;
01674
01675
01676 init_optimization( num_suppvec, IDX, alphas, -1);
01677 poim_tries.POIMs_extract_W( subs, max_degree );
01678
01679
01680 delete[] subs;
01681 num_feat = 1;
01682 num_sym = bigTabSize;
01683 use_poim_tries=false;
01684 poim_tries.delete_trees(false);
01685 return w_result;
01686 }
01687
01688 float64_t* CWeightedDegreePositionStringKernel::compute_POIM(
01689 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
01690 float64_t* poim_result, int32_t num_suppvec, int32_t* IDX,
01691 float64_t* alphas, float64_t* distrib )
01692 {
01693 delete_optimization();
01694 use_poim_tries=true;
01695 poim_tries.delete_trees(false);
01696
01697
01698 ASSERT(position_weights_lhs==NULL);
01699 ASSERT(position_weights_rhs==NULL);
01700 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01701 ASSERT(num_feat>0);
01702 ASSERT(alphabet->get_alphabet()==DNA);
01703 ASSERT(max_degree!=0);
01704 ASSERT(distrib);
01705
01706
01707 static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
01708 const int32_t seqLen = num_feat;
01709 float64_t** subs;
01710 int32_t i;
01711 int32_t k;
01712
01713
01714
01715
01716
01717
01718
01719
01720
01721
01722
01723
01724 const int32_t debug = ( max_degree < 0 ) ? ( abs(max_degree) % 4 + 1 ) : 0;
01725 if( debug ) {
01726 max_degree = abs(max_degree) / 4;
01727 switch( debug ) {
01728 case 1: {
01729 printf( "POIM DEBUGGING: substring only (max order=%d)\n", max_degree );
01730 break;
01731 }
01732 case 2: {
01733 printf( "POIM DEBUGGING: superstring only (max order=%d)\n", max_degree );
01734 break;
01735 }
01736 case 3: {
01737 printf( "POIM DEBUGGING: left overlap only (max order=%d)\n", max_degree );
01738 break;
01739 }
01740 case 4: {
01741 printf( "POIM DEBUGGING: right overlap only (max order=%d)\n", max_degree );
01742 break;
01743 }
01744 default: {
01745 printf( "POIM DEBUGGING: something is wrong (max order=%d)\n", max_degree );
01746 ASSERT(0);
01747 break;
01748 }
01749 }
01750 }
01751
01752
01753 int32_t* offsets;
01754 int32_t offset;
01755 offsets = new int32_t[ max_degree ];
01756 offset = 0;
01757 for( k = 0; k < max_degree; ++k ) {
01758 offsets[k] = offset;
01759 const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
01760 const int32_t tabSize = nofsKmers * seqLen;
01761 offset += tabSize;
01762 }
01763
01764 const int32_t bigTabSize=offset;
01765 poim_result=new float64_t[bigTabSize];
01766 for (i=0; i<bigTabSize; ++i )
01767 poim_result[i]=0;
01768
01769
01770 subs=new float64_t*[max_degree];
01771 for (k=0; k<max_degree; ++k)
01772 subs[k]=&poim_result[offsets[k]];
01773
01774 delete[] offsets;
01775
01776
01777 init_optimization( num_suppvec, IDX, alphas, -1);
01778 poim_tries.POIMs_precalc_SLR( distrib );
01779
01780
01781 if( debug==0 || debug==1 ) {
01782 poim_tries.POIMs_extract_W( subs, max_degree );
01783 for( k = 1; k < max_degree; ++k ) {
01784 const int32_t nofKmers2 = ( k > 1 ) ? (int32_t) CMath::pow(NUM_SYMS,k-1) : 0;
01785 const int32_t nofKmers1 = (int32_t) CMath::pow( NUM_SYMS, k );
01786 const int32_t nofKmers0 = nofKmers1 * NUM_SYMS;
01787 for( i = 0; i < seqLen; ++i ) {
01788 float64_t* const subs_k2i1 = ( k>1 && i<seqLen-1 ) ? &subs[k-2][(i+1)*nofKmers2] : NULL;
01789 float64_t* const subs_k1i1 = ( i < seqLen-1 ) ? &subs[k-1][(i+1)*nofKmers1] : NULL;
01790 float64_t* const subs_k1i0 = & subs[ k-1 ][ i*nofKmers1 ];
01791 float64_t* const subs_k0i = & subs[ k-0 ][ i*nofKmers0 ];
01792 int32_t y0;
01793 for( y0 = 0; y0 < nofKmers0; ++y0 ) {
01794 const int32_t y1l = y0 / NUM_SYMS;
01795 const int32_t y1r = y0 % nofKmers1;
01796 const int32_t y2 = y1r / NUM_SYMS;
01797 subs_k0i[ y0 ] += subs_k1i0[ y1l ];
01798 if( i < seqLen-1 ) {
01799 subs_k0i[ y0 ] += subs_k1i1[ y1r ];
01800 if( k > 1 ) {
01801 subs_k0i[ y0 ] -= subs_k2i1[ y2 ];
01802 }
01803 }
01804 }
01805 }
01806 }
01807 }
01808
01809
01810 poim_tries.POIMs_add_SLR( subs, max_degree, debug );
01811
01812
01813 delete[] subs;
01814 num_feat = 1;
01815 num_sym = bigTabSize;
01816
01817 use_poim_tries=false;
01818 poim_tries.delete_trees(false);
01819
01820 return poim_result;
01821 }
01822
01823
01824 void CWeightedDegreePositionStringKernel::prepare_POIM2(
01825 float64_t* distrib, int32_t num_sym, int32_t num_feat)
01826 {
01827 free(m_poim_distrib);
01828 m_poim_distrib=(float64_t*)malloc(num_sym*num_feat*sizeof(float64_t));
01829 ASSERT(m_poim_distrib);
01830
01831 memcpy(m_poim_distrib, distrib, num_sym*num_feat*sizeof(float64_t));
01832 m_poim_num_sym=num_sym;
01833 m_poim_num_feat=num_feat;
01834 }
01835
01836 void CWeightedDegreePositionStringKernel::compute_POIM2(
01837 int32_t max_degree, CSVM* svm)
01838 {
01839 ASSERT(svm);
01840 int32_t num_suppvec=svm->get_num_support_vectors();
01841 int32_t* sv_idx=new int32_t[num_suppvec];
01842 float64_t* sv_weight=new float64_t[num_suppvec];
01843
01844 for (int32_t i=0; i<num_suppvec; i++)
01845 {
01846 sv_idx[i]=svm->get_support_vector(i);
01847 sv_weight[i]=svm->get_alpha(i);
01848 }
01849
01850 if ((max_degree < 1) || (max_degree > 12))
01851 {
01852
01853 SG_WARNING( "max_degree out of range 1..12 (%d). setting to 1.\n", max_degree);
01854 max_degree=1;
01855 }
01856
01857 int32_t num_feat = m_poim_num_feat;
01858 int32_t num_sym = m_poim_num_sym;
01859 free(m_poim);
01860
01861 m_poim = compute_POIM(max_degree, num_feat, num_sym, NULL, num_suppvec, sv_idx,
01862 sv_weight, m_poim_distrib);
01863
01864 ASSERT(num_feat==1);
01865 m_poim_result_len=num_sym;
01866
01867 delete[] sv_weight;
01868 delete[] sv_idx;
01869 }
01870
01871 void CWeightedDegreePositionStringKernel::get_POIM2(
01872 float64_t** poim, int32_t* result_len)
01873 {
01874 *poim=(float64_t*) malloc(m_poim_result_len*sizeof(float64_t));
01875 ASSERT(*poim);
01876 memcpy(*poim, m_poim, m_poim_result_len*sizeof(float64_t)) ;
01877 *result_len=m_poim_result_len ;
01878 }
01879
01880 void CWeightedDegreePositionStringKernel::cleanup_POIM2()
01881 {
01882 free(m_poim) ;
01883 m_poim=NULL ;
01884 free(m_poim_distrib) ;
01885 m_poim_distrib=NULL ;
01886 m_poim_num_sym=0 ;
01887 m_poim_num_sym=0 ;
01888 m_poim_result_len=0 ;
01889 }
01890
01891 void CWeightedDegreePositionStringKernel::load_serializable_post(void) throw (ShogunException)
01892 {
01893 CKernel::load_serializable_post();
01894
01895 tries=CTrie<DNATrie>(degree);
01896 poim_tries=CTrie<POIMTrie>(degree);
01897
01898 if (weights)
01899 init_block_weights();
01900 }
01901
01902 void CWeightedDegreePositionStringKernel::init()
01903 {
01904 weights=NULL;
01905 position_weights=NULL;
01906 position_weights_len=0;
01907
01908 position_weights_lhs=NULL;
01909 position_weights_lhs_len=0;
01910 position_weights_rhs=NULL;
01911 position_weights_rhs_len=0;
01912
01913 weights_buffer=NULL;
01914 mkl_stepsize=1;
01915 degree=1;
01916 length=0;
01917
01918 max_shift=0;
01919 max_mismatch=0;
01920 seq_length=0;
01921 shift=NULL;
01922 shift_len=0;
01923
01924 block_weights=NULL;
01925 block_computation=true;
01926 type=E_EXTERNAL;
01927 which_degree=-1;
01928 tries=CTrie<DNATrie>(1);
01929 poim_tries=CTrie<POIMTrie>(1);
01930
01931 tree_initialized=false;
01932 use_poim_tries=false;
01933 m_poim_distrib=NULL;
01934
01935 m_poim=NULL;
01936 m_poim_num_sym=0;
01937 m_poim_num_feat=0;
01938 m_poim_result_len=0;
01939
01940 alphabet=NULL;
01941
01942 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
01943
01944 set_normalizer(new CSqrtDiagKernelNormalizer());
01945
01946 m_parameters->add_matrix(&weights, &weights_degree, &weights_length,
01947 "weights", "WD Kernel weights.");
01948 m_parameters->add_vector(&position_weights, &position_weights_len,
01949 "position_weights",
01950 "Weights per position.");
01951 m_parameters->add_vector(&position_weights_lhs, &position_weights_lhs_len,
01952 "position_weights_lhs",
01953 "Weights per position left hand side.");
01954 m_parameters->add_vector(&position_weights_rhs, &position_weights_rhs_len,
01955 "position_weights_rhs",
01956 "Weights per position right hand side.");
01957 m_parameters->add_vector(&shift, &shift_len,
01958 "shift",
01959 "Shift Vector.");
01960 m_parameters->add(&mkl_stepsize, "mkl_stepsize", "MKL step size.");
01961 m_parameters->add(°ree, "degree", "Order of WD kernel.");
01962 m_parameters->add(&max_mismatch, "max_mismatch",
01963 "Number of allowed mismatches.");
01964 m_parameters->add(&block_computation, "block_computation",
01965 "If block computation shall be used.");
01966 m_parameters->add((machine_int_t*) &type, "type",
01967 "WeightedDegree kernel type.");
01968 m_parameters->add(&which_degree, "which_degree",
01969 "Unqueal -1 if just a single degree is selected.");
01970 m_parameters->add((CSGObject**) &alphabet, "alphabet",
01971 "Alphabet of Features.");
01972 }