00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/lib/common.h>
00013 #include <shogun/io/SGIO.h>
00014 #include <shogun/lib/Signal.h>
00015 #include <shogun/lib/Trie.h>
00016 #include <shogun/base/Parallel.h>
00017
00018 #include <shogun/kernel/WeightedDegreePositionStringKernel.h>
00019 #include <shogun/kernel/SqrtDiagKernelNormalizer.h>
00020 #include <shogun/features/Features.h>
00021 #include <shogun/features/StringFeatures.h>
00022
00023 #include <shogun/classifier/svm/SVM.h>
00024
00025 #ifdef HAVE_PTHREAD
00026 #include <pthread.h>
00027 #endif
00028
00029 using namespace shogun;
00030
00031 #define TRIES(X) ((use_poim_tries) ? (poim_tries.X) : (tries.X))
00032
00033 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00034 template <class Trie> struct S_THREAD_PARAM
00035 {
00036 int32_t* vec;
00037 float64_t* result;
00038 float64_t* weights;
00039 CWeightedDegreePositionStringKernel* kernel;
00040 CTrie<Trie>* tries;
00041 float64_t factor;
00042 int32_t j;
00043 int32_t start;
00044 int32_t end;
00045 int32_t length;
00046 int32_t max_shift;
00047 int32_t* shift;
00048 int32_t* vec_idx;
00049 };
00050 #endif // DOXYGEN_SHOULD_SKIP_THIS
00051
00052 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00053 void)
00054 : CStringKernel<char>()
00055 {
00056 init();
00057 }
00058
00059 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00060 int32_t size, int32_t d, int32_t mm, int32_t mkls)
00061 : CStringKernel<char>(size)
00062 {
00063 init();
00064
00065 mkl_stepsize=mkls;
00066 degree=d;
00067 max_mismatch=mm;
00068
00069 tries=CTrie<DNATrie>(d);
00070 poim_tries=CTrie<POIMTrie>(d);
00071
00072 set_wd_weights();
00073 ASSERT(weights);
00074 }
00075
00076 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00077 int32_t size, float64_t* w, int32_t d, int32_t mm, int32_t* s, int32_t sl,
00078 int32_t mkls)
00079 : CStringKernel<char>(size)
00080 {
00081 init();
00082
00083 mkl_stepsize=mkls;
00084 degree=d;
00085 max_mismatch=mm;
00086
00087 tries=CTrie<DNATrie>(d);
00088 poim_tries=CTrie<POIMTrie>(d);
00089
00090 weights=SG_MALLOC(float64_t, d*(1+max_mismatch));
00091 weights_degree=degree;
00092 weights_length=(1+max_mismatch);
00093
00094 for (int32_t i=0; i<d*(1+max_mismatch); i++)
00095 weights[i]=w[i];
00096
00097 set_shifts(SGVector<int32_t>(s, sl));
00098 }
00099
00100 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00101 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t d)
00102 : CStringKernel<char>()
00103 {
00104 init();
00105
00106 mkl_stepsize=1;
00107 degree=d;
00108
00109 tries=CTrie<DNATrie>(d);
00110 poim_tries=CTrie<POIMTrie>(d);
00111
00112 set_wd_weights();
00113 ASSERT(weights);
00114
00115 init(l, r);
00116 }
00117
00118
00119 CWeightedDegreePositionStringKernel::~CWeightedDegreePositionStringKernel()
00120 {
00121 cleanup();
00122 cleanup_POIM2();
00123
00124 SG_FREE(shift);
00125 shift=NULL;
00126
00127 SG_FREE(weights);
00128 weights=NULL;
00129 weights_degree=0;
00130 weights_length=0;
00131
00132 SG_FREE(block_weights);
00133 block_weights=NULL;
00134
00135 SG_FREE(position_weights);
00136 position_weights=NULL;
00137
00138 SG_FREE(position_weights_lhs);
00139 position_weights_lhs=NULL;
00140
00141 SG_FREE(position_weights_rhs);
00142 position_weights_rhs=NULL;
00143
00144 SG_FREE(weights_buffer);
00145 weights_buffer=NULL;
00146 }
00147
00148 void CWeightedDegreePositionStringKernel::remove_lhs()
00149 {
00150 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00151 delete_optimization();
00152
00153 tries.destroy();
00154 poim_tries.destroy();
00155
00156 CKernel::remove_lhs();
00157 }
00158
00159 void CWeightedDegreePositionStringKernel::create_empty_tries()
00160 {
00161 ASSERT(lhs);
00162 seq_length = ((CStringFeatures<char>*) lhs)->get_max_vector_length();
00163
00164 if (opt_type==SLOWBUTMEMEFFICIENT)
00165 {
00166 tries.create(seq_length, true);
00167 poim_tries.create(seq_length, true);
00168 }
00169 else if (opt_type==FASTBUTMEMHUNGRY)
00170 {
00171 tries.create(seq_length, false);
00172 poim_tries.create(seq_length, false);
00173 }
00174 else
00175 SG_ERROR( "unknown optimization type\n");
00176 }
00177
00178 bool CWeightedDegreePositionStringKernel::init(CFeatures* l, CFeatures* r)
00179 {
00180 int32_t lhs_changed = (lhs!=l) ;
00181 int32_t rhs_changed = (rhs!=r) ;
00182
00183 CStringKernel<char>::init(l,r);
00184
00185 SG_DEBUG( "lhs_changed: %i\n", lhs_changed) ;
00186 SG_DEBUG( "rhs_changed: %i\n", rhs_changed) ;
00187
00188 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00189 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00190
00191
00192 if (shift_len==0) {
00193 shift_len=sf_l->get_vector_length(0);
00194 int32_t *shifts=SG_MALLOC(int32_t, shift_len);
00195 for (int32_t i=0; i<shift_len; i++) {
00196 shifts[i]=1;
00197 }
00198 set_shifts(SGVector<int32_t>(shifts, shift_len));
00199 SG_FREE(shifts);
00200 }
00201
00202
00203 int32_t len=sf_l->get_max_vector_length();
00204 if (lhs_changed && !sf_l->have_same_length(len))
00205 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00206
00207 if (rhs_changed && !sf_r->have_same_length(len))
00208 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00209
00210 SG_UNREF(alphabet);
00211 alphabet= sf_l->get_alphabet();
00212 CAlphabet* ralphabet=sf_r->get_alphabet();
00213
00214 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00215 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00216
00217 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00218 SG_UNREF(ralphabet);
00219
00220
00221 create_empty_tries();
00222 init_block_weights();
00223
00224 return init_normalizer();
00225 }
00226
00227 void CWeightedDegreePositionStringKernel::cleanup()
00228 {
00229 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00230 delete_optimization();
00231
00232 SG_FREE(block_weights);
00233 block_weights=NULL;
00234
00235 tries.destroy();
00236 poim_tries.destroy();
00237
00238 seq_length = 0;
00239 tree_initialized = false;
00240
00241 SG_UNREF(alphabet);
00242 alphabet=NULL;
00243
00244 CKernel::cleanup();
00245 }
00246
00247 bool CWeightedDegreePositionStringKernel::init_optimization(
00248 int32_t p_count, int32_t * IDX, float64_t * alphas, int32_t tree_num,
00249 int32_t upto_tree)
00250 {
00251 ASSERT(position_weights_lhs==NULL);
00252 ASSERT(position_weights_rhs==NULL);
00253
00254 if (upto_tree<0)
00255 upto_tree=tree_num;
00256
00257 if (max_mismatch!=0)
00258 {
00259 SG_ERROR( "CWeightedDegreePositionStringKernel optimization not implemented for mismatch!=0\n");
00260 return false ;
00261 }
00262
00263 if (tree_num<0)
00264 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00265
00266 delete_optimization();
00267
00268 if (tree_num<0)
00269 SG_DEBUG( "initializing CWeightedDegreePositionStringKernel optimization\n") ;
00270
00271 for (int32_t i=0; i<p_count; i++)
00272 {
00273 if (tree_num<0)
00274 {
00275 if ( (i % (p_count/10+1)) == 0)
00276 SG_PROGRESS(i,0,p_count);
00277 add_example_to_tree(IDX[i], alphas[i]);
00278 }
00279 else
00280 {
00281 for (int32_t t=tree_num; t<=upto_tree; t++)
00282 add_example_to_single_tree(IDX[i], alphas[i], t);
00283 }
00284 }
00285
00286 if (tree_num<0)
00287 SG_DONE();
00288
00289 set_is_initialized(true) ;
00290 return true ;
00291 }
00292
00293 bool CWeightedDegreePositionStringKernel::delete_optimization()
00294 {
00295 if ((opt_type==FASTBUTMEMHUNGRY) && (tries.get_use_compact_terminal_nodes()))
00296 {
00297 tries.set_use_compact_terminal_nodes(false) ;
00298 SG_DEBUG( "disabling compact trie nodes with FASTBUTMEMHUNGRY\n") ;
00299 }
00300
00301 if (get_is_initialized())
00302 {
00303 if (opt_type==SLOWBUTMEMEFFICIENT)
00304 tries.delete_trees(true);
00305 else if (opt_type==FASTBUTMEMHUNGRY)
00306 tries.delete_trees(false);
00307 else {
00308 SG_ERROR( "unknown optimization type\n");
00309 }
00310 set_is_initialized(false);
00311
00312 return true;
00313 }
00314
00315 return false;
00316 }
00317
00318 float64_t CWeightedDegreePositionStringKernel::compute_with_mismatch(
00319 char* avec, int32_t alen, char* bvec, int32_t blen)
00320 {
00321 float64_t* max_shift_vec= SG_MALLOC(float64_t, max_shift);
00322 float64_t sum0=0 ;
00323 for (int32_t i=0; i<max_shift; i++)
00324 max_shift_vec[i]=0 ;
00325
00326
00327 for (int32_t i=0; i<alen; i++)
00328 {
00329 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00330 continue ;
00331
00332 int32_t mismatches=0;
00333 float64_t sumi = 0.0 ;
00334 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00335 {
00336 if (avec[i+j]!=bvec[i+j])
00337 {
00338 mismatches++ ;
00339 if (mismatches>max_mismatch)
00340 break ;
00341 } ;
00342 sumi += weights[j+degree*mismatches];
00343 }
00344 if (position_weights!=NULL)
00345 sum0 += position_weights[i]*sumi ;
00346 else
00347 sum0 += sumi ;
00348 } ;
00349
00350 for (int32_t i=0; i<alen; i++)
00351 {
00352 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00353 {
00354 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00355 continue ;
00356
00357 float64_t sumi1 = 0.0 ;
00358
00359 int32_t mismatches=0;
00360 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00361 {
00362 if (avec[i+j+k]!=bvec[i+j])
00363 {
00364 mismatches++ ;
00365 if (mismatches>max_mismatch)
00366 break ;
00367 } ;
00368 sumi1 += weights[j+degree*mismatches];
00369 }
00370 float64_t sumi2 = 0.0 ;
00371
00372 mismatches=0;
00373 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00374 {
00375 if (avec[i+j]!=bvec[i+j+k])
00376 {
00377 mismatches++ ;
00378 if (mismatches>max_mismatch)
00379 break ;
00380 } ;
00381 sumi2 += weights[j+degree*mismatches];
00382 }
00383 if (position_weights!=NULL)
00384 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00385 else
00386 max_shift_vec[k-1] += sumi1 + sumi2 ;
00387 } ;
00388 }
00389
00390 float64_t result = sum0 ;
00391 for (int32_t i=0; i<max_shift; i++)
00392 result += max_shift_vec[i]/(2*(i+1)) ;
00393
00394 SG_FREE(max_shift_vec);
00395 return result ;
00396 }
00397
00398 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch(
00399 char* avec, int32_t alen, char* bvec, int32_t blen)
00400 {
00401 float64_t* max_shift_vec = SG_MALLOC(float64_t, max_shift);
00402 float64_t sum0=0 ;
00403 for (int32_t i=0; i<max_shift; i++)
00404 max_shift_vec[i]=0 ;
00405
00406
00407 for (int32_t i=0; i<alen; i++)
00408 {
00409 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00410 continue ;
00411
00412 float64_t sumi = 0.0 ;
00413 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00414 {
00415 if (avec[i+j]!=bvec[i+j])
00416 break ;
00417 sumi += weights[j];
00418 }
00419 if (position_weights!=NULL)
00420 sum0 += position_weights[i]*sumi ;
00421 else
00422 sum0 += sumi ;
00423 } ;
00424
00425 for (int32_t i=0; i<alen; i++)
00426 {
00427 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00428 {
00429 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00430 continue ;
00431
00432 float64_t sumi1 = 0.0 ;
00433
00434 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00435 {
00436 if (avec[i+j+k]!=bvec[i+j])
00437 break ;
00438 sumi1 += weights[j];
00439 }
00440 float64_t sumi2 = 0.0 ;
00441
00442 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00443 {
00444 if (avec[i+j]!=bvec[i+j+k])
00445 break ;
00446 sumi2 += weights[j];
00447 }
00448 if (position_weights!=NULL)
00449 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00450 else
00451 max_shift_vec[k-1] += sumi1 + sumi2 ;
00452 } ;
00453 }
00454
00455 float64_t result = sum0 ;
00456 for (int32_t i=0; i<max_shift; i++)
00457 result += max_shift_vec[i]/(2*(i+1)) ;
00458
00459 SG_FREE(max_shift_vec);
00460
00461 return result ;
00462 }
00463
00464 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch_matrix(
00465 char* avec, int32_t alen, char* bvec, int32_t blen)
00466 {
00467 float64_t* max_shift_vec = SG_MALLOC(float64_t, max_shift);
00468 float64_t sum0=0 ;
00469 for (int32_t i=0; i<max_shift; i++)
00470 max_shift_vec[i]=0 ;
00471
00472
00473 for (int32_t i=0; i<alen; i++)
00474 {
00475 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00476 continue ;
00477 float64_t sumi = 0.0 ;
00478 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00479 {
00480 if (avec[i+j]!=bvec[i+j])
00481 break ;
00482 sumi += weights[i*degree+j];
00483 }
00484 if (position_weights!=NULL)
00485 sum0 += position_weights[i]*sumi ;
00486 else
00487 sum0 += sumi ;
00488 } ;
00489
00490 for (int32_t i=0; i<alen; i++)
00491 {
00492 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00493 {
00494 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00495 continue ;
00496
00497 float64_t sumi1 = 0.0 ;
00498
00499 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00500 {
00501 if (avec[i+j+k]!=bvec[i+j])
00502 break ;
00503 sumi1 += weights[i*degree+j];
00504 }
00505 float64_t sumi2 = 0.0 ;
00506
00507 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00508 {
00509 if (avec[i+j]!=bvec[i+j+k])
00510 break ;
00511 sumi2 += weights[i*degree+j];
00512 }
00513 if (position_weights!=NULL)
00514 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00515 else
00516 max_shift_vec[k-1] += sumi1 + sumi2 ;
00517 } ;
00518 }
00519
00520 float64_t result = sum0 ;
00521 for (int32_t i=0; i<max_shift; i++)
00522 result += max_shift_vec[i]/(2*(i+1)) ;
00523
00524 SG_FREE(max_shift_vec);
00525 return result ;
00526 }
00527
00528 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch_position_weights(
00529 char* avec, float64_t* pos_weights_lhs, int32_t alen, char* bvec,
00530 float64_t* pos_weights_rhs, int32_t blen)
00531 {
00532 float64_t* max_shift_vec = SG_MALLOC(float64_t, max_shift);
00533 float64_t sum0=0 ;
00534 for (int32_t i=0; i<max_shift; i++)
00535 max_shift_vec[i]=0 ;
00536
00537
00538 for (int32_t i=0; i<alen; i++)
00539 {
00540 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00541 continue ;
00542
00543 float64_t sumi = 0.0 ;
00544 float64_t posweight_lhs = 0.0 ;
00545 float64_t posweight_rhs = 0.0 ;
00546 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00547 {
00548 posweight_lhs += pos_weights_lhs[i+j] ;
00549 posweight_rhs += pos_weights_rhs[i+j] ;
00550
00551 if (avec[i+j]!=bvec[i+j])
00552 break ;
00553 sumi += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00554 }
00555 if (position_weights!=NULL)
00556 sum0 += position_weights[i]*sumi ;
00557 else
00558 sum0 += sumi ;
00559 } ;
00560
00561 for (int32_t i=0; i<alen; i++)
00562 {
00563 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00564 {
00565 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00566 continue ;
00567
00568
00569 float64_t sumi1 = 0.0 ;
00570 float64_t posweight_lhs = 0.0 ;
00571 float64_t posweight_rhs = 0.0 ;
00572 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00573 {
00574 posweight_lhs += pos_weights_lhs[i+j+k] ;
00575 posweight_rhs += pos_weights_rhs[i+j] ;
00576 if (avec[i+j+k]!=bvec[i+j])
00577 break ;
00578 sumi1 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00579 }
00580
00581 float64_t sumi2 = 0.0 ;
00582 posweight_lhs = 0.0 ;
00583 posweight_rhs = 0.0 ;
00584 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00585 {
00586 posweight_lhs += pos_weights_lhs[i+j] ;
00587 posweight_rhs += pos_weights_rhs[i+j+k] ;
00588 if (avec[i+j]!=bvec[i+j+k])
00589 break ;
00590 sumi2 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00591 }
00592 if (position_weights!=NULL)
00593 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00594 else
00595 max_shift_vec[k-1] += sumi1 + sumi2 ;
00596 } ;
00597 }
00598
00599 float64_t result = sum0 ;
00600 for (int32_t i=0; i<max_shift; i++)
00601 result += max_shift_vec[i]/(2*(i+1)) ;
00602
00603 SG_FREE(max_shift_vec);
00604 return result ;
00605 }
00606
00607
00608 float64_t CWeightedDegreePositionStringKernel::compute(
00609 int32_t idx_a, int32_t idx_b)
00610 {
00611 int32_t alen, blen;
00612 bool free_avec, free_bvec;
00613
00614 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00615 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00616
00617 ASSERT(alen==blen);
00618 ASSERT(shift_len==alen);
00619
00620 float64_t result = 0 ;
00621 if (position_weights_lhs!=NULL || position_weights_rhs!=NULL)
00622 {
00623 ASSERT(max_mismatch==0);
00624 float64_t* position_weights_rhs_ = position_weights_rhs ;
00625 if (lhs==rhs)
00626 position_weights_rhs_ = position_weights_lhs ;
00627
00628 result = compute_without_mismatch_position_weights(avec, &position_weights_lhs[idx_a*alen], alen, bvec, &position_weights_rhs_[idx_b*blen], blen) ;
00629 }
00630 else if (max_mismatch > 0)
00631 result = compute_with_mismatch(avec, alen, bvec, blen) ;
00632 else if (length==0)
00633 result = compute_without_mismatch(avec, alen, bvec, blen) ;
00634 else
00635 result = compute_without_mismatch_matrix(avec, alen, bvec, blen) ;
00636
00637 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00638 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00639
00640 return result ;
00641 }
00642
00643
00644 void CWeightedDegreePositionStringKernel::add_example_to_tree(
00645 int32_t idx, float64_t alpha)
00646 {
00647 ASSERT(position_weights_lhs==NULL);
00648 ASSERT(position_weights_rhs==NULL);
00649 ASSERT(alphabet);
00650 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00651
00652 int32_t len=0;
00653 bool free_vec;
00654 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00655 ASSERT(max_mismatch==0);
00656 int32_t *vec = SG_MALLOC(int32_t, len);
00657
00658 for (int32_t i=0; i<len; i++)
00659 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00660 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00661
00662 if (opt_type==FASTBUTMEMHUNGRY)
00663 {
00664
00665 ASSERT(!TRIES(get_use_compact_terminal_nodes()));
00666 }
00667
00668 for (int32_t i=0; i<len; i++)
00669 {
00670 int32_t max_s=-1;
00671
00672 if (opt_type==SLOWBUTMEMEFFICIENT)
00673 max_s=0;
00674 else if (opt_type==FASTBUTMEMHUNGRY)
00675 max_s=shift[i];
00676 else {
00677 SG_ERROR( "unknown optimization type\n");
00678 }
00679
00680 for (int32_t s=max_s; s>=0; s--)
00681 {
00682 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00683 TRIES(add_to_trie(i, s, vec, alpha_pw, weights, (length!=0))) ;
00684 if ((s==0) || (i+s>=len))
00685 continue;
00686
00687 TRIES(add_to_trie(i+s, -s, vec, alpha_pw, weights, (length!=0))) ;
00688 }
00689 }
00690
00691 SG_FREE(vec);
00692 tree_initialized=true ;
00693 }
00694
00695 void CWeightedDegreePositionStringKernel::add_example_to_single_tree(
00696 int32_t idx, float64_t alpha, int32_t tree_num)
00697 {
00698 ASSERT(position_weights_lhs==NULL);
00699 ASSERT(position_weights_rhs==NULL);
00700 ASSERT(alphabet);
00701 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00702
00703 int32_t len=0;
00704 bool free_vec;
00705 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00706 ASSERT(max_mismatch==0);
00707 int32_t *vec=SG_MALLOC(int32_t, len);
00708 int32_t max_s=-1;
00709
00710 if (opt_type==SLOWBUTMEMEFFICIENT)
00711 max_s=0;
00712 else if (opt_type==FASTBUTMEMHUNGRY)
00713 {
00714 ASSERT(!tries.get_use_compact_terminal_nodes());
00715 max_s=shift[tree_num];
00716 }
00717 else {
00718 SG_ERROR( "unknown optimization type\n");
00719 }
00720 for (int32_t i=CMath::max(0,tree_num-max_shift);
00721 i<CMath::min(len,tree_num+degree+max_shift); i++)
00722 {
00723 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00724 }
00725 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00726
00727 for (int32_t s=max_s; s>=0; s--)
00728 {
00729 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00730 tries.add_to_trie(tree_num, s, vec, alpha_pw, weights, (length!=0)) ;
00731 }
00732
00733 if (opt_type==FASTBUTMEMHUNGRY)
00734 {
00735 for (int32_t i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+max_shift+1); i++)
00736 {
00737 int32_t s=tree_num-i;
00738 if ((i+s<len) && (s>=1) && (s<=shift[i]))
00739 {
00740 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00741 tries.add_to_trie(tree_num, -s, vec, alpha_pw, weights, (length!=0)) ;
00742 }
00743 }
00744 }
00745 SG_FREE(vec);
00746 tree_initialized=true ;
00747 }
00748
00749 float64_t CWeightedDegreePositionStringKernel::compute_by_tree(int32_t idx)
00750 {
00751 ASSERT(position_weights_lhs==NULL);
00752 ASSERT(position_weights_rhs==NULL);
00753 ASSERT(alphabet);
00754 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00755
00756 float64_t sum=0;
00757 int32_t len=0;
00758 bool free_vec;
00759 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00760 ASSERT(max_mismatch==0);
00761 int32_t *vec=SG_MALLOC(int32_t, len);
00762
00763 for (int32_t i=0; i<len; i++)
00764 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00765
00766 ((CStringFeatures<char>*) rhs)->free_feature_vector(char_vec, idx, free_vec);
00767
00768 for (int32_t i=0; i<len; i++)
00769 sum += tries.compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)) ;
00770
00771 if (opt_type==SLOWBUTMEMEFFICIENT)
00772 {
00773 for (int32_t i=0; i<len; i++)
00774 {
00775 for (int32_t s=1; (s<=shift[i]) && (i+s<len); s++)
00776 {
00777 sum+=tries.compute_by_tree_helper(vec, len, i, i+s, i, weights, (length!=0))/(2*s) ;
00778 sum+=tries.compute_by_tree_helper(vec, len, i+s, i, i+s, weights, (length!=0))/(2*s) ;
00779 }
00780 }
00781 }
00782
00783 SG_FREE(vec);
00784
00785 return normalizer->normalize_rhs(sum, idx);
00786 }
00787
00788 void CWeightedDegreePositionStringKernel::compute_by_tree(
00789 int32_t idx, float64_t* LevelContrib)
00790 {
00791 ASSERT(position_weights_lhs==NULL);
00792 ASSERT(position_weights_rhs==NULL);
00793 ASSERT(alphabet);
00794 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00795
00796 int32_t len=0;
00797 bool free_vec;
00798 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00799 ASSERT(max_mismatch==0);
00800 int32_t *vec=SG_MALLOC(int32_t, len);
00801
00802 for (int32_t i=0; i<len; i++)
00803 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00804
00805 ((CStringFeatures<char>*) rhs)->free_feature_vector(char_vec, idx, free_vec);
00806
00807 for (int32_t i=0; i<len; i++)
00808 {
00809 tries.compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00810 normalizer->normalize_rhs(1.0, idx), mkl_stepsize, weights,
00811 (length!=0));
00812 }
00813
00814 if (opt_type==SLOWBUTMEMEFFICIENT)
00815 {
00816 for (int32_t i=0; i<len; i++)
00817 for (int32_t k=1; (k<=shift[i]) && (i+k<len); k++)
00818 {
00819 tries.compute_by_tree_helper(vec, len, i, i+k, i, LevelContrib,
00820 normalizer->normalize_rhs(1.0/(2*k), idx), mkl_stepsize,
00821 weights, (length!=0)) ;
00822 tries.compute_by_tree_helper(vec, len, i+k, i, i+k,
00823 LevelContrib, normalizer->normalize_rhs(1.0/(2*k), idx),
00824 mkl_stepsize, weights, (length!=0)) ;
00825 }
00826 }
00827
00828 SG_FREE(vec);
00829 }
00830
00831 float64_t* CWeightedDegreePositionStringKernel::compute_abs_weights(
00832 int32_t &len)
00833 {
00834 return tries.compute_abs_weights(len);
00835 }
00836
00837 void CWeightedDegreePositionStringKernel::set_shifts(SGVector<int32_t> shifts)
00838 {
00839 SG_FREE(shift);
00840
00841 shift_len = shifts.vlen;
00842 shift = SG_MALLOC(int32_t, shift_len);
00843
00844 if (shift)
00845 {
00846 max_shift = 0 ;
00847
00848 for (int32_t i=0; i<shift_len; i++)
00849 {
00850 shift[i] = shifts.vector[i] ;
00851 max_shift = CMath::max(shift[i], max_shift);
00852 }
00853
00854 ASSERT(max_shift>=0 && max_shift<=shift_len);
00855 }
00856 }
00857
00858 bool CWeightedDegreePositionStringKernel::set_wd_weights()
00859 {
00860 ASSERT(degree>0);
00861
00862 SG_FREE(weights);
00863 weights=SG_MALLOC(float64_t, degree);
00864 weights_degree=degree;
00865 weights_length=1;
00866
00867 if (weights)
00868 {
00869 int32_t i;
00870 float64_t sum=0;
00871 for (i=0; i<degree; i++)
00872 {
00873 weights[i]=degree-i;
00874 sum+=weights[i];
00875 }
00876 for (i=0; i<degree; i++)
00877 weights[i]/=sum;
00878
00879 for (i=0; i<degree; i++)
00880 {
00881 for (int32_t j=1; j<=max_mismatch; j++)
00882 {
00883 if (j<i+1)
00884 {
00885 int32_t nk=CMath::nchoosek(i+1, j);
00886 weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00887 }
00888 else
00889 weights[i+j*degree]= 0;
00890 }
00891 }
00892
00893 return true;
00894 }
00895 else
00896 return false;
00897 }
00898
00899 bool CWeightedDegreePositionStringKernel::set_weights(SGMatrix<float64_t> new_weights)
00900 {
00901 float64_t* ws=new_weights.matrix;
00902 int32_t d=new_weights.num_rows;
00903 int32_t len=new_weights.num_cols;
00904
00905 if (d!=degree || len<0)
00906 SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree);
00907
00908 degree=d;
00909 length=len;
00910
00911 if (len <= 0)
00912 len=1;
00913
00914 weights_degree=degree;
00915 weights_length=len+max_mismatch;
00916
00917 SG_DEBUG("Creating weights of size %dx%d\n", weights_degree, weights_length);
00918 int32_t num_weights=weights_degree*weights_length;
00919 SG_FREE(weights);
00920 weights=SG_MALLOC(float64_t, num_weights);
00921
00922 for (int32_t i=0; i<degree*len; i++)
00923 weights[i]=ws[i];
00924
00925 return true;
00926 }
00927
00928 void CWeightedDegreePositionStringKernel::set_position_weights(SGVector<float64_t> pws)
00929 {
00930 if (seq_length==0)
00931 seq_length=pws.vlen;
00932
00933 if (seq_length!=pws.vlen)
00934 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, pws.vlen);
00935
00936 SG_FREE(position_weights);
00937 position_weights=SG_MALLOC(float64_t, pws.vlen);
00938 position_weights_len=pws.vlen;
00939 tries.set_position_weights(position_weights);
00940
00941 for (int32_t i=0; i<pws.vlen; i++)
00942 position_weights[i]=pws.vector[i];
00943 }
00944
00945 bool CWeightedDegreePositionStringKernel::set_position_weights_lhs(float64_t* pws, int32_t len, int32_t num)
00946 {
00947 if (position_weights_rhs==position_weights_lhs)
00948 position_weights_rhs=NULL;
00949 else
00950 delete_position_weights_rhs();
00951
00952 if (len==0)
00953 {
00954 return delete_position_weights_lhs();
00955 }
00956
00957 if (seq_length!=len)
00958 {
00959 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00960 return false;
00961 }
00962
00963 SG_FREE(position_weights_lhs);
00964 position_weights_lhs=SG_MALLOC(float64_t, len*num);
00965 position_weights_lhs_len=len*num;
00966
00967 for (int32_t i=0; i<len*num; i++)
00968 position_weights_lhs[i]=pws[i];
00969
00970 return true;
00971 }
00972
00973 bool CWeightedDegreePositionStringKernel::set_position_weights_rhs(
00974 float64_t* pws, int32_t len, int32_t num)
00975 {
00976 if (len==0)
00977 {
00978 if (position_weights_rhs==position_weights_lhs)
00979 {
00980 position_weights_rhs=NULL;
00981 return true;
00982 }
00983 return delete_position_weights_rhs();
00984 }
00985
00986 if (seq_length!=len)
00987 {
00988 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00989 return false;
00990 }
00991
00992 SG_FREE(position_weights_rhs);
00993 position_weights_rhs=SG_MALLOC(float64_t, len*num);
00994 position_weights_rhs_len=len*num;
00995
00996 for (int32_t i=0; i<len*num; i++)
00997 position_weights_rhs[i]=pws[i];
00998
00999 return true;
01000 }
01001
01002 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd()
01003 {
01004 SG_FREE(block_weights);
01005 block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree));
01006
01007 if (block_weights)
01008 {
01009 int32_t k;
01010 float64_t d=degree;
01011
01012 for (k=0; k<degree; k++)
01013 block_weights[k]=
01014 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
01015 for (k=degree; k<seq_length; k++)
01016 block_weights[k]=(-d+3*k+4)/3;
01017 }
01018
01019 return (block_weights!=NULL);
01020 }
01021
01022 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd_external()
01023 {
01024 ASSERT(weights);
01025 SG_FREE(block_weights);
01026 block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree));
01027
01028 if (block_weights)
01029 {
01030 int32_t i=0;
01031 block_weights[0]=weights[0];
01032 for (i=1; i<CMath::max(seq_length,degree); i++)
01033 block_weights[i]=0;
01034
01035 for (i=1; i<CMath::max(seq_length,degree); i++)
01036 {
01037 block_weights[i]=block_weights[i-1];
01038
01039 float64_t contrib=0;
01040 for (int32_t j=0; j<CMath::min(degree,i+1); j++)
01041 contrib+=weights[j];
01042
01043 block_weights[i]+=contrib;
01044 }
01045 }
01046
01047 return (block_weights!=NULL);
01048 }
01049
01050 bool CWeightedDegreePositionStringKernel::init_block_weights_const()
01051 {
01052 SG_FREE(block_weights);
01053 block_weights=SG_MALLOC(float64_t, seq_length);
01054
01055 if (block_weights)
01056 {
01057 for (int32_t i=1; i<seq_length+1 ; i++)
01058 block_weights[i-1]=1.0/seq_length;
01059 }
01060
01061 return (block_weights!=NULL);
01062 }
01063
01064 bool CWeightedDegreePositionStringKernel::init_block_weights_linear()
01065 {
01066 SG_FREE(block_weights);
01067 block_weights=SG_MALLOC(float64_t, seq_length);
01068
01069 if (block_weights)
01070 {
01071 for (int32_t i=1; i<seq_length+1 ; i++)
01072 block_weights[i-1]=degree*i;
01073 }
01074
01075 return (block_weights!=NULL);
01076 }
01077
01078 bool CWeightedDegreePositionStringKernel::init_block_weights_sqpoly()
01079 {
01080 SG_FREE(block_weights);
01081 block_weights=SG_MALLOC(float64_t, seq_length);
01082
01083 if (block_weights)
01084 {
01085 for (int32_t i=1; i<degree+1 ; i++)
01086 block_weights[i-1]=((float64_t) i)*i;
01087
01088 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01089 block_weights[i-1]=i;
01090 }
01091
01092 return (block_weights!=NULL);
01093 }
01094
01095 bool CWeightedDegreePositionStringKernel::init_block_weights_cubicpoly()
01096 {
01097 SG_FREE(block_weights);
01098 block_weights=SG_MALLOC(float64_t, seq_length);
01099
01100 if (block_weights)
01101 {
01102 for (int32_t i=1; i<degree+1 ; i++)
01103 block_weights[i-1]=((float64_t) i)*i*i;
01104
01105 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01106 block_weights[i-1]=i;
01107 }
01108
01109 return (block_weights!=NULL);
01110 }
01111
01112 bool CWeightedDegreePositionStringKernel::init_block_weights_exp()
01113 {
01114 SG_FREE(block_weights);
01115 block_weights=SG_MALLOC(float64_t, seq_length);
01116
01117 if (block_weights)
01118 {
01119 for (int32_t i=1; i<degree+1 ; i++)
01120 block_weights[i-1]=exp(((float64_t) i/10.0));
01121
01122 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01123 block_weights[i-1]=i;
01124 }
01125
01126 return (block_weights!=NULL);
01127 }
01128
01129 bool CWeightedDegreePositionStringKernel::init_block_weights_log()
01130 {
01131 SG_FREE(block_weights);
01132 block_weights=SG_MALLOC(float64_t, seq_length);
01133
01134 if (block_weights)
01135 {
01136 for (int32_t i=1; i<degree+1 ; i++)
01137 block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
01138
01139 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01140 block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
01141 }
01142
01143 return (block_weights!=NULL);
01144 }
01145
01146 bool CWeightedDegreePositionStringKernel::init_block_weights()
01147 {
01148 switch (type)
01149 {
01150 case E_WD:
01151 return init_block_weights_from_wd();
01152 case E_EXTERNAL:
01153 return init_block_weights_from_wd_external();
01154 case E_BLOCK_CONST:
01155 return init_block_weights_const();
01156 case E_BLOCK_LINEAR:
01157 return init_block_weights_linear();
01158 case E_BLOCK_SQPOLY:
01159 return init_block_weights_sqpoly();
01160 case E_BLOCK_CUBICPOLY:
01161 return init_block_weights_cubicpoly();
01162 case E_BLOCK_EXP:
01163 return init_block_weights_exp();
01164 case E_BLOCK_LOG:
01165 return init_block_weights_log();
01166 };
01167 return false;
01168 }
01169
01170
01171
01172 void* CWeightedDegreePositionStringKernel::compute_batch_helper(void* p)
01173 {
01174 S_THREAD_PARAM<DNATrie>* params = (S_THREAD_PARAM<DNATrie>*) p;
01175 int32_t j=params->j;
01176 CWeightedDegreePositionStringKernel* wd=params->kernel;
01177 CTrie<DNATrie>* tries=params->tries;
01178 float64_t* weights=params->weights;
01179 int32_t length=params->length;
01180 int32_t max_shift=params->max_shift;
01181 int32_t* vec=params->vec;
01182 float64_t* result=params->result;
01183 float64_t factor=params->factor;
01184 int32_t* shift=params->shift;
01185 int32_t* vec_idx=params->vec_idx;
01186
01187 for (int32_t i=params->start; i<params->end; i++)
01188 {
01189 int32_t len=0;
01190 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
01191 CAlphabet* alpha=wd->alphabet;
01192
01193 bool free_vec;
01194 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
01195 for (int32_t k=CMath::max(0,j-max_shift); k<CMath::min(len,j+wd->get_degree()+max_shift); k++)
01196 vec[k]=alpha->remap_to_bin(char_vec[k]);
01197 rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
01198
01199 SG_UNREF(rhs_feat);
01200
01201 result[i] += factor*wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
01202
01203 if (wd->get_optimization_type()==SLOWBUTMEMEFFICIENT)
01204 {
01205 for (int32_t q=CMath::max(0,j-max_shift); q<CMath::min(len,j+max_shift+1); q++)
01206 {
01207 int32_t s=j-q ;
01208 if ((s>=1) && (s<=shift[q]) && (q+s<len))
01209 {
01210 result[i] +=
01211 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec,
01212 len, q, q+s, q, weights, (length!=0)),
01213 vec_idx[i])/(2.0*s);
01214 }
01215 }
01216
01217 for (int32_t s=1; (s<=shift[j]) && (j+s<len); s++)
01218 {
01219 result[i] +=
01220 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec,
01221 len, j+s, j, j+s, weights, (length!=0)),
01222 vec_idx[i])/(2.0*s);
01223 }
01224 }
01225 }
01226
01227 return NULL;
01228 }
01229
01230 void CWeightedDegreePositionStringKernel::compute_batch(
01231 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
01232 int32_t* IDX, float64_t* alphas, float64_t factor)
01233 {
01234 ASSERT(alphabet);
01235 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01236 ASSERT(position_weights_lhs==NULL);
01237 ASSERT(position_weights_rhs==NULL);
01238 ASSERT(rhs);
01239 ASSERT(num_vec<=rhs->get_num_vectors());
01240 ASSERT(num_vec>0);
01241 ASSERT(vec_idx);
01242 ASSERT(result);
01243 create_empty_tries();
01244
01245 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01246 ASSERT(num_feat>0);
01247 int32_t num_threads=parallel->get_num_threads();
01248 ASSERT(num_threads>0);
01249 int32_t* vec=SG_MALLOC(int32_t, num_threads*num_feat);
01250
01251 if (num_threads < 2)
01252 {
01253 #ifdef WIN32
01254 for (int32_t j=0; j<num_feat; j++)
01255 #else
01256 CSignal::clear_cancel();
01257 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01258 #endif
01259 {
01260 init_optimization(num_suppvec, IDX, alphas, j);
01261 S_THREAD_PARAM<DNATrie> params;
01262 params.vec=vec;
01263 params.result=result;
01264 params.weights=weights;
01265 params.kernel=this;
01266 params.tries=&tries;
01267 params.factor=factor;
01268 params.j=j;
01269 params.start=0;
01270 params.end=num_vec;
01271 params.length=length;
01272 params.max_shift=max_shift;
01273 params.shift=shift;
01274 params.vec_idx=vec_idx;
01275 compute_batch_helper((void*) ¶ms);
01276
01277 SG_PROGRESS(j,0,num_feat);
01278 }
01279 }
01280 #ifdef HAVE_PTHREAD
01281 else
01282 {
01283
01284 CSignal::clear_cancel();
01285 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01286 {
01287 init_optimization(num_suppvec, IDX, alphas, j);
01288 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
01289 S_THREAD_PARAM<DNATrie>* params = SG_MALLOC(S_THREAD_PARAM<DNATrie>, num_threads);
01290 int32_t step= num_vec/num_threads;
01291 int32_t t;
01292
01293 for (t=0; t<num_threads-1; t++)
01294 {
01295 params[t].vec=&vec[num_feat*t];
01296 params[t].result=result;
01297 params[t].weights=weights;
01298 params[t].kernel=this;
01299 params[t].tries=&tries;
01300 params[t].factor=factor;
01301 params[t].j=j;
01302 params[t].start = t*step;
01303 params[t].end = (t+1)*step;
01304 params[t].length=length;
01305 params[t].max_shift=max_shift;
01306 params[t].shift=shift;
01307 params[t].vec_idx=vec_idx;
01308 pthread_create(&threads[t], NULL, CWeightedDegreePositionStringKernel::compute_batch_helper, (void*)¶ms[t]);
01309 }
01310
01311 params[t].vec=&vec[num_feat*t];
01312 params[t].result=result;
01313 params[t].weights=weights;
01314 params[t].kernel=this;
01315 params[t].tries=&tries;
01316 params[t].factor=factor;
01317 params[t].j=j;
01318 params[t].start=t*step;
01319 params[t].end=num_vec;
01320 params[t].length=length;
01321 params[t].max_shift=max_shift;
01322 params[t].shift=shift;
01323 params[t].vec_idx=vec_idx;
01324 compute_batch_helper((void*) ¶ms[t]);
01325
01326 for (t=0; t<num_threads-1; t++)
01327 pthread_join(threads[t], NULL);
01328 SG_PROGRESS(j,0,num_feat);
01329
01330 SG_FREE(params);
01331 SG_FREE(threads);
01332 }
01333 }
01334 #endif
01335
01336 SG_FREE(vec);
01337
01338
01339
01340 create_empty_tries();
01341 }
01342
01343 float64_t* CWeightedDegreePositionStringKernel::compute_scoring(
01344 int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* result,
01345 int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01346 {
01347 ASSERT(position_weights_lhs==NULL);
01348 ASSERT(position_weights_rhs==NULL);
01349
01350 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01351 ASSERT(num_feat>0);
01352 ASSERT(alphabet);
01353 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01354
01355 num_sym=4;
01356
01357 ASSERT(max_degree>0);
01358
01359
01360 int32_t* nofsKmers=SG_MALLOC(int32_t, max_degree);
01361 float64_t** C=SG_MALLOC(float64_t*, max_degree);
01362 float64_t** L=SG_MALLOC(float64_t*, max_degree);
01363 float64_t** R=SG_MALLOC(float64_t*, max_degree);
01364
01365 int32_t i;
01366 int32_t k;
01367
01368
01369 int32_t bigtabSize=0;
01370 for (k=0; k<max_degree; ++k )
01371 {
01372 nofsKmers[k]=(int32_t) CMath::pow(num_sym, k+1);
01373 const int32_t tabSize=nofsKmers[k]*num_feat;
01374 bigtabSize+=tabSize;
01375 }
01376 result=SG_MALLOC(float64_t, bigtabSize);
01377
01378
01379 int32_t tabOffs=0;
01380 for( k = 0; k < max_degree; ++k )
01381 {
01382 const int32_t tabSize = nofsKmers[k] * num_feat;
01383 C[k] = &result[tabOffs];
01384 L[k] = SG_MALLOC(float64_t, tabSize );
01385 R[k] = SG_MALLOC(float64_t, tabSize );
01386 tabOffs+=tabSize;
01387 for(i = 0; i < tabSize; i++ )
01388 {
01389 C[k][i] = 0.0;
01390 L[k][i] = 0.0;
01391 R[k][i] = 0.0;
01392 }
01393 }
01394
01395
01396 float64_t* margFactors=SG_MALLOC(float64_t, degree);
01397
01398 int32_t* x = SG_MALLOC(int32_t, degree+1 );
01399 int32_t* substrs = SG_MALLOC(int32_t, degree+1 );
01400
01401 margFactors[0] = 1.0;
01402 substrs[0] = 0;
01403 for( k=1; k < degree; ++k ) {
01404 margFactors[k] = 0.25 * margFactors[k-1];
01405 substrs[k] = -1;
01406 }
01407 substrs[degree] = -1;
01408
01409 struct TreeParseInfo info;
01410 info.num_sym = num_sym;
01411 info.num_feat = num_feat;
01412 info.p = -1;
01413 info.k = -1;
01414 info.nofsKmers = nofsKmers;
01415 info.margFactors = margFactors;
01416 info.x = x;
01417 info.substrs = substrs;
01418 info.y0 = 0;
01419 info.C_k = NULL;
01420 info.L_k = NULL;
01421 info.R_k = NULL;
01422
01423
01424 i = 0;
01425 for( k = 0; k < max_degree; ++k )
01426 {
01427 const int32_t nofKmers = nofsKmers[ k ];
01428 info.C_k = C[k];
01429 info.L_k = L[k];
01430 info.R_k = R[k];
01431
01432
01433 for(int32_t p = 0; p < num_feat; ++p )
01434 {
01435 init_optimization( num_suppvec, IDX, alphas, p );
01436 int32_t tree = p ;
01437 for(int32_t j = 0; j < degree+1; j++ ) {
01438 x[j] = -1;
01439 }
01440 tries.traverse( tree, p, info, 0, x, k );
01441 SG_PROGRESS(i++,0,num_feat*max_degree);
01442 }
01443
01444
01445 if( k > 0 ) {
01446 const int32_t j = k - 1;
01447 const int32_t nofJmers = (int32_t) CMath::pow( num_sym, j+1 );
01448 for(int32_t p = 0; p < num_feat; ++p ) {
01449 const int32_t offsetJ = nofJmers * p;
01450 const int32_t offsetJ1 = nofJmers * (p+1);
01451 const int32_t offsetK = nofKmers * p;
01452 int32_t y;
01453 int32_t sym;
01454 for( y = 0; y < nofJmers; ++y ) {
01455 for( sym = 0; sym < num_sym; ++sym ) {
01456 const int32_t y_sym = num_sym*y + sym;
01457 const int32_t sym_y = nofJmers*sym + y;
01458 ASSERT(0<=y_sym && y_sym<nofKmers);
01459 ASSERT(0<=sym_y && sym_y<nofKmers);
01460 C[k][ y_sym + offsetK ] += L[j][ y + offsetJ ];
01461 if( p < num_feat-1 ) {
01462 C[k][ sym_y + offsetK ] += R[j][ y + offsetJ1 ];
01463 }
01464 }
01465 }
01466 }
01467 }
01468
01469
01470
01471
01472
01473
01474
01475
01476
01477
01478
01479 }
01480
01481
01482 num_feat=1;
01483 num_sym = bigtabSize;
01484
01485 SG_FREE(nofsKmers);
01486 SG_FREE(margFactors);
01487 SG_FREE(substrs);
01488 SG_FREE(x);
01489 SG_FREE(C);
01490 for( k = 0; k < max_degree; ++k ) {
01491 SG_FREE(L[k]);
01492 SG_FREE(R[k]);
01493 }
01494 SG_FREE(L);
01495 SG_FREE(R);
01496 return result;
01497 }
01498
01499 char* CWeightedDegreePositionStringKernel::compute_consensus(
01500 int32_t &num_feat, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01501 {
01502 ASSERT(position_weights_lhs==NULL);
01503 ASSERT(position_weights_rhs==NULL);
01504
01505 ASSERT(degree<=32);
01506 ASSERT(!tries.get_use_compact_terminal_nodes());
01507 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01508 ASSERT(num_feat>0);
01509 ASSERT(alphabet);
01510 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01511
01512
01513 char* result=SG_MALLOC(char, num_feat);
01514
01515
01516 int32_t num_tables=CMath::max(1,num_feat-degree+1);
01517 DynArray<ConsensusEntry>** table=SG_MALLOC(DynArray<ConsensusEntry>*, num_tables);
01518
01519 for (int32_t i=0; i<num_tables; i++)
01520 table[i]=new DynArray<ConsensusEntry>(num_suppvec/10);
01521
01522
01523 for (int32_t i=0; i<num_tables; i++)
01524 {
01525 bool cumulative=false;
01526
01527 if (i<num_tables-1)
01528 init_optimization(num_suppvec, IDX, alphas, i);
01529 else
01530 {
01531 init_optimization(num_suppvec, IDX, alphas, i, num_feat-1);
01532 cumulative=true;
01533 }
01534
01535 if (i==0)
01536 tries.fill_backtracking_table(i, NULL, table[i], cumulative, weights);
01537 else
01538 tries.fill_backtracking_table(i, table[i-1], table[i], cumulative, weights);
01539
01540 SG_PROGRESS(i,0,num_feat);
01541 }
01542
01543
01544
01545
01546
01547
01548
01549
01550
01551
01552
01553
01554
01555
01556
01557
01558
01559
01560
01561
01562
01563
01564
01565 const char* acgt="ACGT";
01566
01567
01568 int32_t max_idx=-1;
01569 float32_t max_score=0;
01570 int32_t num_elements=table[num_tables-1]->get_num_elements();
01571
01572 for (int32_t i=0; i<num_elements; i++)
01573 {
01574 float64_t sc=table[num_tables-1]->get_element(i).score;
01575 if (sc>max_score || max_idx==-1)
01576 {
01577 max_idx=i;
01578 max_score=sc;
01579 }
01580 }
01581 uint64_t endstr=table[num_tables-1]->get_element(max_idx).string;
01582
01583 SG_INFO("max_idx:%d num_el:%d num_feat:%d num_tables:%d max_score:%f\n", max_idx, num_elements, num_feat, num_tables, max_score);
01584
01585 for (int32_t i=0; i<degree; i++)
01586 result[num_feat-1-i]=acgt[(endstr >> (2*i)) & 3];
01587
01588 if (num_tables>1)
01589 {
01590 for (int32_t i=num_tables-1; i>=0; i--)
01591 {
01592
01593 result[i]=acgt[table[i]->get_element(max_idx).string >> (2*(degree-1)) & 3];
01594 max_idx=table[i]->get_element(max_idx).bt;
01595 }
01596 }
01597
01598
01599
01600
01601
01602
01603
01604
01605
01606
01607
01608 for (int32_t i=0; i<num_tables; i++)
01609 delete table[i];
01610
01611 SG_FREE(table);
01612 return result;
01613 }
01614
01615
01616 float64_t* CWeightedDegreePositionStringKernel::extract_w(
01617 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
01618 float64_t* w_result, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01619 {
01620 delete_optimization();
01621 use_poim_tries=true;
01622 poim_tries.delete_trees(false);
01623
01624
01625 ASSERT(position_weights_lhs==NULL);
01626 ASSERT(position_weights_rhs==NULL);
01627 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01628 ASSERT(num_feat>0);
01629 ASSERT(alphabet->get_alphabet()==DNA);
01630 ASSERT(max_degree>0);
01631
01632
01633 static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
01634 const int32_t seqLen = num_feat;
01635 float64_t** subs;
01636 int32_t i;
01637 int32_t k;
01638
01639
01640
01641
01642 int32_t* offsets;
01643 int32_t offset;
01644 offsets = SG_MALLOC(int32_t, max_degree );
01645 offset = 0;
01646 for( k = 0; k < max_degree; ++k ) {
01647 offsets[k] = offset;
01648 const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
01649 const int32_t tabSize = nofsKmers * seqLen;
01650 offset += tabSize;
01651 }
01652
01653 const int32_t bigTabSize = offset;
01654 w_result=SG_MALLOC(float64_t, bigTabSize);
01655 for (i=0; i<bigTabSize; ++i)
01656 w_result[i]=0;
01657
01658
01659 subs = SG_MALLOC(float64_t*, max_degree );
01660 ASSERT( subs != NULL );
01661 for( k = 0; k < max_degree; ++k ) {
01662 subs[k] = &w_result[ offsets[k] ];
01663 }
01664 SG_FREE(offsets);
01665
01666
01667 init_optimization( num_suppvec, IDX, alphas, -1);
01668 poim_tries.POIMs_extract_W( subs, max_degree );
01669
01670
01671 SG_FREE(subs);
01672 num_feat = 1;
01673 num_sym = bigTabSize;
01674 use_poim_tries=false;
01675 poim_tries.delete_trees(false);
01676 return w_result;
01677 }
01678
01679 float64_t* CWeightedDegreePositionStringKernel::compute_POIM(
01680 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
01681 float64_t* poim_result, int32_t num_suppvec, int32_t* IDX,
01682 float64_t* alphas, float64_t* distrib )
01683 {
01684 delete_optimization();
01685 use_poim_tries=true;
01686 poim_tries.delete_trees(false);
01687
01688
01689 ASSERT(position_weights_lhs==NULL);
01690 ASSERT(position_weights_rhs==NULL);
01691 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01692 ASSERT(num_feat>0);
01693 ASSERT(alphabet->get_alphabet()==DNA);
01694 ASSERT(max_degree!=0);
01695 ASSERT(distrib);
01696
01697
01698 static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
01699 const int32_t seqLen = num_feat;
01700 float64_t** subs;
01701 int32_t i;
01702 int32_t k;
01703
01704
01705
01706
01707
01708
01709
01710
01711
01712
01713
01714
01715 const int32_t debug = ( max_degree < 0 ) ? ( abs(max_degree) % 4 + 1 ) : 0;
01716 if( debug ) {
01717 max_degree = abs(max_degree) / 4;
01718 switch( debug ) {
01719 case 1: {
01720 printf( "POIM DEBUGGING: substring only (max order=%d)\n", max_degree );
01721 break;
01722 }
01723 case 2: {
01724 printf( "POIM DEBUGGING: superstring only (max order=%d)\n", max_degree );
01725 break;
01726 }
01727 case 3: {
01728 printf( "POIM DEBUGGING: left overlap only (max order=%d)\n", max_degree );
01729 break;
01730 }
01731 case 4: {
01732 printf( "POIM DEBUGGING: right overlap only (max order=%d)\n", max_degree );
01733 break;
01734 }
01735 default: {
01736 printf( "POIM DEBUGGING: something is wrong (max order=%d)\n", max_degree );
01737 ASSERT(0);
01738 break;
01739 }
01740 }
01741 }
01742
01743
01744 int32_t* offsets;
01745 int32_t offset;
01746 offsets = SG_MALLOC(int32_t, max_degree );
01747 offset = 0;
01748 for( k = 0; k < max_degree; ++k ) {
01749 offsets[k] = offset;
01750 const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
01751 const int32_t tabSize = nofsKmers * seqLen;
01752 offset += tabSize;
01753 }
01754
01755 const int32_t bigTabSize=offset;
01756 poim_result=SG_MALLOC(float64_t, bigTabSize);
01757 for (i=0; i<bigTabSize; ++i )
01758 poim_result[i]=0;
01759
01760
01761 subs=SG_MALLOC(float64_t*, max_degree);
01762 for (k=0; k<max_degree; ++k)
01763 subs[k]=&poim_result[offsets[k]];
01764
01765 SG_FREE(offsets);
01766
01767
01768 init_optimization( num_suppvec, IDX, alphas, -1);
01769 poim_tries.POIMs_precalc_SLR( distrib );
01770
01771
01772 if( debug==0 || debug==1 ) {
01773 poim_tries.POIMs_extract_W( subs, max_degree );
01774 for( k = 1; k < max_degree; ++k ) {
01775 const int32_t nofKmers2 = ( k > 1 ) ? (int32_t) CMath::pow(NUM_SYMS,k-1) : 0;
01776 const int32_t nofKmers1 = (int32_t) CMath::pow( NUM_SYMS, k );
01777 const int32_t nofKmers0 = nofKmers1 * NUM_SYMS;
01778 for( i = 0; i < seqLen; ++i ) {
01779 float64_t* const subs_k2i1 = ( k>1 && i<seqLen-1 ) ? &subs[k-2][(i+1)*nofKmers2] : NULL;
01780 float64_t* const subs_k1i1 = ( i < seqLen-1 ) ? &subs[k-1][(i+1)*nofKmers1] : NULL;
01781 float64_t* const subs_k1i0 = & subs[ k-1 ][ i*nofKmers1 ];
01782 float64_t* const subs_k0i = & subs[ k-0 ][ i*nofKmers0 ];
01783 int32_t y0;
01784 for( y0 = 0; y0 < nofKmers0; ++y0 ) {
01785 const int32_t y1l = y0 / NUM_SYMS;
01786 const int32_t y1r = y0 % nofKmers1;
01787 const int32_t y2 = y1r / NUM_SYMS;
01788 subs_k0i[ y0 ] += subs_k1i0[ y1l ];
01789 if( i < seqLen-1 ) {
01790 subs_k0i[ y0 ] += subs_k1i1[ y1r ];
01791 if( k > 1 ) {
01792 subs_k0i[ y0 ] -= subs_k2i1[ y2 ];
01793 }
01794 }
01795 }
01796 }
01797 }
01798 }
01799
01800
01801 poim_tries.POIMs_add_SLR( subs, max_degree, debug );
01802
01803
01804 SG_FREE(subs);
01805 num_feat = 1;
01806 num_sym = bigTabSize;
01807
01808 use_poim_tries=false;
01809 poim_tries.delete_trees(false);
01810
01811 return poim_result;
01812 }
01813
01814
01815 void CWeightedDegreePositionStringKernel::prepare_POIM2(
01816 float64_t* distrib, int32_t num_sym, int32_t num_feat)
01817 {
01818 SG_FREE(m_poim_distrib);
01819 m_poim_distrib=SG_MALLOC(float64_t, num_sym*num_feat);
01820
01821 memcpy(m_poim_distrib, distrib, num_sym*num_feat*sizeof(float64_t));
01822 m_poim_num_sym=num_sym;
01823 m_poim_num_feat=num_feat;
01824 }
01825
01826 void CWeightedDegreePositionStringKernel::compute_POIM2(
01827 int32_t max_degree, CSVM* svm)
01828 {
01829 ASSERT(svm);
01830 int32_t num_suppvec=svm->get_num_support_vectors();
01831 int32_t* sv_idx=SG_MALLOC(int32_t, num_suppvec);
01832 float64_t* sv_weight=SG_MALLOC(float64_t, num_suppvec);
01833
01834 for (int32_t i=0; i<num_suppvec; i++)
01835 {
01836 sv_idx[i]=svm->get_support_vector(i);
01837 sv_weight[i]=svm->get_alpha(i);
01838 }
01839
01840 if ((max_degree < 1) || (max_degree > 12))
01841 {
01842
01843 SG_WARNING( "max_degree out of range 1..12 (%d). setting to 1.\n", max_degree);
01844 max_degree=1;
01845 }
01846
01847 int32_t num_feat = m_poim_num_feat;
01848 int32_t num_sym = m_poim_num_sym;
01849 SG_FREE(m_poim);
01850
01851 m_poim = compute_POIM(max_degree, num_feat, num_sym, NULL, num_suppvec, sv_idx,
01852 sv_weight, m_poim_distrib);
01853
01854 ASSERT(num_feat==1);
01855 m_poim_result_len=num_sym;
01856
01857 SG_FREE(sv_weight);
01858 SG_FREE(sv_idx);
01859 }
01860
01861 void CWeightedDegreePositionStringKernel::get_POIM2(
01862 float64_t** poim, int32_t* result_len)
01863 {
01864 *poim=SG_MALLOC(float64_t, m_poim_result_len);
01865 ASSERT(*poim);
01866 memcpy(*poim, m_poim, m_poim_result_len*sizeof(float64_t)) ;
01867 *result_len=m_poim_result_len ;
01868 }
01869
01870 void CWeightedDegreePositionStringKernel::cleanup_POIM2()
01871 {
01872 SG_FREE(m_poim) ;
01873 m_poim=NULL ;
01874 SG_FREE(m_poim_distrib) ;
01875 m_poim_distrib=NULL ;
01876 m_poim_num_sym=0 ;
01877 m_poim_num_sym=0 ;
01878 m_poim_result_len=0 ;
01879 }
01880
01881 void CWeightedDegreePositionStringKernel::load_serializable_post() throw (ShogunException)
01882 {
01883 CKernel::load_serializable_post();
01884
01885 tries=CTrie<DNATrie>(degree);
01886 poim_tries=CTrie<POIMTrie>(degree);
01887
01888 if (weights)
01889 init_block_weights();
01890 }
01891
01892 void CWeightedDegreePositionStringKernel::init()
01893 {
01894 weights=NULL;
01895 position_weights=NULL;
01896 position_weights_len=0;
01897
01898 position_weights_lhs=NULL;
01899 position_weights_lhs_len=0;
01900 position_weights_rhs=NULL;
01901 position_weights_rhs_len=0;
01902
01903 weights_buffer=NULL;
01904 mkl_stepsize=1;
01905 degree=1;
01906 length=0;
01907
01908 max_shift=0;
01909 max_mismatch=0;
01910 seq_length=0;
01911 shift=NULL;
01912 shift_len=0;
01913
01914 block_weights=NULL;
01915 block_computation=true;
01916 type=E_EXTERNAL;
01917 which_degree=-1;
01918 tries=CTrie<DNATrie>(1);
01919 poim_tries=CTrie<POIMTrie>(1);
01920
01921 tree_initialized=false;
01922 use_poim_tries=false;
01923 m_poim_distrib=NULL;
01924
01925 m_poim=NULL;
01926 m_poim_num_sym=0;
01927 m_poim_num_feat=0;
01928 m_poim_result_len=0;
01929
01930 alphabet=NULL;
01931
01932 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
01933
01934 set_normalizer(new CSqrtDiagKernelNormalizer());
01935
01936 m_parameters->add_matrix(&weights, &weights_degree, &weights_length,
01937 "weights", "WD Kernel weights.");
01938 m_parameters->add_vector(&position_weights, &position_weights_len,
01939 "position_weights",
01940 "Weights per position.");
01941 m_parameters->add_vector(&position_weights_lhs, &position_weights_lhs_len,
01942 "position_weights_lhs",
01943 "Weights per position left hand side.");
01944 m_parameters->add_vector(&position_weights_rhs, &position_weights_rhs_len,
01945 "position_weights_rhs",
01946 "Weights per position right hand side.");
01947 m_parameters->add_vector(&shift, &shift_len,
01948 "shift",
01949 "Shift Vector.");
01950 m_parameters->add(&mkl_stepsize, "mkl_stepsize", "MKL step size.");
01951 m_parameters->add(°ree, "degree", "Order of WD kernel.");
01952 m_parameters->add(&max_mismatch, "max_mismatch",
01953 "Number of allowed mismatches.");
01954 m_parameters->add(&block_computation, "block_computation",
01955 "If block computation shall be used.");
01956 m_parameters->add((machine_int_t*) &type, "type",
01957 "WeightedDegree kernel type.");
01958 m_parameters->add(&which_degree, "which_degree",
01959 "Unqueal -1 if just a single degree is selected.");
01960 m_parameters->add((CSGObject**) &alphabet, "alphabet",
01961 "Alphabet of Features.");
01962 }