00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/lib/common.h>
00013 #include <shogun/io/SGIO.h>
00014 #include <shogun/lib/Signal.h>
00015 #include <shogun/lib/Trie.h>
00016 #include <shogun/base/Parameter.h>
00017 #include <shogun/base/Parallel.h>
00018
00019 #include <shogun/kernel/string/WeightedDegreeStringKernel.h>
00020 #include <shogun/kernel/normalizer/FirstElementKernelNormalizer.h>
00021 #include <shogun/features/Features.h>
00022 #include <shogun/features/StringFeatures.h>
00023
00024 #ifndef WIN32
00025 #include <pthread.h>
00026 #endif
00027
00028 using namespace shogun;
00029
00030 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00031 struct S_THREAD_PARAM
00032 {
00033
00034 int32_t* vec;
00035 float64_t* result;
00036 float64_t* weights;
00037 CWeightedDegreeStringKernel* kernel;
00038 CTrie<DNATrie>* tries;
00039 float64_t factor;
00040 int32_t j;
00041 int32_t start;
00042 int32_t end;
00043 int32_t length;
00044 int32_t* vec_idx;
00045 };
00046 #endif // DOXYGEN_SHOULD_SKIP_THIS
00047
00048 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel ()
00049 : CStringKernel<char>()
00050 {
00051 init();
00052 }
00053
00054
00055 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00056 int32_t d, EWDKernType t)
00057 : CStringKernel<char>()
00058 {
00059 init();
00060
00061 degree=d;
00062 type=t;
00063
00064 if (type!=E_EXTERNAL)
00065 set_wd_weights_by_type(type);
00066 }
00067
00068 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(SGVector<float64_t> w)
00069 : CStringKernel<char>(10)
00070 {
00071 init();
00072
00073 type=E_EXTERNAL;
00074 degree=w.vlen;
00075
00076 weights=SG_MALLOC(float64_t, degree*(1+max_mismatch));
00077 weights_degree=degree;
00078 weights_length=(1+max_mismatch);
00079
00080 for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00081 weights[i]=w.vector[i];
00082 }
00083
00084 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00085 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t d)
00086 : CStringKernel<char>(10)
00087 {
00088 init();
00089 degree=d;
00090 type=E_WD;
00091 set_wd_weights_by_type(type);
00092 set_normalizer(new CFirstElementKernelNormalizer());
00093 init(l, r);
00094 }
00095
00096 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00097 {
00098 cleanup();
00099
00100 SG_FREE(weights);
00101 weights=NULL;
00102 weights_degree=0;
00103 weights_length=0;
00104
00105 SG_FREE(block_weights);
00106 block_weights=NULL;
00107
00108 SG_FREE(position_weights);
00109 position_weights=NULL;
00110
00111 SG_FREE(weights_buffer);
00112 weights_buffer=NULL;
00113 }
00114
00115
00116 void CWeightedDegreeStringKernel::remove_lhs()
00117 {
00118 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00119 delete_optimization();
00120
00121 if (tries!=NULL)
00122 tries->destroy();
00123
00124 CKernel::remove_lhs();
00125 }
00126
00127 void CWeightedDegreeStringKernel::create_empty_tries()
00128 {
00129 ASSERT(lhs);
00130
00131 seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length();
00132
00133 if (tries!=NULL)
00134 {
00135 tries->destroy() ;
00136 tries->create(seq_length, max_mismatch==0) ;
00137 }
00138 }
00139
00140 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00141 {
00142 int32_t lhs_changed=(lhs!=l);
00143 int32_t rhs_changed=(rhs!=r);
00144
00145 CStringKernel<char>::init(l,r);
00146
00147 SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00148 SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00149
00150 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00151 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00152
00153 int32_t len=sf_l->get_max_vector_length();
00154 if (lhs_changed && !sf_l->have_same_length(len))
00155 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00156
00157 if (rhs_changed && !sf_r->have_same_length(len))
00158 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00159
00160 SG_UNREF(alphabet);
00161 alphabet=sf_l->get_alphabet();
00162 CAlphabet* ralphabet=sf_r->get_alphabet();
00163
00164 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00165 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00166
00167 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00168 SG_UNREF(ralphabet);
00169
00170 if (tries!=NULL) {
00171 tries->delete_trees(max_mismatch==0);
00172 SG_UNREF(tries);
00173 }
00174 tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00175 create_empty_tries();
00176
00177 init_block_weights();
00178
00179 return init_normalizer();
00180 }
00181
00182 void CWeightedDegreeStringKernel::cleanup()
00183 {
00184 SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00185 delete_optimization();
00186
00187 SG_FREE(block_weights);
00188 block_weights=NULL;
00189
00190 if (tries!=NULL)
00191 {
00192 tries->destroy();
00193 SG_UNREF(tries);
00194 tries=NULL;
00195 }
00196
00197 seq_length=0;
00198 tree_initialized = false;
00199
00200 SG_UNREF(alphabet);
00201 alphabet=NULL;
00202
00203 CKernel::cleanup();
00204 }
00205
00206 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num)
00207 {
00208 if (tree_num<0)
00209 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00210
00211 delete_optimization();
00212
00213 if (tree_num<0)
00214 SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00215
00216 for (int32_t i=0; i<count; i++)
00217 {
00218 if (tree_num<0)
00219 {
00220 if ( (i % (count/10+1)) == 0)
00221 SG_PROGRESS(i, 0, count);
00222
00223 if (max_mismatch==0)
00224 add_example_to_tree(IDX[i], alphas[i]) ;
00225 else
00226 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00227
00228
00229 }
00230 else
00231 {
00232 if (max_mismatch==0)
00233 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00234 else
00235 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00236 }
00237 }
00238
00239 if (tree_num<0)
00240 SG_DONE();
00241
00242
00243
00244 set_is_initialized(true) ;
00245 return true ;
00246 }
00247
00248 bool CWeightedDegreeStringKernel::delete_optimization()
00249 {
00250 if (get_is_initialized())
00251 {
00252 if (tries!=NULL)
00253 tries->delete_trees(max_mismatch==0);
00254 set_is_initialized(false);
00255 return true;
00256 }
00257
00258 return false;
00259 }
00260
00261
00262 float64_t CWeightedDegreeStringKernel::compute_with_mismatch(
00263 char* avec, int32_t alen, char* bvec, int32_t blen)
00264 {
00265 float64_t sum = 0.0;
00266
00267 for (int32_t i=0; i<alen; i++)
00268 {
00269 float64_t sumi = 0.0;
00270 int32_t mismatches=0;
00271
00272 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00273 {
00274 if (avec[i+j]!=bvec[i+j])
00275 {
00276 mismatches++ ;
00277 if (mismatches>max_mismatch)
00278 break ;
00279 } ;
00280 sumi += weights[j+degree*mismatches];
00281 }
00282 if (position_weights!=NULL)
00283 sum+=position_weights[i]*sumi ;
00284 else
00285 sum+=sumi ;
00286 }
00287 return sum ;
00288 }
00289
00290 float64_t CWeightedDegreeStringKernel::compute_using_block(
00291 char* avec, int32_t alen, char* bvec, int32_t blen)
00292 {
00293 ASSERT(alen==blen);
00294
00295 float64_t sum=0;
00296 int32_t match_len=-1;
00297
00298 for (int32_t i=0; i<alen; i++)
00299 {
00300 if (avec[i]==bvec[i])
00301 match_len++;
00302 else
00303 {
00304 if (match_len>=0)
00305 sum+=block_weights[match_len];
00306 match_len=-1;
00307 }
00308 }
00309
00310 if (match_len>=0)
00311 sum+=block_weights[match_len];
00312
00313 return sum;
00314 }
00315
00316 float64_t CWeightedDegreeStringKernel::compute_without_mismatch(
00317 char* avec, int32_t alen, char* bvec, int32_t blen)
00318 {
00319 float64_t sum = 0.0;
00320
00321 for (int32_t i=0; i<alen; i++)
00322 {
00323 float64_t sumi = 0.0;
00324
00325 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00326 {
00327 if (avec[i+j]!=bvec[i+j])
00328 break ;
00329 sumi += weights[j];
00330 }
00331 if (position_weights!=NULL)
00332 sum+=position_weights[i]*sumi ;
00333 else
00334 sum+=sumi ;
00335 }
00336 return sum ;
00337 }
00338
00339 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix(
00340 char* avec, int32_t alen, char* bvec, int32_t blen)
00341 {
00342 float64_t sum = 0.0;
00343
00344 for (int32_t i=0; i<alen; i++)
00345 {
00346 float64_t sumi=0.0;
00347 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00348 {
00349 if (avec[i+j]!=bvec[i+j])
00350 break;
00351 sumi += weights[i*degree+j];
00352 }
00353 if (position_weights!=NULL)
00354 sum += position_weights[i]*sumi ;
00355 else
00356 sum += sumi ;
00357 }
00358
00359 return sum ;
00360 }
00361
00362
00363 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b)
00364 {
00365 int32_t alen, blen;
00366 bool free_avec, free_bvec;
00367 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00368 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00369 float64_t result=0;
00370
00371 if (max_mismatch==0 && length==0 && block_computation)
00372 result=compute_using_block(avec, alen, bvec, blen);
00373 else
00374 {
00375 if (max_mismatch>0)
00376 result=compute_with_mismatch(avec, alen, bvec, blen);
00377 else if (length==0)
00378 result=compute_without_mismatch(avec, alen, bvec, blen);
00379 else
00380 result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00381 }
00382 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00383 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00384
00385 return result;
00386 }
00387
00388
00389 void CWeightedDegreeStringKernel::add_example_to_tree(
00390 int32_t idx, float64_t alpha)
00391 {
00392 ASSERT(alphabet);
00393 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00394
00395 int32_t len=0;
00396 bool free_vec;
00397 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00398 ASSERT(max_mismatch==0);
00399 int32_t *vec=SG_MALLOC(int32_t, len);
00400
00401 for (int32_t i=0; i<len; i++)
00402 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00403 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00404
00405 if (length == 0 || max_mismatch > 0)
00406 {
00407 for (int32_t i=0; i<len; i++)
00408 {
00409 float64_t alpha_pw=alpha;
00410
00411
00412 if (alpha_pw==0.0)
00413 continue;
00414 ASSERT(tries);
00415 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00416 }
00417 }
00418 else
00419 {
00420 for (int32_t i=0; i<len; i++)
00421 {
00422 float64_t alpha_pw=alpha;
00423
00424
00425 if (alpha_pw==0.0)
00426 continue ;
00427 ASSERT(tries);
00428 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00429 }
00430 }
00431 SG_FREE(vec);
00432 tree_initialized=true ;
00433 }
00434
00435 void CWeightedDegreeStringKernel::add_example_to_single_tree(
00436 int32_t idx, float64_t alpha, int32_t tree_num)
00437 {
00438 ASSERT(alphabet);
00439 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00440
00441 int32_t len;
00442 bool free_vec;
00443 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00444 ASSERT(max_mismatch==0);
00445 int32_t *vec = SG_MALLOC(int32_t, len);
00446
00447 for (int32_t i=tree_num; i<tree_num+degree && i<len; i++)
00448 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00449 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00450
00451
00452 ASSERT(tries);
00453 if (alpha!=0.0)
00454 tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0));
00455
00456 SG_FREE(vec);
00457 tree_initialized=true ;
00458 }
00459
00460 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha)
00461 {
00462 ASSERT(tries);
00463 ASSERT(alphabet);
00464 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00465
00466 int32_t len ;
00467 bool free_vec;
00468 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00469
00470 int32_t *vec = SG_MALLOC(int32_t, len);
00471
00472 for (int32_t i=0; i<len; i++)
00473 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00474 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00475
00476 for (int32_t i=0; i<len; i++)
00477 {
00478 if (alpha!=0.0)
00479 tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights);
00480 }
00481
00482 SG_FREE(vec);
00483 tree_initialized=true ;
00484 }
00485
00486 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(
00487 int32_t idx, float64_t alpha, int32_t tree_num)
00488 {
00489 ASSERT(tries);
00490 ASSERT(alphabet);
00491 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00492
00493 int32_t len=0;
00494 bool free_vec;
00495 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00496 int32_t *vec=SG_MALLOC(int32_t, len);
00497
00498 for (int32_t i=tree_num; i<len && i<tree_num+degree; i++)
00499 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00500 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00501
00502 if (alpha!=0.0)
00503 {
00504 tries->add_example_to_tree_mismatch_recursion(
00505 NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num,
00506 0, 0, max_mismatch, weights);
00507 }
00508
00509 SG_FREE(vec);
00510 tree_initialized=true;
00511 }
00512
00513
00514 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx)
00515 {
00516 ASSERT(alphabet);
00517 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00518
00519 int32_t len=0;
00520 bool free_vec;
00521 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00522 ASSERT(char_vec && len>0);
00523 int32_t *vec=SG_MALLOC(int32_t, len);
00524
00525 for (int32_t i=0; i<len; i++)
00526 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00527 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00528
00529 float64_t sum=0;
00530 ASSERT(tries);
00531 for (int32_t i=0; i<len; i++)
00532 sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00533
00534 SG_FREE(vec);
00535 return normalizer->normalize_rhs(sum, idx);
00536 }
00537
00538 void CWeightedDegreeStringKernel::compute_by_tree(
00539 int32_t idx, float64_t* LevelContrib)
00540 {
00541 ASSERT(alphabet);
00542 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00543
00544 int32_t len ;
00545 bool free_vec;
00546 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00547
00548 int32_t *vec = SG_MALLOC(int32_t, len);
00549
00550 for (int32_t i=0; i<len; i++)
00551 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00552 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00553
00554 ASSERT(tries);
00555 for (int32_t i=0; i<len; i++)
00556 {
00557 tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00558 normalizer->normalize_rhs(1.0, idx),
00559 mkl_stepsize, weights, (length!=0));
00560 }
00561
00562 SG_FREE(vec);
00563 }
00564
00565 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len)
00566 {
00567 ASSERT(tries);
00568 return tries->compute_abs_weights(len);
00569 }
00570
00571 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00572 {
00573 ASSERT(degree>0);
00574 ASSERT(p_type==E_WD);
00575
00576 SG_FREE(weights);
00577 weights=SG_MALLOC(float64_t, degree);
00578 weights_degree=degree;
00579 weights_length=1;
00580
00581 if (weights)
00582 {
00583 int32_t i;
00584 float64_t sum=0;
00585 for (i=0; i<degree; i++)
00586 {
00587 weights[i]=degree-i;
00588 sum+=weights[i];
00589 }
00590 for (i=0; i<degree; i++)
00591 weights[i]/=sum;
00592
00593 for (i=0; i<degree; i++)
00594 {
00595 for (int32_t j=1; j<=max_mismatch; j++)
00596 {
00597 if (j<i+1)
00598 {
00599 int32_t nk=CMath::nchoosek(i+1, j);
00600 weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00601 }
00602 else
00603 weights[i+j*degree]= 0;
00604 }
00605 }
00606
00607 if (which_degree>=0)
00608 {
00609 ASSERT(which_degree<degree);
00610 for (i=0; i<degree; i++)
00611 {
00612 if (i!=which_degree)
00613 weights[i]=0;
00614 else
00615 weights[i]=1;
00616 }
00617 }
00618 return true;
00619 }
00620 else
00621 return false;
00622 }
00623
00624 bool CWeightedDegreeStringKernel::set_weights(SGMatrix<float64_t> new_weights)
00625 {
00626 float64_t* ws=new_weights.matrix;
00627 int32_t d=new_weights.num_rows;
00628 int32_t len=new_weights.num_cols;
00629
00630 if (d!=degree || len<0)
00631 SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree);
00632
00633 degree=d;
00634 length=len;
00635
00636 if (len <= 0)
00637 len=1;
00638
00639 weights_degree=degree;
00640 weights_length=len+max_mismatch;
00641
00642
00643 SG_DEBUG("Creating weights of size %dx%d\n", weights_degree, weights_length);
00644 int32_t num_weights=weights_degree*weights_length;
00645 SG_FREE(weights);
00646 weights=SG_MALLOC(float64_t, num_weights);
00647
00648 for (int32_t i=0; i<degree*len; i++)
00649 weights[i]=ws[i];
00650
00651 return true;
00652 }
00653
00654 bool CWeightedDegreeStringKernel::set_position_weights(
00655 float64_t* pws, int32_t len)
00656 {
00657 if (len==0)
00658 {
00659 SG_FREE(position_weights);
00660 position_weights=NULL;
00661 ASSERT(tries);
00662 tries->set_position_weights(position_weights);
00663 }
00664
00665 if (seq_length!=len)
00666 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00667
00668 SG_FREE(position_weights);
00669 position_weights=SG_MALLOC(float64_t, len);
00670 position_weights_len=len;
00671 ASSERT(tries);
00672 tries->set_position_weights(position_weights);
00673
00674 if (position_weights)
00675 {
00676 for (int32_t i=0; i<len; i++)
00677 position_weights[i]=pws[i];
00678 return true;
00679 }
00680 else
00681 return false;
00682 }
00683
00684 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00685 {
00686 SG_FREE(block_weights);
00687 block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree));
00688
00689 int32_t k;
00690 float64_t d=degree;
00691
00692 for (k=0; k<degree; k++)
00693 block_weights[k]=
00694 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
00695 for (k=degree; k<seq_length; k++)
00696 block_weights[k]=(-d+3*k+4)/3;
00697
00698 return true;
00699 }
00700
00701 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00702 {
00703 ASSERT(weights);
00704 SG_FREE(block_weights);
00705 block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree));
00706
00707 int32_t i=0;
00708 block_weights[0]=weights[0];
00709 for (i=1; i<CMath::max(seq_length,degree); i++)
00710 block_weights[i]=0;
00711
00712 for (i=1; i<CMath::max(seq_length,degree); i++)
00713 {
00714 block_weights[i]=block_weights[i-1];
00715
00716 float64_t contrib=0;
00717 for (int32_t j=0; j<CMath::min(degree,i+1); j++)
00718 contrib+=weights[j];
00719
00720 block_weights[i]+=contrib;
00721 }
00722 return true;
00723 }
00724
00725 bool CWeightedDegreeStringKernel::init_block_weights_const()
00726 {
00727 SG_FREE(block_weights);
00728 block_weights=SG_MALLOC(float64_t, seq_length);
00729
00730 for (int32_t i=1; i<seq_length+1 ; i++)
00731 block_weights[i-1]=1.0/seq_length;
00732 return true;
00733 }
00734
00735 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00736 {
00737 SG_FREE(block_weights);
00738 block_weights=SG_MALLOC(float64_t, seq_length);
00739
00740 for (int32_t i=1; i<seq_length+1 ; i++)
00741 block_weights[i-1]=degree*i;
00742
00743 return true;
00744 }
00745
00746 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00747 {
00748 SG_FREE(block_weights);
00749 block_weights=SG_MALLOC(float64_t, seq_length);
00750
00751 for (int32_t i=1; i<degree+1 ; i++)
00752 block_weights[i-1]=((float64_t) i)*i;
00753
00754 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00755 block_weights[i-1]=i;
00756
00757 return true;
00758 }
00759
00760 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00761 {
00762 SG_FREE(block_weights);
00763 block_weights=SG_MALLOC(float64_t, seq_length);
00764
00765 for (int32_t i=1; i<degree+1 ; i++)
00766 block_weights[i-1]=((float64_t) i)*i*i;
00767
00768 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00769 block_weights[i-1]=i;
00770 return true;
00771 }
00772
00773 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00774 {
00775 SG_FREE(block_weights);
00776 block_weights=SG_MALLOC(float64_t, seq_length);
00777
00778 for (int32_t i=1; i<degree+1 ; i++)
00779 block_weights[i-1]=exp(((float64_t) i/10.0));
00780
00781 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00782 block_weights[i-1]=i;
00783
00784 return true;
00785 }
00786
00787 bool CWeightedDegreeStringKernel::init_block_weights_log()
00788 {
00789 SG_FREE(block_weights);
00790 block_weights=SG_MALLOC(float64_t, seq_length);
00791
00792 for (int32_t i=1; i<degree+1 ; i++)
00793 block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
00794
00795 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00796 block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
00797
00798 return true;
00799 }
00800
00801 bool CWeightedDegreeStringKernel::init_block_weights()
00802 {
00803 switch (type)
00804 {
00805 case E_WD:
00806 return init_block_weights_from_wd();
00807 case E_EXTERNAL:
00808 return init_block_weights_from_wd_external();
00809 case E_BLOCK_CONST:
00810 return init_block_weights_const();
00811 case E_BLOCK_LINEAR:
00812 return init_block_weights_linear();
00813 case E_BLOCK_SQPOLY:
00814 return init_block_weights_sqpoly();
00815 case E_BLOCK_CUBICPOLY:
00816 return init_block_weights_cubicpoly();
00817 case E_BLOCK_EXP:
00818 return init_block_weights_exp();
00819 case E_BLOCK_LOG:
00820 return init_block_weights_log();
00821 };
00822 return false;
00823 }
00824
00825
00826 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00827 {
00828 S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00829 int32_t j=params->j;
00830 CWeightedDegreeStringKernel* wd=params->kernel;
00831 CTrie<DNATrie>* tries=params->tries;
00832 float64_t* weights=params->weights;
00833 int32_t length=params->length;
00834 int32_t* vec=params->vec;
00835 float64_t* result=params->result;
00836 float64_t factor=params->factor;
00837 int32_t* vec_idx=params->vec_idx;
00838
00839 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
00840 CAlphabet* alpha=wd->alphabet;
00841
00842 for (int32_t i=params->start; i<params->end; i++)
00843 {
00844 int32_t len=0;
00845 bool free_vec;
00846 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
00847 for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00848 vec[k]=alpha->remap_to_bin(char_vec[k]);
00849 rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
00850
00851 ASSERT(tries);
00852
00853 result[i]+=factor*
00854 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
00855 }
00856
00857 SG_UNREF(rhs_feat);
00858
00859 return NULL;
00860 }
00861
00862 void CWeightedDegreeStringKernel::compute_batch(
00863 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
00864 int32_t* IDX, float64_t* alphas, float64_t factor)
00865 {
00866 ASSERT(tries);
00867 ASSERT(alphabet);
00868 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00869 ASSERT(rhs);
00870 ASSERT(num_vec<=rhs->get_num_vectors());
00871 ASSERT(num_vec>0);
00872 ASSERT(vec_idx);
00873 ASSERT(result);
00874 create_empty_tries();
00875
00876 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
00877 ASSERT(num_feat>0);
00878 int32_t num_threads=parallel->get_num_threads();
00879 ASSERT(num_threads>0);
00880 int32_t* vec=SG_MALLOC(int32_t, num_threads*num_feat);
00881
00882 if (num_threads < 2)
00883 {
00884 #ifdef CYGWIN
00885 for (int32_t j=0; j<num_feat; j++)
00886 #else
00887 CSignal::clear_cancel();
00888 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00889 #endif
00890 {
00891 init_optimization(num_suppvec, IDX, alphas, j);
00892 S_THREAD_PARAM params;
00893 params.vec=vec;
00894 params.result=result;
00895 params.weights=weights;
00896 params.kernel=this;
00897 params.tries=tries;
00898 params.factor=factor;
00899 params.j=j;
00900 params.start=0;
00901 params.end=num_vec;
00902 params.length=length;
00903 params.vec_idx=vec_idx;
00904 compute_batch_helper((void*) ¶ms);
00905
00906 SG_PROGRESS(j,0,num_feat);
00907 }
00908 }
00909 #ifdef HAVE_PTHREAD
00910 else
00911 {
00912 CSignal::clear_cancel();
00913 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00914 {
00915 init_optimization(num_suppvec, IDX, alphas, j);
00916 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00917 S_THREAD_PARAM* params = SG_MALLOC(S_THREAD_PARAM, num_threads);
00918 int32_t step= num_vec/num_threads;
00919 int32_t t;
00920
00921 for (t=0; t<num_threads-1; t++)
00922 {
00923 params[t].vec=&vec[num_feat*t];
00924 params[t].result=result;
00925 params[t].weights=weights;
00926 params[t].kernel=this;
00927 params[t].tries=tries;
00928 params[t].factor=factor;
00929 params[t].j=j;
00930 params[t].start = t*step;
00931 params[t].end = (t+1)*step;
00932 params[t].length=length;
00933 params[t].vec_idx=vec_idx;
00934 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)¶ms[t]);
00935 }
00936 params[t].vec=&vec[num_feat*t];
00937 params[t].result=result;
00938 params[t].weights=weights;
00939 params[t].kernel=this;
00940 params[t].tries=tries;
00941 params[t].factor=factor;
00942 params[t].j=j;
00943 params[t].start=t*step;
00944 params[t].end=num_vec;
00945 params[t].length=length;
00946 params[t].vec_idx=vec_idx;
00947 compute_batch_helper((void*) ¶ms[t]);
00948
00949 for (t=0; t<num_threads-1; t++)
00950 pthread_join(threads[t], NULL);
00951 SG_PROGRESS(j,0,num_feat);
00952
00953 SG_FREE(params);
00954 SG_FREE(threads);
00955 }
00956 }
00957 #endif
00958
00959 SG_FREE(vec);
00960
00961
00962
00963 create_empty_tries();
00964 }
00965
00966 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max)
00967 {
00968 if (type==E_EXTERNAL && max!=0)
00969 return false;
00970
00971 max_mismatch=max;
00972
00973 if (lhs!=NULL && rhs!=NULL)
00974 return init(lhs, rhs);
00975 else
00976 return true;
00977 }
00978
00979 void CWeightedDegreeStringKernel::init()
00980 {
00981 weights=NULL;
00982 weights_degree=0;
00983 weights_length=0;
00984
00985 position_weights=NULL;
00986 position_weights_len=0;
00987
00988 weights_buffer=NULL;
00989 mkl_stepsize=1;
00990 degree=1;
00991 length=0;
00992
00993 max_mismatch=0;
00994 seq_length=0;
00995
00996 block_weights=NULL;
00997 block_computation=true;
00998 type=E_WD;
00999 which_degree=-1;
01000 tries=NULL;
01001
01002 tree_initialized=false;
01003 alphabet=NULL;
01004
01005 lhs=NULL;
01006 rhs=NULL;
01007
01008 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
01009
01010 set_normalizer(new CFirstElementKernelNormalizer());
01011
01012 m_parameters->add_matrix(&weights, &weights_degree, &weights_length,
01013 "weights", "WD Kernel weights.");
01014 m_parameters->add_vector(&position_weights, &position_weights_len,
01015 "position_weights",
01016 "Weights per position.");
01017 SG_ADD(&mkl_stepsize, "mkl_stepsize", "MKL step size.", MS_AVAILABLE);
01018 SG_ADD(°ree, "degree", "Order of WD kernel.", MS_AVAILABLE);
01019 SG_ADD(&max_mismatch, "max_mismatch",
01020 "Number of allowed mismatches.", MS_AVAILABLE);
01021 SG_ADD(&block_computation, "block_computation",
01022 "If block computation shall be used.", MS_NOT_AVAILABLE);
01023 SG_ADD((machine_int_t*) &type, "type",
01024 "WeightedDegree kernel type.", MS_AVAILABLE);
01025 SG_ADD(&which_degree, "which_degree",
01026 "The selected degree. All degrees are used by default (for value -1).",
01027 MS_AVAILABLE);
01028 SG_ADD((CSGObject**) &alphabet, "alphabet",
01029 "Alphabet of Features.", MS_NOT_AVAILABLE);
01030 }