00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/lib/common.h>
00013 #include <shogun/io/SGIO.h>
00014 #include <shogun/lib/Signal.h>
00015 #include <shogun/lib/Trie.h>
00016 #include <shogun/base/Parameter.h>
00017 #include <shogun/base/Parallel.h>
00018
00019 #include <shogun/kernel/WeightedDegreeStringKernel.h>
00020 #include <shogun/kernel/FirstElementKernelNormalizer.h>
00021 #include <shogun/features/Features.h>
00022 #include <shogun/features/StringFeatures.h>
00023
00024 #ifndef WIN32
00025 #include <pthread.h>
00026 #endif
00027
00028 using namespace shogun;
00029
00030 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00031 struct S_THREAD_PARAM
00032 {
00033
00034 int32_t* vec;
00035 float64_t* result;
00036 float64_t* weights;
00037 CWeightedDegreeStringKernel* kernel;
00038 CTrie<DNATrie>* tries;
00039 float64_t factor;
00040 int32_t j;
00041 int32_t start;
00042 int32_t end;
00043 int32_t length;
00044 int32_t* vec_idx;
00045 };
00046 #endif // DOXYGEN_SHOULD_SKIP_THIS
00047
00048 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel ()
00049 : CStringKernel<char>()
00050 {
00051 init();
00052 }
00053
00054
00055 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00056 int32_t d, EWDKernType t)
00057 : CStringKernel<char>()
00058 {
00059 init();
00060
00061 degree=d;
00062 type=t;
00063
00064 if (type!=E_EXTERNAL)
00065 set_wd_weights_by_type(type);
00066 }
00067
00068 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00069 float64_t *w, int32_t d)
00070 : CStringKernel<char>(10)
00071 {
00072 init();
00073
00074 type=E_EXTERNAL;
00075 degree=d;
00076
00077 weights=SG_MALLOC(float64_t, degree*(1+max_mismatch));
00078 weights_degree=degree;
00079 weights_length=(1+max_mismatch);
00080
00081 for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00082 weights[i]=w[i];
00083 }
00084
00085 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00086 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t d)
00087 : CStringKernel<char>(10)
00088 {
00089 init();
00090 degree=d;
00091 type=E_WD;
00092 set_wd_weights_by_type(type);
00093 set_normalizer(new CFirstElementKernelNormalizer());
00094 init(l, r);
00095 }
00096
00097 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00098 {
00099 cleanup();
00100
00101 SG_FREE(weights);
00102 weights=NULL;
00103 weights_degree=0;
00104 weights_length=0;
00105
00106 SG_FREE(block_weights);
00107 block_weights=NULL;
00108
00109 SG_FREE(position_weights);
00110 position_weights=NULL;
00111
00112 SG_FREE(weights_buffer);
00113 weights_buffer=NULL;
00114 }
00115
00116
00117 void CWeightedDegreeStringKernel::remove_lhs()
00118 {
00119 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00120 delete_optimization();
00121
00122 if (tries!=NULL)
00123 tries->destroy();
00124
00125 CKernel::remove_lhs();
00126 }
00127
00128 void CWeightedDegreeStringKernel::create_empty_tries()
00129 {
00130 ASSERT(lhs);
00131
00132 seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length();
00133
00134 if (tries!=NULL)
00135 {
00136 tries->destroy() ;
00137 tries->create(seq_length, max_mismatch==0) ;
00138 }
00139 }
00140
00141 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00142 {
00143 int32_t lhs_changed=(lhs!=l);
00144 int32_t rhs_changed=(rhs!=r);
00145
00146 CStringKernel<char>::init(l,r);
00147
00148 SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00149 SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00150
00151 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00152 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00153
00154 int32_t len=sf_l->get_max_vector_length();
00155 if (lhs_changed && !sf_l->have_same_length(len))
00156 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00157
00158 if (rhs_changed && !sf_r->have_same_length(len))
00159 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00160
00161 SG_UNREF(alphabet);
00162 alphabet=sf_l->get_alphabet();
00163 CAlphabet* ralphabet=sf_r->get_alphabet();
00164
00165 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00166 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00167
00168 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00169 SG_UNREF(ralphabet);
00170
00171 if (tries!=NULL) {
00172 tries->delete_trees(max_mismatch==0);
00173 SG_UNREF(tries);
00174 }
00175 tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00176 create_empty_tries();
00177
00178 init_block_weights();
00179
00180 return init_normalizer();
00181 }
00182
00183 void CWeightedDegreeStringKernel::cleanup()
00184 {
00185 SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00186 delete_optimization();
00187
00188 SG_FREE(block_weights);
00189 block_weights=NULL;
00190
00191 if (tries!=NULL)
00192 {
00193 tries->destroy();
00194 SG_UNREF(tries);
00195 tries=NULL;
00196 }
00197
00198 seq_length=0;
00199 tree_initialized = false;
00200
00201 SG_UNREF(alphabet);
00202 alphabet=NULL;
00203
00204 CKernel::cleanup();
00205 }
00206
00207 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num)
00208 {
00209 if (tree_num<0)
00210 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00211
00212 delete_optimization();
00213
00214 if (tree_num<0)
00215 SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00216
00217 for (int32_t i=0; i<count; i++)
00218 {
00219 if (tree_num<0)
00220 {
00221 if ( (i % (count/10+1)) == 0)
00222 SG_PROGRESS(i, 0, count);
00223
00224 if (max_mismatch==0)
00225 add_example_to_tree(IDX[i], alphas[i]) ;
00226 else
00227 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00228
00229
00230 }
00231 else
00232 {
00233 if (max_mismatch==0)
00234 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00235 else
00236 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00237 }
00238 }
00239
00240 if (tree_num<0)
00241 SG_DONE();
00242
00243
00244
00245 set_is_initialized(true) ;
00246 return true ;
00247 }
00248
00249 bool CWeightedDegreeStringKernel::delete_optimization()
00250 {
00251 if (get_is_initialized())
00252 {
00253 if (tries!=NULL)
00254 tries->delete_trees(max_mismatch==0);
00255 set_is_initialized(false);
00256 return true;
00257 }
00258
00259 return false;
00260 }
00261
00262
00263 float64_t CWeightedDegreeStringKernel::compute_with_mismatch(
00264 char* avec, int32_t alen, char* bvec, int32_t blen)
00265 {
00266 float64_t sum = 0.0;
00267
00268 for (int32_t i=0; i<alen; i++)
00269 {
00270 float64_t sumi = 0.0;
00271 int32_t mismatches=0;
00272
00273 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00274 {
00275 if (avec[i+j]!=bvec[i+j])
00276 {
00277 mismatches++ ;
00278 if (mismatches>max_mismatch)
00279 break ;
00280 } ;
00281 sumi += weights[j+degree*mismatches];
00282 }
00283 if (position_weights!=NULL)
00284 sum+=position_weights[i]*sumi ;
00285 else
00286 sum+=sumi ;
00287 }
00288 return sum ;
00289 }
00290
00291 float64_t CWeightedDegreeStringKernel::compute_using_block(
00292 char* avec, int32_t alen, char* bvec, int32_t blen)
00293 {
00294 ASSERT(alen==blen);
00295
00296 float64_t sum=0;
00297 int32_t match_len=-1;
00298
00299 for (int32_t i=0; i<alen; i++)
00300 {
00301 if (avec[i]==bvec[i])
00302 match_len++;
00303 else
00304 {
00305 if (match_len>=0)
00306 sum+=block_weights[match_len];
00307 match_len=-1;
00308 }
00309 }
00310
00311 if (match_len>=0)
00312 sum+=block_weights[match_len];
00313
00314 return sum;
00315 }
00316
00317 float64_t CWeightedDegreeStringKernel::compute_without_mismatch(
00318 char* avec, int32_t alen, char* bvec, int32_t blen)
00319 {
00320 float64_t sum = 0.0;
00321
00322 for (int32_t i=0; i<alen; i++)
00323 {
00324 float64_t sumi = 0.0;
00325
00326 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00327 {
00328 if (avec[i+j]!=bvec[i+j])
00329 break ;
00330 sumi += weights[j];
00331 }
00332 if (position_weights!=NULL)
00333 sum+=position_weights[i]*sumi ;
00334 else
00335 sum+=sumi ;
00336 }
00337 return sum ;
00338 }
00339
00340 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix(
00341 char* avec, int32_t alen, char* bvec, int32_t blen)
00342 {
00343 float64_t sum = 0.0;
00344
00345 for (int32_t i=0; i<alen; i++)
00346 {
00347 float64_t sumi=0.0;
00348 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00349 {
00350 if (avec[i+j]!=bvec[i+j])
00351 break;
00352 sumi += weights[i*degree+j];
00353 }
00354 if (position_weights!=NULL)
00355 sum += position_weights[i]*sumi ;
00356 else
00357 sum += sumi ;
00358 }
00359
00360 return sum ;
00361 }
00362
00363
00364 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b)
00365 {
00366 int32_t alen, blen;
00367 bool free_avec, free_bvec;
00368 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00369 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00370 float64_t result=0;
00371
00372 if (max_mismatch==0 && length==0 && block_computation)
00373 result=compute_using_block(avec, alen, bvec, blen);
00374 else
00375 {
00376 if (max_mismatch>0)
00377 result=compute_with_mismatch(avec, alen, bvec, blen);
00378 else if (length==0)
00379 result=compute_without_mismatch(avec, alen, bvec, blen);
00380 else
00381 result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00382 }
00383 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00384 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00385
00386 return result;
00387 }
00388
00389
00390 void CWeightedDegreeStringKernel::add_example_to_tree(
00391 int32_t idx, float64_t alpha)
00392 {
00393 ASSERT(alphabet);
00394 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00395
00396 int32_t len=0;
00397 bool free_vec;
00398 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00399 ASSERT(max_mismatch==0);
00400 int32_t *vec=SG_MALLOC(int32_t, len);
00401
00402 for (int32_t i=0; i<len; i++)
00403 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00404 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00405
00406 if (length == 0 || max_mismatch > 0)
00407 {
00408 for (int32_t i=0; i<len; i++)
00409 {
00410 float64_t alpha_pw=alpha;
00411
00412
00413 if (alpha_pw==0.0)
00414 continue;
00415 ASSERT(tries);
00416 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00417 }
00418 }
00419 else
00420 {
00421 for (int32_t i=0; i<len; i++)
00422 {
00423 float64_t alpha_pw=alpha;
00424
00425
00426 if (alpha_pw==0.0)
00427 continue ;
00428 ASSERT(tries);
00429 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00430 }
00431 }
00432 SG_FREE(vec);
00433 tree_initialized=true ;
00434 }
00435
00436 void CWeightedDegreeStringKernel::add_example_to_single_tree(
00437 int32_t idx, float64_t alpha, int32_t tree_num)
00438 {
00439 ASSERT(alphabet);
00440 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00441
00442 int32_t len;
00443 bool free_vec;
00444 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00445 ASSERT(max_mismatch==0);
00446 int32_t *vec = SG_MALLOC(int32_t, len);
00447
00448 for (int32_t i=tree_num; i<tree_num+degree && i<len; i++)
00449 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00450 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00451
00452
00453 ASSERT(tries);
00454 if (alpha!=0.0)
00455 tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0));
00456
00457 SG_FREE(vec);
00458 tree_initialized=true ;
00459 }
00460
00461 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha)
00462 {
00463 ASSERT(tries);
00464 ASSERT(alphabet);
00465 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00466
00467 int32_t len ;
00468 bool free_vec;
00469 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00470
00471 int32_t *vec = SG_MALLOC(int32_t, len);
00472
00473 for (int32_t i=0; i<len; i++)
00474 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00475 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00476
00477 for (int32_t i=0; i<len; i++)
00478 {
00479 if (alpha!=0.0)
00480 tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights);
00481 }
00482
00483 SG_FREE(vec);
00484 tree_initialized=true ;
00485 }
00486
00487 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(
00488 int32_t idx, float64_t alpha, int32_t tree_num)
00489 {
00490 ASSERT(tries);
00491 ASSERT(alphabet);
00492 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00493
00494 int32_t len=0;
00495 bool free_vec;
00496 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00497 int32_t *vec=SG_MALLOC(int32_t, len);
00498
00499 for (int32_t i=tree_num; i<len && i<tree_num+degree; i++)
00500 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00501 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00502
00503 if (alpha!=0.0)
00504 {
00505 tries->add_example_to_tree_mismatch_recursion(
00506 NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num,
00507 0, 0, max_mismatch, weights);
00508 }
00509
00510 SG_FREE(vec);
00511 tree_initialized=true;
00512 }
00513
00514
00515 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx)
00516 {
00517 ASSERT(alphabet);
00518 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00519
00520 int32_t len=0;
00521 bool free_vec;
00522 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00523 ASSERT(char_vec && len>0);
00524 int32_t *vec=SG_MALLOC(int32_t, len);
00525
00526 for (int32_t i=0; i<len; i++)
00527 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00528 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00529
00530 float64_t sum=0;
00531 ASSERT(tries);
00532 for (int32_t i=0; i<len; i++)
00533 sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00534
00535 SG_FREE(vec);
00536 return normalizer->normalize_rhs(sum, idx);
00537 }
00538
00539 void CWeightedDegreeStringKernel::compute_by_tree(
00540 int32_t idx, float64_t* LevelContrib)
00541 {
00542 ASSERT(alphabet);
00543 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00544
00545 int32_t len ;
00546 bool free_vec;
00547 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00548
00549 int32_t *vec = SG_MALLOC(int32_t, len);
00550
00551 for (int32_t i=0; i<len; i++)
00552 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00553 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00554
00555 ASSERT(tries);
00556 for (int32_t i=0; i<len; i++)
00557 {
00558 tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00559 normalizer->normalize_rhs(1.0, idx),
00560 mkl_stepsize, weights, (length!=0));
00561 }
00562
00563 SG_FREE(vec);
00564 }
00565
00566 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len)
00567 {
00568 ASSERT(tries);
00569 return tries->compute_abs_weights(len);
00570 }
00571
00572 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00573 {
00574 ASSERT(degree>0);
00575 ASSERT(p_type==E_WD);
00576
00577 SG_FREE(weights);
00578 weights=SG_MALLOC(float64_t, degree);
00579 weights_degree=degree;
00580 weights_length=1;
00581
00582 if (weights)
00583 {
00584 int32_t i;
00585 float64_t sum=0;
00586 for (i=0; i<degree; i++)
00587 {
00588 weights[i]=degree-i;
00589 sum+=weights[i];
00590 }
00591 for (i=0; i<degree; i++)
00592 weights[i]/=sum;
00593
00594 for (i=0; i<degree; i++)
00595 {
00596 for (int32_t j=1; j<=max_mismatch; j++)
00597 {
00598 if (j<i+1)
00599 {
00600 int32_t nk=CMath::nchoosek(i+1, j);
00601 weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00602 }
00603 else
00604 weights[i+j*degree]= 0;
00605 }
00606 }
00607
00608 if (which_degree>=0)
00609 {
00610 ASSERT(which_degree<degree);
00611 for (i=0; i<degree; i++)
00612 {
00613 if (i!=which_degree)
00614 weights[i]=0;
00615 else
00616 weights[i]=1;
00617 }
00618 }
00619 return true;
00620 }
00621 else
00622 return false;
00623 }
00624
00625 bool CWeightedDegreeStringKernel::set_weights(SGMatrix<float64_t> new_weights)
00626 {
00627 float64_t* ws=new_weights.matrix;
00628 int32_t d=new_weights.num_rows;
00629 int32_t len=new_weights.num_cols;
00630
00631 if (d!=degree || len<0)
00632 SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree);
00633
00634 degree=d;
00635 length=len;
00636
00637 if (len <= 0)
00638 len=1;
00639
00640 weights_degree=degree;
00641 weights_length=len+max_mismatch;
00642
00643
00644 SG_DEBUG("Creating weights of size %dx%d\n", weights_degree, weights_length);
00645 int32_t num_weights=weights_degree*weights_length;
00646 SG_FREE(weights);
00647 weights=SG_MALLOC(float64_t, num_weights);
00648
00649 for (int32_t i=0; i<degree*len; i++)
00650 weights[i]=ws[i];
00651
00652 return true;
00653 }
00654
00655 bool CWeightedDegreeStringKernel::set_position_weights(
00656 float64_t* pws, int32_t len)
00657 {
00658 if (len==0)
00659 {
00660 SG_FREE(position_weights);
00661 position_weights=NULL;
00662 ASSERT(tries);
00663 tries->set_position_weights(position_weights);
00664 }
00665
00666 if (seq_length!=len)
00667 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00668
00669 SG_FREE(position_weights);
00670 position_weights=SG_MALLOC(float64_t, len);
00671 position_weights_len=len;
00672 ASSERT(tries);
00673 tries->set_position_weights(position_weights);
00674
00675 if (position_weights)
00676 {
00677 for (int32_t i=0; i<len; i++)
00678 position_weights[i]=pws[i];
00679 return true;
00680 }
00681 else
00682 return false;
00683 }
00684
00685 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00686 {
00687 SG_FREE(block_weights);
00688 block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree));
00689
00690 int32_t k;
00691 float64_t d=degree;
00692
00693 for (k=0; k<degree; k++)
00694 block_weights[k]=
00695 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
00696 for (k=degree; k<seq_length; k++)
00697 block_weights[k]=(-d+3*k+4)/3;
00698
00699 return true;
00700 }
00701
00702 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00703 {
00704 ASSERT(weights);
00705 SG_FREE(block_weights);
00706 block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree));
00707
00708 int32_t i=0;
00709 block_weights[0]=weights[0];
00710 for (i=1; i<CMath::max(seq_length,degree); i++)
00711 block_weights[i]=0;
00712
00713 for (i=1; i<CMath::max(seq_length,degree); i++)
00714 {
00715 block_weights[i]=block_weights[i-1];
00716
00717 float64_t contrib=0;
00718 for (int32_t j=0; j<CMath::min(degree,i+1); j++)
00719 contrib+=weights[j];
00720
00721 block_weights[i]+=contrib;
00722 }
00723 return true;
00724 }
00725
00726 bool CWeightedDegreeStringKernel::init_block_weights_const()
00727 {
00728 SG_FREE(block_weights);
00729 block_weights=SG_MALLOC(float64_t, seq_length);
00730
00731 for (int32_t i=1; i<seq_length+1 ; i++)
00732 block_weights[i-1]=1.0/seq_length;
00733 return true;
00734 }
00735
00736 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00737 {
00738 SG_FREE(block_weights);
00739 block_weights=SG_MALLOC(float64_t, seq_length);
00740
00741 for (int32_t i=1; i<seq_length+1 ; i++)
00742 block_weights[i-1]=degree*i;
00743
00744 return true;
00745 }
00746
00747 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00748 {
00749 SG_FREE(block_weights);
00750 block_weights=SG_MALLOC(float64_t, seq_length);
00751
00752 for (int32_t i=1; i<degree+1 ; i++)
00753 block_weights[i-1]=((float64_t) i)*i;
00754
00755 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00756 block_weights[i-1]=i;
00757
00758 return true;
00759 }
00760
00761 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00762 {
00763 SG_FREE(block_weights);
00764 block_weights=SG_MALLOC(float64_t, seq_length);
00765
00766 for (int32_t i=1; i<degree+1 ; i++)
00767 block_weights[i-1]=((float64_t) i)*i*i;
00768
00769 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00770 block_weights[i-1]=i;
00771 return true;
00772 }
00773
00774 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00775 {
00776 SG_FREE(block_weights);
00777 block_weights=SG_MALLOC(float64_t, seq_length);
00778
00779 for (int32_t i=1; i<degree+1 ; i++)
00780 block_weights[i-1]=exp(((float64_t) i/10.0));
00781
00782 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00783 block_weights[i-1]=i;
00784
00785 return true;
00786 }
00787
00788 bool CWeightedDegreeStringKernel::init_block_weights_log()
00789 {
00790 SG_FREE(block_weights);
00791 block_weights=SG_MALLOC(float64_t, seq_length);
00792
00793 for (int32_t i=1; i<degree+1 ; i++)
00794 block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
00795
00796 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00797 block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
00798
00799 return true;
00800 }
00801
00802 bool CWeightedDegreeStringKernel::init_block_weights()
00803 {
00804 switch (type)
00805 {
00806 case E_WD:
00807 return init_block_weights_from_wd();
00808 case E_EXTERNAL:
00809 return init_block_weights_from_wd_external();
00810 case E_BLOCK_CONST:
00811 return init_block_weights_const();
00812 case E_BLOCK_LINEAR:
00813 return init_block_weights_linear();
00814 case E_BLOCK_SQPOLY:
00815 return init_block_weights_sqpoly();
00816 case E_BLOCK_CUBICPOLY:
00817 return init_block_weights_cubicpoly();
00818 case E_BLOCK_EXP:
00819 return init_block_weights_exp();
00820 case E_BLOCK_LOG:
00821 return init_block_weights_log();
00822 };
00823 return false;
00824 }
00825
00826
00827 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00828 {
00829 S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00830 int32_t j=params->j;
00831 CWeightedDegreeStringKernel* wd=params->kernel;
00832 CTrie<DNATrie>* tries=params->tries;
00833 float64_t* weights=params->weights;
00834 int32_t length=params->length;
00835 int32_t* vec=params->vec;
00836 float64_t* result=params->result;
00837 float64_t factor=params->factor;
00838 int32_t* vec_idx=params->vec_idx;
00839
00840 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
00841 CAlphabet* alpha=wd->alphabet;
00842
00843 for (int32_t i=params->start; i<params->end; i++)
00844 {
00845 int32_t len=0;
00846 bool free_vec;
00847 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
00848 for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00849 vec[k]=alpha->remap_to_bin(char_vec[k]);
00850 rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
00851
00852 ASSERT(tries);
00853
00854 result[i]+=factor*
00855 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
00856 }
00857
00858 SG_UNREF(rhs_feat);
00859
00860 return NULL;
00861 }
00862
00863 void CWeightedDegreeStringKernel::compute_batch(
00864 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
00865 int32_t* IDX, float64_t* alphas, float64_t factor)
00866 {
00867 ASSERT(tries);
00868 ASSERT(alphabet);
00869 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00870 ASSERT(rhs);
00871 ASSERT(num_vec<=rhs->get_num_vectors());
00872 ASSERT(num_vec>0);
00873 ASSERT(vec_idx);
00874 ASSERT(result);
00875 create_empty_tries();
00876
00877 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
00878 ASSERT(num_feat>0);
00879 int32_t num_threads=parallel->get_num_threads();
00880 ASSERT(num_threads>0);
00881 int32_t* vec=SG_MALLOC(int32_t, num_threads*num_feat);
00882
00883 if (num_threads < 2)
00884 {
00885 #ifdef CYGWIN
00886 for (int32_t j=0; j<num_feat; j++)
00887 #else
00888 CSignal::clear_cancel();
00889 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00890 #endif
00891 {
00892 init_optimization(num_suppvec, IDX, alphas, j);
00893 S_THREAD_PARAM params;
00894 params.vec=vec;
00895 params.result=result;
00896 params.weights=weights;
00897 params.kernel=this;
00898 params.tries=tries;
00899 params.factor=factor;
00900 params.j=j;
00901 params.start=0;
00902 params.end=num_vec;
00903 params.length=length;
00904 params.vec_idx=vec_idx;
00905 compute_batch_helper((void*) ¶ms);
00906
00907 SG_PROGRESS(j,0,num_feat);
00908 }
00909 }
00910 #ifndef WIN32
00911 else
00912 {
00913 CSignal::clear_cancel();
00914 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00915 {
00916 init_optimization(num_suppvec, IDX, alphas, j);
00917 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00918 S_THREAD_PARAM* params = SG_MALLOC(S_THREAD_PARAM, num_threads);
00919 int32_t step= num_vec/num_threads;
00920 int32_t t;
00921
00922 for (t=0; t<num_threads-1; t++)
00923 {
00924 params[t].vec=&vec[num_feat*t];
00925 params[t].result=result;
00926 params[t].weights=weights;
00927 params[t].kernel=this;
00928 params[t].tries=tries;
00929 params[t].factor=factor;
00930 params[t].j=j;
00931 params[t].start = t*step;
00932 params[t].end = (t+1)*step;
00933 params[t].length=length;
00934 params[t].vec_idx=vec_idx;
00935 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)¶ms[t]);
00936 }
00937 params[t].vec=&vec[num_feat*t];
00938 params[t].result=result;
00939 params[t].weights=weights;
00940 params[t].kernel=this;
00941 params[t].tries=tries;
00942 params[t].factor=factor;
00943 params[t].j=j;
00944 params[t].start=t*step;
00945 params[t].end=num_vec;
00946 params[t].length=length;
00947 params[t].vec_idx=vec_idx;
00948 compute_batch_helper((void*) ¶ms[t]);
00949
00950 for (t=0; t<num_threads-1; t++)
00951 pthread_join(threads[t], NULL);
00952 SG_PROGRESS(j,0,num_feat);
00953
00954 SG_FREE(params);
00955 SG_FREE(threads);
00956 }
00957 }
00958 #endif
00959
00960 SG_FREE(vec);
00961
00962
00963
00964 create_empty_tries();
00965 }
00966
00967 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max)
00968 {
00969 if (type==E_EXTERNAL && max!=0)
00970 return false;
00971
00972 max_mismatch=max;
00973
00974 if (lhs!=NULL && rhs!=NULL)
00975 return init(lhs, rhs);
00976 else
00977 return true;
00978 }
00979
00980 void CWeightedDegreeStringKernel::init()
00981 {
00982 weights=NULL;
00983 weights_degree=0;
00984 weights_length=0;
00985
00986 position_weights=NULL;
00987 position_weights_len=0;
00988
00989 weights_buffer=NULL;
00990 mkl_stepsize=1;
00991 degree=1;
00992 length=0;
00993
00994 max_mismatch=0;
00995 seq_length=0;
00996
00997 block_weights=NULL;
00998 block_computation=true;
00999 type=E_WD;
01000 which_degree=-1;
01001 tries=NULL;
01002
01003 tree_initialized=false;
01004 alphabet=NULL;
01005
01006 lhs=NULL;
01007 rhs=NULL;
01008
01009 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
01010
01011 set_normalizer(new CFirstElementKernelNormalizer());
01012
01013 m_parameters->add_matrix(&weights, &weights_degree, &weights_length,
01014 "weights", "WD Kernel weights.");
01015 m_parameters->add_vector(&position_weights, &position_weights_len,
01016 "position_weights",
01017 "Weights per position.");
01018 m_parameters->add(&mkl_stepsize, "mkl_stepsize", "MKL step size.");
01019 m_parameters->add(°ree, "degree", "Order of WD kernel.");
01020 m_parameters->add(&max_mismatch, "max_mismatch",
01021 "Number of allowed mismatches.");
01022 m_parameters->add(&block_computation, "block_computation",
01023 "If block computation shall be used.");
01024 m_parameters->add((machine_int_t*) &type, "type",
01025 "WeightedDegree kernel type.");
01026 m_parameters->add(&which_degree, "which_degree",
01027 "Unqueal -1 if just a single degree is selected.");
01028 m_parameters->add((CSGObject**) &alphabet, "alphabet",
01029 "Alphabet of Features.");
01030 }