00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/lib/common.h>
00012 #include <shogun/io/SGIO.h>
00013
00014 #include <shogun/base/Parameter.h>
00015
00016 #include <shogun/kernel/CommWordStringKernel.h>
00017 #include <shogun/kernel/SqrtDiagKernelNormalizer.h>
00018 #include <shogun/features/StringFeatures.h>
00019
00020 using namespace shogun;
00021
00022 CCommWordStringKernel::CCommWordStringKernel()
00023 : CStringKernel<uint16_t>()
00024 {
00025 init();
00026 }
00027
00028 CCommWordStringKernel::CCommWordStringKernel(int32_t size, bool s)
00029 : CStringKernel<uint16_t>(size)
00030 {
00031 init();
00032 use_sign=s;
00033 }
00034
00035 CCommWordStringKernel::CCommWordStringKernel(
00036 CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r,
00037 bool s, int32_t size) : CStringKernel<uint16_t>(size)
00038 {
00039 init();
00040 use_sign=s;
00041
00042 init(l,r);
00043 }
00044
00045
00046 bool CCommWordStringKernel::init_dictionary(int32_t size)
00047 {
00048 dictionary_size= size;
00049 SG_FREE(dictionary_weights);
00050 dictionary_weights=SG_MALLOC(float64_t, size);
00051 SG_DEBUG( "using dictionary of %d words\n", size);
00052 clear_normal();
00053
00054 return dictionary_weights!=NULL;
00055 }
00056
00057 CCommWordStringKernel::~CCommWordStringKernel()
00058 {
00059 cleanup();
00060
00061 SG_FREE(dictionary_weights);
00062 SG_FREE(dict_diagonal_optimization);
00063 }
00064
00065 bool CCommWordStringKernel::init(CFeatures* l, CFeatures* r)
00066 {
00067 CStringKernel<uint16_t>::init(l,r);
00068
00069 if (use_dict_diagonal_optimization)
00070 {
00071 SG_FREE(dict_diagonal_optimization);
00072 dict_diagonal_optimization=SG_MALLOC(int32_t, int32_t(((CStringFeatures<uint16_t>*)l)->get_num_symbols()));
00073 ASSERT(((CStringFeatures<uint16_t>*)l)->get_num_symbols() == ((CStringFeatures<uint16_t>*)r)->get_num_symbols()) ;
00074 }
00075
00076 return init_normalizer();
00077 }
00078
00079 void CCommWordStringKernel::cleanup()
00080 {
00081 delete_optimization();
00082 CKernel::cleanup();
00083 }
00084
00085 float64_t CCommWordStringKernel::compute_diag(int32_t idx_a)
00086 {
00087 int32_t alen;
00088 CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs;
00089 CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs;
00090
00091 bool free_av;
00092 uint16_t* av=l->get_feature_vector(idx_a, alen, free_av);
00093
00094 float64_t result=0.0 ;
00095 ASSERT(l==r);
00096 ASSERT(sizeof(uint16_t)<=sizeof(float64_t));
00097 ASSERT((1<<(sizeof(uint16_t)*8)) > alen);
00098
00099 int32_t num_symbols=(int32_t) l->get_num_symbols();
00100 ASSERT(num_symbols<=dictionary_size);
00101
00102 int32_t* dic = dict_diagonal_optimization;
00103 memset(dic, 0, num_symbols*sizeof(int32_t));
00104
00105 for (int32_t i=0; i<alen; i++)
00106 dic[av[i]]++;
00107
00108 if (use_sign)
00109 {
00110 for (int32_t i=0; i<(int32_t) l->get_num_symbols(); i++)
00111 {
00112 if (dic[i]!=0)
00113 result++;
00114 }
00115 }
00116 else
00117 {
00118 for (int32_t i=0; i<num_symbols; i++)
00119 {
00120 if (dic[i]!=0)
00121 result+=dic[i]*dic[i];
00122 }
00123 }
00124 l->free_feature_vector(av, idx_a, free_av);
00125
00126 return result;
00127 }
00128
00129 float64_t CCommWordStringKernel::compute_helper(
00130 int32_t idx_a, int32_t idx_b, bool do_sort)
00131 {
00132 int32_t alen, blen;
00133 bool free_av, free_bv;
00134
00135 CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs;
00136 CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs;
00137
00138 uint16_t* av=l->get_feature_vector(idx_a, alen, free_av);
00139 uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bv);
00140
00141 uint16_t* avec=av;
00142 uint16_t* bvec=bv;
00143
00144 if (do_sort)
00145 {
00146 if (alen>0)
00147 {
00148 avec=SG_MALLOC(uint16_t, alen);
00149 memcpy(avec, av, sizeof(uint16_t)*alen);
00150 CMath::radix_sort(avec, alen);
00151 }
00152 else
00153 avec=NULL;
00154
00155 if (blen>0)
00156 {
00157 bvec=SG_MALLOC(uint16_t, blen);
00158 memcpy(bvec, bv, sizeof(uint16_t)*blen);
00159 CMath::radix_sort(bvec, blen);
00160 }
00161 else
00162 bvec=NULL;
00163 }
00164 else
00165 {
00166 if ( (l->get_num_preprocessors() != l->get_num_preprocessed()) ||
00167 (r->get_num_preprocessors() != r->get_num_preprocessed()))
00168 {
00169 SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
00170 " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preprocessors(),
00171 r->get_num_preprocessed(), r->get_num_preprocessors());
00172 }
00173 }
00174
00175 float64_t result=0;
00176
00177 int32_t left_idx=0;
00178 int32_t right_idx=0;
00179
00180 if (use_sign)
00181 {
00182 while (left_idx < alen && right_idx < blen)
00183 {
00184 if (avec[left_idx]==bvec[right_idx])
00185 {
00186 uint16_t sym=avec[left_idx];
00187
00188 while (left_idx< alen && avec[left_idx]==sym)
00189 left_idx++;
00190
00191 while (right_idx< blen && bvec[right_idx]==sym)
00192 right_idx++;
00193
00194 result++;
00195 }
00196 else if (avec[left_idx]<bvec[right_idx])
00197 left_idx++;
00198 else
00199 right_idx++;
00200 }
00201 }
00202 else
00203 {
00204 while (left_idx < alen && right_idx < blen)
00205 {
00206 if (avec[left_idx]==bvec[right_idx])
00207 {
00208 int32_t old_left_idx=left_idx;
00209 int32_t old_right_idx=right_idx;
00210
00211 uint16_t sym=avec[left_idx];
00212
00213 while (left_idx< alen && avec[left_idx]==sym)
00214 left_idx++;
00215
00216 while (right_idx< blen && bvec[right_idx]==sym)
00217 right_idx++;
00218
00219 result+=((float64_t) (left_idx-old_left_idx))*
00220 ((float64_t) (right_idx-old_right_idx));
00221 }
00222 else if (avec[left_idx]<bvec[right_idx])
00223 left_idx++;
00224 else
00225 right_idx++;
00226 }
00227 }
00228
00229 if (do_sort)
00230 {
00231 SG_FREE(avec);
00232 SG_FREE(bvec);
00233 }
00234
00235 l->free_feature_vector(av, idx_a, free_av);
00236 r->free_feature_vector(bv, idx_b, free_bv);
00237
00238 return result;
00239 }
00240
00241 void CCommWordStringKernel::add_to_normal(int32_t vec_idx, float64_t weight)
00242 {
00243 int32_t len=-1;
00244 bool free_vec;
00245 uint16_t* vec=((CStringFeatures<uint16_t>*) lhs)->
00246 get_feature_vector(vec_idx, len, free_vec);
00247
00248 if (len>0)
00249 {
00250 int32_t j, last_j=0;
00251 if (use_sign)
00252 {
00253 for (j=1; j<len; j++)
00254 {
00255 if (vec[j]==vec[j-1])
00256 continue;
00257
00258 dictionary_weights[(int32_t) vec[j-1]]+=normalizer->
00259 normalize_lhs(weight, vec_idx);
00260 }
00261
00262 dictionary_weights[(int32_t) vec[len-1]]+=normalizer->
00263 normalize_lhs(weight, vec_idx);
00264 }
00265 else
00266 {
00267 for (j=1; j<len; j++)
00268 {
00269 if (vec[j]==vec[j-1])
00270 continue;
00271
00272 dictionary_weights[(int32_t) vec[j-1]]+=normalizer->
00273 normalize_lhs(weight*(j-last_j), vec_idx);
00274 last_j = j;
00275 }
00276
00277 dictionary_weights[(int32_t) vec[len-1]]+=normalizer->
00278 normalize_lhs(weight*(len-last_j), vec_idx);
00279 }
00280 set_is_initialized(true);
00281 }
00282
00283 ((CStringFeatures<uint16_t>*) lhs)->free_feature_vector(vec, vec_idx, free_vec);
00284 }
00285
00286 void CCommWordStringKernel::clear_normal()
00287 {
00288 memset(dictionary_weights, 0, dictionary_size*sizeof(float64_t));
00289 set_is_initialized(false);
00290 }
00291
00292 bool CCommWordStringKernel::init_optimization(
00293 int32_t count, int32_t* IDX, float64_t* weights)
00294 {
00295 delete_optimization();
00296
00297 if (count<=0)
00298 {
00299 set_is_initialized(true);
00300 SG_DEBUG("empty set of SVs\n");
00301 return true;
00302 }
00303
00304 SG_DEBUG("initializing CCommWordStringKernel optimization\n");
00305
00306 for (int32_t i=0; i<count; i++)
00307 {
00308 if ( (i % (count/10+1)) == 0)
00309 SG_PROGRESS(i, 0, count);
00310
00311 add_to_normal(IDX[i], weights[i]);
00312 }
00313
00314 set_is_initialized(true);
00315 return true;
00316 }
00317
00318 bool CCommWordStringKernel::delete_optimization()
00319 {
00320 SG_DEBUG( "deleting CCommWordStringKernel optimization\n");
00321
00322 clear_normal();
00323 return true;
00324 }
00325
00326 float64_t CCommWordStringKernel::compute_optimized(int32_t i)
00327 {
00328 if (!get_is_initialized())
00329 {
00330 SG_ERROR( "CCommWordStringKernel optimization not initialized\n");
00331 return 0 ;
00332 }
00333
00334 float64_t result = 0;
00335 int32_t len = -1;
00336 bool free_vec;
00337 uint16_t* vec=((CStringFeatures<uint16_t>*) rhs)->
00338 get_feature_vector(i, len, free_vec);
00339
00340 int32_t j, last_j=0;
00341 if (vec && len>0)
00342 {
00343 if (use_sign)
00344 {
00345 for (j=1; j<len; j++)
00346 {
00347 if (vec[j]==vec[j-1])
00348 continue;
00349
00350 result += dictionary_weights[(int32_t) vec[j-1]];
00351 }
00352
00353 result += dictionary_weights[(int32_t) vec[len-1]];
00354 }
00355 else
00356 {
00357 for (j=1; j<len; j++)
00358 {
00359 if (vec[j]==vec[j-1])
00360 continue;
00361
00362 result += dictionary_weights[(int32_t) vec[j-1]]*(j-last_j);
00363 last_j = j;
00364 }
00365
00366 result += dictionary_weights[(int32_t) vec[len-1]]*(len-last_j);
00367 }
00368
00369 result=normalizer->normalize_rhs(result, i);
00370 }
00371 ((CStringFeatures<uint16_t>*) rhs)->free_feature_vector(vec, i, free_vec);
00372 return result;
00373 }
00374
00375 float64_t* CCommWordStringKernel::compute_scoring(
00376 int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target,
00377 int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init)
00378 {
00379 ASSERT(lhs);
00380 CStringFeatures<uint16_t>* str=((CStringFeatures<uint16_t>*) lhs);
00381 num_feat=1;
00382 CAlphabet* alpha=str->get_alphabet();
00383 ASSERT(alpha);
00384 int32_t num_bits=alpha->get_num_bits();
00385 int32_t order=str->get_order();
00386 ASSERT(max_degree<=order);
00387
00388 int32_t num_words=(int32_t) str->get_original_num_symbols();
00389 int32_t offset=0;
00390
00391 num_sym=0;
00392
00393 for (int32_t i=0; i<order; i++)
00394 num_sym+=CMath::pow((int32_t) num_words,i+1);
00395
00396 SG_DEBUG("num_words:%d, order:%d, len:%d sz:%d (len*sz:%d)\n", num_words, order,
00397 num_feat, num_sym, num_feat*num_sym);
00398
00399 if (!target)
00400 target=SG_MALLOC(float64_t, num_feat*num_sym);
00401 memset(target, 0, num_feat*num_sym*sizeof(float64_t));
00402
00403 if (do_init)
00404 init_optimization(num_suppvec, IDX, alphas);
00405
00406 uint32_t kmer_mask=0;
00407 uint32_t words=CMath::pow((int32_t) num_words,(int32_t) order);
00408
00409 for (int32_t o=0; o<max_degree; o++)
00410 {
00411 float64_t* contrib=&target[offset];
00412 offset+=CMath::pow((int32_t) num_words,(int32_t) o+1);
00413
00414 kmer_mask=(kmer_mask<<(num_bits)) | str->get_masked_symbols(0xffff, 1);
00415
00416 for (int32_t p=-o; p<order; p++)
00417 {
00418 int32_t o_sym=0, m_sym=0, il=0,ir=0, jl=0;
00419 uint32_t imer_mask=kmer_mask;
00420 uint32_t jmer_mask=kmer_mask;
00421
00422 if (p<0)
00423 {
00424 il=-p;
00425 m_sym=order-o-p-1;
00426 o_sym=-p;
00427 }
00428 else if (p<order-o)
00429 {
00430 ir=p;
00431 m_sym=order-o-1;
00432 }
00433 else
00434 {
00435 ir=p;
00436 m_sym=p;
00437 o_sym=p-order+o+1;
00438 jl=order-ir;
00439 imer_mask=(kmer_mask>>(num_bits*o_sym));
00440 jmer_mask=(kmer_mask>>(num_bits*jl));
00441 }
00442
00443 float64_t marginalizer=
00444 1.0/CMath::pow((int32_t) num_words,(int32_t) m_sym);
00445
00446 for (uint32_t i=0; i<words; i++)
00447 {
00448 uint16_t x= ((i << (num_bits*il)) >> (num_bits*ir)) & imer_mask;
00449
00450 if (p>=0 && p<order-o)
00451 {
00452
00453 #ifdef DEBUG_COMMSCORING
00454 SG_PRINT("o=%d/%d p=%d/%d i=0x%x x=0x%x imask=%x jmask=%x kmask=%x il=%d ir=%d marg=%g o_sym:%d m_sym:%d weight(",
00455 o,order, p,order, i, x, imer_mask, jmer_mask, kmer_mask, il, ir, marginalizer, o_sym, m_sym);
00456
00457 SG_PRINT("%c%c%c%c/%c%c%c%c)+=%g/%g\n",
00458 alpha->remap_to_char((x>>(3*num_bits))&0x03), alpha->remap_to_char((x>>(2*num_bits))&0x03),
00459 alpha->remap_to_char((x>>num_bits)&0x03), alpha->remap_to_char(x&0x03),
00460 alpha->remap_to_char((i>>(3*num_bits))&0x03), alpha->remap_to_char((i>>(2*num_bits))&0x03),
00461 alpha->remap_to_char((i>>(1*num_bits))&0x03), alpha->remap_to_char(i&0x03),
00462 dictionary_weights[i]*marginalizer, dictionary_weights[i]);
00463 #endif
00464 contrib[x]+=dictionary_weights[i]*marginalizer;
00465 }
00466 else
00467 {
00468 for (uint32_t j=0; j< (uint32_t) CMath::pow((int32_t) num_words, (int32_t) o_sym); j++)
00469 {
00470 uint32_t c=x | ((j & jmer_mask) << (num_bits*jl));
00471 #ifdef DEBUG_COMMSCORING
00472
00473 SG_PRINT("o=%d/%d p=%d/%d i=0x%x j=0x%x x=0x%x c=0x%x imask=%x jmask=%x kmask=%x il=%d ir=%d jl=%d marg=%g o_sym:%d m_sym:%d weight(",
00474 o,order, p,order, i, j, x, c, imer_mask, jmer_mask, kmer_mask, il, ir, jl, marginalizer, o_sym, m_sym);
00475 SG_PRINT("%c%c%c%c/%c%c%c%c)+=%g/%g\n",
00476 alpha->remap_to_char((c>>(3*num_bits))&0x03), alpha->remap_to_char((c>>(2*num_bits))&0x03),
00477 alpha->remap_to_char((c>>num_bits)&0x03), alpha->remap_to_char(c&0x03),
00478 alpha->remap_to_char((i>>(3*num_bits))&0x03), alpha->remap_to_char((i>>(2*num_bits))&0x03),
00479 alpha->remap_to_char((i>>(1*num_bits))&0x03), alpha->remap_to_char(i&0x03),
00480 dictionary_weights[i]*marginalizer, dictionary_weights[i]);
00481 #endif
00482 contrib[c]+=dictionary_weights[i]*marginalizer;
00483 }
00484 }
00485 }
00486 }
00487 }
00488
00489 for (int32_t i=1; i<num_feat; i++)
00490 memcpy(&target[num_sym*i], target, num_sym*sizeof(float64_t));
00491
00492 SG_UNREF(alpha);
00493
00494 return target;
00495 }
00496
00497
00498 char* CCommWordStringKernel::compute_consensus(
00499 int32_t &result_len, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
00500 {
00501 ASSERT(lhs);
00502 ASSERT(IDX);
00503 ASSERT(alphas);
00504
00505 CStringFeatures<uint16_t>* str=((CStringFeatures<uint16_t>*) lhs);
00506 int32_t num_words=(int32_t) str->get_num_symbols();
00507 int32_t num_feat=str->get_max_vector_length();
00508 int64_t total_len=((int64_t) num_feat) * num_words;
00509 CAlphabet* alpha=((CStringFeatures<uint16_t>*) lhs)->get_alphabet();
00510 ASSERT(alpha);
00511 int32_t num_bits=alpha->get_num_bits();
00512 int32_t order=str->get_order();
00513 int32_t max_idx=-1;
00514 float64_t max_score=0;
00515 result_len=num_feat+order-1;
00516
00517
00518 init_optimization(num_suppvec, IDX, alphas);
00519
00520 char* result=SG_MALLOC(char, result_len);
00521 int32_t* bt=SG_MALLOC(int32_t, total_len);
00522 float64_t* score=SG_MALLOC(float64_t, total_len);
00523
00524 for (int64_t i=0; i<total_len; i++)
00525 {
00526 bt[i]=-1;
00527 score[i]=0;
00528 }
00529
00530 for (int32_t t=0; t<num_words; t++)
00531 score[t]=dictionary_weights[t];
00532
00533
00534 for (int32_t i=1; i<num_feat; i++)
00535 {
00536 for (int32_t t1=0; t1<num_words; t1++)
00537 {
00538 max_idx=-1;
00539 max_score=0;
00540
00541
00542
00543
00544
00545
00546
00547
00548
00549
00550 uint16_t suffix=(uint16_t) t1 >> num_bits;
00551
00552 for (int32_t sym=0; sym<str->get_original_num_symbols(); sym++)
00553 {
00554 uint16_t t=suffix | sym << (num_bits*(order-1));
00555
00556
00557
00558
00559 float64_t sc=score[num_words*(i-1) + t]+dictionary_weights[t1];
00560 if (sc > max_score || max_idx==-1)
00561 {
00562 max_idx=t;
00563 max_score=sc;
00564 }
00565 }
00566 ASSERT(max_idx!=-1);
00567
00568 score[num_words*i + t1]=max_score;
00569 bt[num_words*i + t1]=max_idx;
00570 }
00571 }
00572
00573
00574 max_idx=0;
00575 max_score=score[num_words*(num_feat-1) + 0];
00576 for (int32_t t=1; t<num_words; t++)
00577 {
00578 float64_t sc=score[num_words*(num_feat-1) + t];
00579 if (sc>max_score)
00580 {
00581 max_idx=t;
00582 max_score=sc;
00583 }
00584 }
00585
00586 SG_PRINT("max_idx:%i, max_score:%f\n", max_idx, max_score);
00587
00588 for (int32_t i=result_len-1; i>=num_feat; i--)
00589 result[i]=alpha->remap_to_char( (uint8_t) str->get_masked_symbols( (uint16_t) max_idx >> (num_bits*(result_len-1-i)), 1) );
00590
00591 for (int32_t i=num_feat-1; i>=0; i--)
00592 {
00593 result[i]=alpha->remap_to_char( (uint8_t) str->get_masked_symbols( (uint16_t) max_idx >> (num_bits*(order-1)), 1) );
00594 max_idx=bt[num_words*i + max_idx];
00595 }
00596
00597 SG_FREE(bt);
00598 SG_FREE(score);
00599 SG_UNREF(alpha);
00600 return result;
00601 }
00602
00603 void CCommWordStringKernel::init()
00604 {
00605 dictionary_size=0;
00606 dictionary_weights=NULL;
00607
00608 use_sign=false;
00609 use_dict_diagonal_optimization=false;
00610 dict_diagonal_optimization=NULL;
00611
00612 properties |= KP_LINADD;
00613 init_dictionary(1<<(sizeof(uint16_t)*8));
00614 set_normalizer(new CSqrtDiagKernelNormalizer(use_dict_diagonal_optimization));
00615
00616 m_parameters->add_vector(&dictionary_weights, &dictionary_size, "dictionary_weights",
00617 "Dictionary for applying kernel.");
00618 m_parameters->add(&use_sign, "use_sign",
00619 "If signum(counts) is used instead of counts.");
00620 m_parameters->add(&use_dict_diagonal_optimization, "use_dict_diagonal_optimization",
00621 "If K(x,x) is computed potentially more efficiently.");
00622 }