WeightedDegreeStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/lib/common.h>
00013 #include <shogun/io/SGIO.h>
00014 #include <shogun/lib/Signal.h>
00015 #include <shogun/lib/Trie.h>
00016 #include <shogun/base/Parameter.h>
00017 #include <shogun/base/Parallel.h>
00018 
00019 #include <shogun/kernel/string/WeightedDegreeStringKernel.h>
00020 #include <shogun/kernel/normalizer/FirstElementKernelNormalizer.h>
00021 #include <shogun/features/Features.h>
00022 #include <shogun/features/StringFeatures.h>
00023 
00024 #ifndef WIN32
00025 #include <pthread.h>
00026 #endif
00027 
00028 using namespace shogun;
00029 
00030 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00031 struct S_THREAD_PARAM
00032 {
00033 
00034     int32_t* vec;
00035     float64_t* result;
00036     float64_t* weights;
00037     CWeightedDegreeStringKernel* kernel;
00038     CTrie<DNATrie>* tries;
00039     float64_t factor;
00040     int32_t j;
00041     int32_t start;
00042     int32_t end;
00043     int32_t length;
00044     int32_t* vec_idx;
00045 };
00046 #endif // DOXYGEN_SHOULD_SKIP_THIS
00047 
00048 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel ()
00049 : CStringKernel<char>()
00050 {
00051     init();
00052 }
00053 
00054 
00055 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00056     int32_t d, EWDKernType t)
00057 : CStringKernel<char>()
00058 {
00059     init();
00060 
00061     degree=d;
00062     type=t;
00063 
00064     if (type!=E_EXTERNAL)
00065         set_wd_weights_by_type(type);
00066 }
00067 
00068 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(SGVector<float64_t> w)
00069 : CStringKernel<char>(10)
00070 {
00071     init();
00072 
00073     type=E_EXTERNAL;
00074     degree=w.vlen;
00075 
00076     weights=SG_MALLOC(float64_t, degree*(1+max_mismatch));
00077     weights_degree=degree;
00078     weights_length=(1+max_mismatch);
00079 
00080     for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00081         weights[i]=w.vector[i];
00082 }
00083 
00084 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00085     CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t d)
00086 : CStringKernel<char>(10)
00087 {
00088     init();
00089     degree=d;
00090     type=E_WD;
00091     set_wd_weights_by_type(type);
00092     set_normalizer(new CFirstElementKernelNormalizer());
00093     init(l, r);
00094 }
00095 
00096 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00097 {
00098     cleanup();
00099 
00100     SG_FREE(weights);
00101     weights=NULL;
00102     weights_degree=0;
00103     weights_length=0;
00104 
00105     SG_FREE(block_weights);
00106     block_weights=NULL;
00107 
00108     SG_FREE(position_weights);
00109     position_weights=NULL;
00110 
00111     SG_FREE(weights_buffer);
00112     weights_buffer=NULL;
00113 }
00114 
00115 
00116 void CWeightedDegreeStringKernel::remove_lhs()
00117 {
00118     SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00119     delete_optimization();
00120 
00121     if (tries!=NULL)
00122         tries->destroy();
00123 
00124     CKernel::remove_lhs();
00125 }
00126 
00127 void CWeightedDegreeStringKernel::create_empty_tries()
00128 {
00129     ASSERT(lhs);
00130 
00131     seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length();
00132 
00133     if (tries!=NULL)
00134     {
00135         tries->destroy() ;
00136         tries->create(seq_length, max_mismatch==0) ;
00137     }
00138 }
00139 
00140 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00141 {
00142     int32_t lhs_changed=(lhs!=l);
00143     int32_t rhs_changed=(rhs!=r);
00144 
00145     CStringKernel<char>::init(l,r);
00146 
00147     SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00148     SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00149 
00150     CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00151     CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00152 
00153     int32_t len=sf_l->get_max_vector_length();
00154     if (lhs_changed && !sf_l->have_same_length(len))
00155         SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00156 
00157     if (rhs_changed && !sf_r->have_same_length(len))
00158         SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00159 
00160     SG_UNREF(alphabet);
00161     alphabet=sf_l->get_alphabet();
00162     CAlphabet* ralphabet=sf_r->get_alphabet();
00163 
00164     if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00165         properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00166 
00167     ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00168     SG_UNREF(ralphabet);
00169 
00170     if (tries!=NULL) {
00171         tries->delete_trees(max_mismatch==0);
00172         SG_UNREF(tries);
00173     }
00174     tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00175     create_empty_tries();
00176 
00177     init_block_weights();
00178 
00179     return init_normalizer();
00180 }
00181 
00182 void CWeightedDegreeStringKernel::cleanup()
00183 {
00184     SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00185     delete_optimization();
00186 
00187     SG_FREE(block_weights);
00188     block_weights=NULL;
00189 
00190     if (tries!=NULL)
00191     {
00192         tries->destroy();
00193         SG_UNREF(tries);
00194         tries=NULL;
00195     }
00196 
00197     seq_length=0;
00198     tree_initialized = false;
00199 
00200     SG_UNREF(alphabet);
00201     alphabet=NULL;
00202 
00203     CKernel::cleanup();
00204 }
00205 
00206 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num)
00207 {
00208     if (tree_num<0)
00209         SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00210 
00211     delete_optimization();
00212 
00213     if (tree_num<0)
00214         SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00215 
00216     for (int32_t i=0; i<count; i++)
00217     {
00218         if (tree_num<0)
00219         {
00220             if ( (i % (count/10+1)) == 0)
00221                 SG_PROGRESS(i, 0, count);
00222 
00223             if (max_mismatch==0)
00224                 add_example_to_tree(IDX[i], alphas[i]) ;
00225             else
00226                 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00227 
00228             //SG_DEBUG( "number of used trie nodes: %i\n", tries.get_num_used_nodes()) ;
00229         }
00230         else
00231         {
00232             if (max_mismatch==0)
00233                 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00234             else
00235                 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00236         }
00237     }
00238 
00239     if (tree_num<0)
00240         SG_DONE();
00241 
00242     //tries.compact_nodes(NO_CHILD, 0, weights) ;
00243 
00244     set_is_initialized(true) ;
00245     return true ;
00246 }
00247 
00248 bool CWeightedDegreeStringKernel::delete_optimization()
00249 {
00250     if (get_is_initialized())
00251     {
00252         if (tries!=NULL)
00253             tries->delete_trees(max_mismatch==0);
00254         set_is_initialized(false);
00255         return true;
00256     }
00257 
00258     return false;
00259 }
00260 
00261 
00262 float64_t CWeightedDegreeStringKernel::compute_with_mismatch(
00263     char* avec, int32_t alen, char* bvec, int32_t blen)
00264 {
00265     float64_t sum = 0.0;
00266 
00267     for (int32_t i=0; i<alen; i++)
00268     {
00269         float64_t sumi = 0.0;
00270         int32_t mismatches=0;
00271 
00272         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00273         {
00274             if (avec[i+j]!=bvec[i+j])
00275             {
00276                 mismatches++ ;
00277                 if (mismatches>max_mismatch)
00278                     break ;
00279             } ;
00280             sumi += weights[j+degree*mismatches];
00281         }
00282         if (position_weights!=NULL)
00283             sum+=position_weights[i]*sumi ;
00284         else
00285             sum+=sumi ;
00286     }
00287     return sum ;
00288 }
00289 
00290 float64_t CWeightedDegreeStringKernel::compute_using_block(
00291     char* avec, int32_t alen, char* bvec, int32_t blen)
00292 {
00293     ASSERT(alen==blen);
00294 
00295     float64_t sum=0;
00296     int32_t match_len=-1;
00297 
00298     for (int32_t i=0; i<alen; i++)
00299     {
00300         if (avec[i]==bvec[i])
00301             match_len++;
00302         else
00303         {
00304             if (match_len>=0)
00305                 sum+=block_weights[match_len];
00306             match_len=-1;
00307         }
00308     }
00309 
00310     if (match_len>=0)
00311         sum+=block_weights[match_len];
00312 
00313     return sum;
00314 }
00315 
00316 float64_t CWeightedDegreeStringKernel::compute_without_mismatch(
00317     char* avec, int32_t alen, char* bvec, int32_t blen)
00318 {
00319     float64_t sum = 0.0;
00320 
00321     for (int32_t i=0; i<alen; i++)
00322     {
00323         float64_t sumi = 0.0;
00324 
00325         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00326         {
00327             if (avec[i+j]!=bvec[i+j])
00328                 break ;
00329             sumi += weights[j];
00330         }
00331         if (position_weights!=NULL)
00332             sum+=position_weights[i]*sumi ;
00333         else
00334             sum+=sumi ;
00335     }
00336     return sum ;
00337 }
00338 
00339 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix(
00340     char* avec, int32_t alen, char* bvec, int32_t blen)
00341 {
00342     float64_t sum = 0.0;
00343 
00344     for (int32_t i=0; i<alen; i++)
00345     {
00346         float64_t sumi=0.0;
00347         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00348         {
00349             if (avec[i+j]!=bvec[i+j])
00350                 break;
00351             sumi += weights[i*degree+j];
00352         }
00353         if (position_weights!=NULL)
00354             sum += position_weights[i]*sumi ;
00355         else
00356             sum += sumi ;
00357     }
00358 
00359     return sum ;
00360 }
00361 
00362 
00363 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b)
00364 {
00365     int32_t alen, blen;
00366     bool free_avec, free_bvec;
00367     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00368     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00369     float64_t result=0;
00370 
00371     if (max_mismatch==0 && length==0 && block_computation)
00372         result=compute_using_block(avec, alen, bvec, blen);
00373     else
00374     {
00375         if (max_mismatch>0)
00376             result=compute_with_mismatch(avec, alen, bvec, blen);
00377         else if (length==0)
00378             result=compute_without_mismatch(avec, alen, bvec, blen);
00379         else
00380             result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00381     }
00382     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00383     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00384 
00385     return result;
00386 }
00387 
00388 
00389 void CWeightedDegreeStringKernel::add_example_to_tree(
00390     int32_t idx, float64_t alpha)
00391 {
00392     ASSERT(alphabet);
00393     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00394 
00395     int32_t len=0;
00396     bool free_vec;
00397     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00398     ASSERT(max_mismatch==0);
00399     int32_t *vec=SG_MALLOC(int32_t, len);
00400 
00401     for (int32_t i=0; i<len; i++)
00402         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00403     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00404 
00405     if (length == 0 || max_mismatch > 0)
00406     {
00407         for (int32_t i=0; i<len; i++)
00408         {
00409             float64_t alpha_pw=alpha;
00410             /*if (position_weights!=NULL)
00411               alpha_pw *= position_weights[i] ;*/
00412             if (alpha_pw==0.0)
00413                 continue;
00414             ASSERT(tries);
00415             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00416         }
00417     }
00418     else
00419     {
00420         for (int32_t i=0; i<len; i++)
00421         {
00422             float64_t alpha_pw=alpha;
00423             /*if (position_weights!=NULL)
00424               alpha_pw = alpha*position_weights[i] ;*/
00425             if (alpha_pw==0.0)
00426                 continue ;
00427             ASSERT(tries);
00428             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00429         }
00430     }
00431     SG_FREE(vec);
00432     tree_initialized=true ;
00433 }
00434 
00435 void CWeightedDegreeStringKernel::add_example_to_single_tree(
00436     int32_t idx, float64_t alpha, int32_t tree_num)
00437 {
00438     ASSERT(alphabet);
00439     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00440 
00441     int32_t len;
00442     bool free_vec;
00443     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00444     ASSERT(max_mismatch==0);
00445     int32_t *vec = SG_MALLOC(int32_t, len);
00446 
00447     for (int32_t i=tree_num; i<tree_num+degree && i<len; i++)
00448         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00449     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00450 
00451 
00452     ASSERT(tries);
00453     if (alpha!=0.0)
00454         tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0));
00455 
00456     SG_FREE(vec);
00457     tree_initialized=true ;
00458 }
00459 
00460 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha)
00461 {
00462     ASSERT(tries);
00463     ASSERT(alphabet);
00464     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00465 
00466     int32_t len ;
00467     bool free_vec;
00468     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00469 
00470     int32_t *vec = SG_MALLOC(int32_t, len);
00471 
00472     for (int32_t i=0; i<len; i++)
00473         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00474     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00475 
00476     for (int32_t i=0; i<len; i++)
00477     {
00478         if (alpha!=0.0)
00479             tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights);
00480     }
00481 
00482     SG_FREE(vec);
00483     tree_initialized=true ;
00484 }
00485 
00486 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(
00487     int32_t idx, float64_t alpha, int32_t tree_num)
00488 {
00489     ASSERT(tries);
00490     ASSERT(alphabet);
00491     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00492 
00493     int32_t len=0;
00494     bool free_vec;
00495     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00496     int32_t *vec=SG_MALLOC(int32_t, len);
00497 
00498     for (int32_t i=tree_num; i<len && i<tree_num+degree; i++)
00499         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00500     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00501 
00502     if (alpha!=0.0)
00503     {
00504         tries->add_example_to_tree_mismatch_recursion(
00505             NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num,
00506             0, 0, max_mismatch, weights);
00507     }
00508 
00509     SG_FREE(vec);
00510     tree_initialized=true;
00511 }
00512 
00513 
00514 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx)
00515 {
00516     ASSERT(alphabet);
00517     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00518 
00519     int32_t len=0;
00520     bool free_vec;
00521     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00522     ASSERT(char_vec && len>0);
00523     int32_t *vec=SG_MALLOC(int32_t, len);
00524 
00525     for (int32_t i=0; i<len; i++)
00526         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00527     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00528 
00529     float64_t sum=0;
00530     ASSERT(tries);
00531     for (int32_t i=0; i<len; i++)
00532         sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00533 
00534     SG_FREE(vec);
00535     return normalizer->normalize_rhs(sum, idx);
00536 }
00537 
00538 void CWeightedDegreeStringKernel::compute_by_tree(
00539     int32_t idx, float64_t* LevelContrib)
00540 {
00541     ASSERT(alphabet);
00542     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00543 
00544     int32_t len ;
00545     bool free_vec;
00546     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00547 
00548     int32_t *vec = SG_MALLOC(int32_t, len);
00549 
00550     for (int32_t i=0; i<len; i++)
00551         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00552     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00553 
00554     ASSERT(tries);
00555     for (int32_t i=0; i<len; i++)
00556     {
00557         tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00558                 normalizer->normalize_rhs(1.0, idx),
00559                 mkl_stepsize, weights, (length!=0));
00560     }
00561 
00562     SG_FREE(vec);
00563 }
00564 
00565 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len)
00566 {
00567     ASSERT(tries);
00568     return tries->compute_abs_weights(len);
00569 }
00570 
00571 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00572 {
00573     ASSERT(degree>0);
00574     ASSERT(p_type==E_WD); 
00575 
00576     SG_FREE(weights);
00577     weights=SG_MALLOC(float64_t, degree);
00578     weights_degree=degree;
00579     weights_length=1;
00580 
00581     if (weights)
00582     {
00583         int32_t i;
00584         float64_t sum=0;
00585         for (i=0; i<degree; i++)
00586         {
00587             weights[i]=degree-i;
00588             sum+=weights[i];
00589         }
00590         for (i=0; i<degree; i++)
00591             weights[i]/=sum;
00592 
00593         for (i=0; i<degree; i++)
00594         {
00595             for (int32_t j=1; j<=max_mismatch; j++)
00596             {
00597                 if (j<i+1)
00598                 {
00599                     int32_t nk=CMath::nchoosek(i+1, j);
00600                     weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00601                 }
00602                 else
00603                     weights[i+j*degree]= 0;
00604             }
00605         }
00606 
00607         if (which_degree>=0)
00608         {
00609             ASSERT(which_degree<degree);
00610             for (i=0; i<degree; i++)
00611             {
00612                 if (i!=which_degree)
00613                     weights[i]=0;
00614                 else
00615                     weights[i]=1;
00616             }
00617         }
00618         return true;
00619     }
00620     else
00621         return false;
00622 }
00623 
00624 bool CWeightedDegreeStringKernel::set_weights(SGMatrix<float64_t> new_weights)
00625 {
00626     float64_t* ws=new_weights.matrix;
00627     int32_t d=new_weights.num_rows;
00628     int32_t len=new_weights.num_cols;
00629 
00630     if (d!=degree || len<0)
00631         SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree);
00632 
00633     degree=d;
00634     length=len;
00635 
00636     if (len <= 0)
00637         len=1;
00638 
00639     weights_degree=degree;
00640     weights_length=len+max_mismatch;
00641 
00642 
00643     SG_DEBUG("Creating weights of size %dx%d\n", weights_degree, weights_length);
00644     int32_t num_weights=weights_degree*weights_length;
00645     SG_FREE(weights);
00646     weights=SG_MALLOC(float64_t, num_weights);
00647 
00648     for (int32_t i=0; i<degree*len; i++)
00649         weights[i]=ws[i];
00650 
00651     return true;
00652 }
00653 
00654 bool CWeightedDegreeStringKernel::set_position_weights(
00655     float64_t* pws, int32_t len)
00656 {
00657     if (len==0)
00658     {
00659         SG_FREE(position_weights);
00660         position_weights=NULL;
00661         ASSERT(tries);
00662         tries->set_position_weights(position_weights);
00663     }
00664 
00665     if (seq_length!=len)
00666         SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00667 
00668     SG_FREE(position_weights);
00669     position_weights=SG_MALLOC(float64_t, len);
00670     position_weights_len=len;
00671     ASSERT(tries);
00672     tries->set_position_weights(position_weights);
00673 
00674     if (position_weights)
00675     {
00676         for (int32_t i=0; i<len; i++)
00677             position_weights[i]=pws[i];
00678         return true;
00679     }
00680     else
00681         return false;
00682 }
00683 
00684 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00685 {
00686     SG_FREE(block_weights);
00687     block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree));
00688 
00689     int32_t k;
00690     float64_t d=degree; // use float to evade rounding errors below
00691 
00692     for (k=0; k<degree; k++)
00693         block_weights[k]=
00694             (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
00695     for (k=degree; k<seq_length; k++)
00696         block_weights[k]=(-d+3*k+4)/3;
00697 
00698     return true;
00699 }
00700 
00701 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00702 {
00703     ASSERT(weights);
00704     SG_FREE(block_weights);
00705     block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree));
00706 
00707     int32_t i=0;
00708     block_weights[0]=weights[0];
00709     for (i=1; i<CMath::max(seq_length,degree); i++)
00710         block_weights[i]=0;
00711 
00712     for (i=1; i<CMath::max(seq_length,degree); i++)
00713     {
00714         block_weights[i]=block_weights[i-1];
00715 
00716         float64_t contrib=0;
00717         for (int32_t j=0; j<CMath::min(degree,i+1); j++)
00718             contrib+=weights[j];
00719 
00720         block_weights[i]+=contrib;
00721     }
00722     return true;
00723 }
00724 
00725 bool CWeightedDegreeStringKernel::init_block_weights_const()
00726 {
00727     SG_FREE(block_weights);
00728     block_weights=SG_MALLOC(float64_t, seq_length);
00729 
00730     for (int32_t i=1; i<seq_length+1 ; i++)
00731         block_weights[i-1]=1.0/seq_length;
00732     return true;
00733 }
00734 
00735 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00736 {
00737     SG_FREE(block_weights);
00738     block_weights=SG_MALLOC(float64_t, seq_length);
00739 
00740     for (int32_t i=1; i<seq_length+1 ; i++)
00741         block_weights[i-1]=degree*i;
00742 
00743     return true;
00744 }
00745 
00746 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00747 {
00748     SG_FREE(block_weights);
00749     block_weights=SG_MALLOC(float64_t, seq_length);
00750 
00751     for (int32_t i=1; i<degree+1 ; i++)
00752         block_weights[i-1]=((float64_t) i)*i;
00753 
00754     for (int32_t i=degree+1; i<seq_length+1 ; i++)
00755         block_weights[i-1]=i;
00756 
00757     return true;
00758 }
00759 
00760 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00761 {
00762     SG_FREE(block_weights);
00763     block_weights=SG_MALLOC(float64_t, seq_length);
00764 
00765     for (int32_t i=1; i<degree+1 ; i++)
00766         block_weights[i-1]=((float64_t) i)*i*i;
00767 
00768     for (int32_t i=degree+1; i<seq_length+1 ; i++)
00769         block_weights[i-1]=i;
00770     return true;
00771 }
00772 
00773 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00774 {
00775     SG_FREE(block_weights);
00776     block_weights=SG_MALLOC(float64_t, seq_length);
00777 
00778     for (int32_t i=1; i<degree+1 ; i++)
00779         block_weights[i-1]=exp(((float64_t) i/10.0));
00780 
00781     for (int32_t i=degree+1; i<seq_length+1 ; i++)
00782         block_weights[i-1]=i;
00783 
00784     return true;
00785 }
00786 
00787 bool CWeightedDegreeStringKernel::init_block_weights_log()
00788 {
00789     SG_FREE(block_weights);
00790     block_weights=SG_MALLOC(float64_t, seq_length);
00791 
00792     for (int32_t i=1; i<degree+1 ; i++)
00793         block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
00794 
00795     for (int32_t i=degree+1; i<seq_length+1 ; i++)
00796         block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
00797 
00798     return true;
00799 }
00800 
00801 bool CWeightedDegreeStringKernel::init_block_weights()
00802 {
00803     switch (type)
00804     {
00805         case E_WD:
00806             return init_block_weights_from_wd();
00807         case E_EXTERNAL:
00808             return init_block_weights_from_wd_external();
00809         case E_BLOCK_CONST:
00810             return init_block_weights_const();
00811         case E_BLOCK_LINEAR:
00812             return init_block_weights_linear();
00813         case E_BLOCK_SQPOLY:
00814             return init_block_weights_sqpoly();
00815         case E_BLOCK_CUBICPOLY:
00816             return init_block_weights_cubicpoly();
00817         case E_BLOCK_EXP:
00818             return init_block_weights_exp();
00819         case E_BLOCK_LOG:
00820             return init_block_weights_log();
00821     };
00822     return false;
00823 }
00824 
00825 
00826 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00827 {
00828     S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00829     int32_t j=params->j;
00830     CWeightedDegreeStringKernel* wd=params->kernel;
00831     CTrie<DNATrie>* tries=params->tries;
00832     float64_t* weights=params->weights;
00833     int32_t length=params->length;
00834     int32_t* vec=params->vec;
00835     float64_t* result=params->result;
00836     float64_t factor=params->factor;
00837     int32_t* vec_idx=params->vec_idx;
00838 
00839     CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
00840     CAlphabet* alpha=wd->alphabet;
00841 
00842     for (int32_t i=params->start; i<params->end; i++)
00843     {
00844         int32_t len=0;
00845         bool free_vec;
00846         char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
00847         for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00848             vec[k]=alpha->remap_to_bin(char_vec[k]);
00849         rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
00850 
00851         ASSERT(tries);
00852 
00853         result[i]+=factor*
00854             wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
00855     }
00856 
00857     SG_UNREF(rhs_feat);
00858 
00859     return NULL;
00860 }
00861 
00862 void CWeightedDegreeStringKernel::compute_batch(
00863     int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
00864     int32_t* IDX, float64_t* alphas, float64_t factor)
00865 {
00866     ASSERT(tries);
00867     ASSERT(alphabet);
00868     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00869     ASSERT(rhs);
00870     ASSERT(num_vec<=rhs->get_num_vectors());
00871     ASSERT(num_vec>0);
00872     ASSERT(vec_idx);
00873     ASSERT(result);
00874     create_empty_tries();
00875 
00876     int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
00877     ASSERT(num_feat>0);
00878     int32_t num_threads=parallel->get_num_threads();
00879     ASSERT(num_threads>0);
00880     int32_t* vec=SG_MALLOC(int32_t, num_threads*num_feat);
00881 
00882     if (num_threads < 2)
00883     {
00884 #ifdef CYGWIN
00885         for (int32_t j=0; j<num_feat; j++)
00886 #else
00887         CSignal::clear_cancel();
00888         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00889 #endif
00890         {
00891             init_optimization(num_suppvec, IDX, alphas, j);
00892             S_THREAD_PARAM params;
00893             params.vec=vec;
00894             params.result=result;
00895             params.weights=weights;
00896             params.kernel=this;
00897             params.tries=tries;
00898             params.factor=factor;
00899             params.j=j;
00900             params.start=0;
00901             params.end=num_vec;
00902             params.length=length;
00903             params.vec_idx=vec_idx;
00904             compute_batch_helper((void*) &params);
00905 
00906             SG_PROGRESS(j,0,num_feat);
00907         }
00908     }
00909 #ifdef HAVE_PTHREAD
00910     else
00911     {
00912         CSignal::clear_cancel();
00913         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00914         {
00915             init_optimization(num_suppvec, IDX, alphas, j);
00916             pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00917             S_THREAD_PARAM* params = SG_MALLOC(S_THREAD_PARAM, num_threads);
00918             int32_t step= num_vec/num_threads;
00919             int32_t t;
00920 
00921             for (t=0; t<num_threads-1; t++)
00922             {
00923                 params[t].vec=&vec[num_feat*t];
00924                 params[t].result=result;
00925                 params[t].weights=weights;
00926                 params[t].kernel=this;
00927                 params[t].tries=tries;
00928                 params[t].factor=factor;
00929                 params[t].j=j;
00930                 params[t].start = t*step;
00931                 params[t].end = (t+1)*step;
00932                 params[t].length=length;
00933                 params[t].vec_idx=vec_idx;
00934                 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)&params[t]);
00935             }
00936             params[t].vec=&vec[num_feat*t];
00937             params[t].result=result;
00938             params[t].weights=weights;
00939             params[t].kernel=this;
00940             params[t].tries=tries;
00941             params[t].factor=factor;
00942             params[t].j=j;
00943             params[t].start=t*step;
00944             params[t].end=num_vec;
00945             params[t].length=length;
00946             params[t].vec_idx=vec_idx;
00947             compute_batch_helper((void*) &params[t]);
00948 
00949             for (t=0; t<num_threads-1; t++)
00950                 pthread_join(threads[t], NULL);
00951             SG_PROGRESS(j,0,num_feat);
00952 
00953             SG_FREE(params);
00954             SG_FREE(threads);
00955         }
00956     }
00957 #endif
00958 
00959     SG_FREE(vec);
00960 
00961     //really also free memory as this can be huge on testing especially when
00962     //using the combined kernel
00963     create_empty_tries();
00964 }
00965 
00966 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max)
00967 {
00968     if (type==E_EXTERNAL && max!=0)
00969         return false;
00970 
00971     max_mismatch=max;
00972 
00973     if (lhs!=NULL && rhs!=NULL)
00974         return init(lhs, rhs);
00975     else
00976         return true;
00977 }
00978 
00979 void CWeightedDegreeStringKernel::init()
00980 {
00981     weights=NULL;
00982     weights_degree=0;
00983     weights_length=0;
00984 
00985     position_weights=NULL;
00986     position_weights_len=0;
00987 
00988     weights_buffer=NULL;
00989     mkl_stepsize=1;
00990     degree=1;
00991     length=0;
00992 
00993     max_mismatch=0;
00994     seq_length=0;
00995 
00996     block_weights=NULL;
00997     block_computation=true;
00998     type=E_WD;
00999     which_degree=-1;
01000     tries=NULL;
01001 
01002     tree_initialized=false;
01003     alphabet=NULL;
01004 
01005     lhs=NULL;
01006     rhs=NULL;
01007 
01008     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
01009 
01010     set_normalizer(new CFirstElementKernelNormalizer());
01011 
01012     m_parameters->add_matrix(&weights, &weights_degree, &weights_length,
01013             "weights", "WD Kernel weights.");
01014     m_parameters->add_vector(&position_weights, &position_weights_len,
01015             "position_weights",
01016             "Weights per position.");
01017     SG_ADD(&mkl_stepsize, "mkl_stepsize", "MKL step size.", MS_AVAILABLE);
01018     SG_ADD(&degree, "degree", "Order of WD kernel.", MS_AVAILABLE);
01019     SG_ADD(&max_mismatch, "max_mismatch",
01020             "Number of allowed mismatches.", MS_AVAILABLE);
01021     SG_ADD(&block_computation, "block_computation",
01022             "If block computation shall be used.", MS_NOT_AVAILABLE);
01023     SG_ADD((machine_int_t*) &type, "type",
01024             "WeightedDegree kernel type.", MS_AVAILABLE);
01025     SG_ADD(&which_degree, "which_degree",
01026             "The selected degree. All degrees are used by default (for value -1).",
01027             MS_AVAILABLE);
01028     SG_ADD((CSGObject**) &alphabet, "alphabet",
01029             "Alphabet of Features.", MS_NOT_AVAILABLE);
01030 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation