WeightedDegreeStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/lib/common.h>
00013 #include <shogun/io/SGIO.h>
00014 #include <shogun/lib/Signal.h>
00015 #include <shogun/lib/Trie.h>
00016 #include <shogun/base/Parameter.h>
00017 #include <shogun/base/Parallel.h>
00018 
00019 #include <shogun/kernel/WeightedDegreeStringKernel.h>
00020 #include <shogun/kernel/FirstElementKernelNormalizer.h>
00021 #include <shogun/features/Features.h>
00022 #include <shogun/features/StringFeatures.h>
00023 
00024 #ifndef WIN32
00025 #include <pthread.h>
00026 #endif
00027 
00028 using namespace shogun;
00029 
00030 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00031 struct S_THREAD_PARAM
00032 {
00033 
00034     int32_t* vec;
00035     float64_t* result;
00036     float64_t* weights;
00037     CWeightedDegreeStringKernel* kernel;
00038     CTrie<DNATrie>* tries;
00039     float64_t factor;
00040     int32_t j;
00041     int32_t start;
00042     int32_t end;
00043     int32_t length;
00044     int32_t* vec_idx;
00045 };
00046 #endif // DOXYGEN_SHOULD_SKIP_THIS
00047 
00048 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel ()
00049 : CStringKernel<char>()
00050 {
00051     init();
00052 }
00053 
00054 
00055 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00056     int32_t d, EWDKernType t)
00057 : CStringKernel<char>()
00058 {
00059     init();
00060 
00061     degree=d;
00062     type=t;
00063 
00064     if (type!=E_EXTERNAL)
00065         set_wd_weights_by_type(type);
00066 }
00067 
00068 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00069     float64_t *w, int32_t d)
00070 : CStringKernel<char>(10)
00071 {
00072     init();
00073 
00074     type=E_EXTERNAL;
00075     degree=d;
00076 
00077     weights=SG_MALLOC(float64_t, degree*(1+max_mismatch));
00078     weights_degree=degree;
00079     weights_length=(1+max_mismatch);
00080 
00081     for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00082         weights[i]=w[i];
00083 }
00084 
00085 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00086     CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t d)
00087 : CStringKernel<char>(10)
00088 {
00089     init();
00090     degree=d;
00091     type=E_WD;
00092     set_wd_weights_by_type(type);
00093     set_normalizer(new CFirstElementKernelNormalizer());
00094     init(l, r);
00095 }
00096 
00097 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00098 {
00099     cleanup();
00100 
00101     SG_FREE(weights);
00102     weights=NULL;
00103     weights_degree=0;
00104     weights_length=0;
00105 
00106     SG_FREE(block_weights);
00107     block_weights=NULL;
00108 
00109     SG_FREE(position_weights);
00110     position_weights=NULL;
00111 
00112     SG_FREE(weights_buffer);
00113     weights_buffer=NULL;
00114 }
00115 
00116 
00117 void CWeightedDegreeStringKernel::remove_lhs()
00118 {
00119     SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00120     delete_optimization();
00121 
00122     if (tries!=NULL)
00123         tries->destroy();
00124 
00125     CKernel::remove_lhs();
00126 }
00127 
00128 void CWeightedDegreeStringKernel::create_empty_tries()
00129 {
00130     ASSERT(lhs);
00131 
00132     seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length();
00133 
00134     if (tries!=NULL)
00135     {
00136         tries->destroy() ;
00137         tries->create(seq_length, max_mismatch==0) ;
00138     }
00139 }
00140 
00141 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00142 {
00143     int32_t lhs_changed=(lhs!=l);
00144     int32_t rhs_changed=(rhs!=r);
00145 
00146     CStringKernel<char>::init(l,r);
00147 
00148     SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00149     SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00150 
00151     CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00152     CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00153 
00154     int32_t len=sf_l->get_max_vector_length();
00155     if (lhs_changed && !sf_l->have_same_length(len))
00156         SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00157 
00158     if (rhs_changed && !sf_r->have_same_length(len))
00159         SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00160 
00161     SG_UNREF(alphabet);
00162     alphabet=sf_l->get_alphabet();
00163     CAlphabet* ralphabet=sf_r->get_alphabet();
00164 
00165     if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00166         properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00167 
00168     ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00169     SG_UNREF(ralphabet);
00170 
00171     if (tries!=NULL) {
00172         tries->delete_trees(max_mismatch==0);
00173         SG_UNREF(tries);
00174     }
00175     tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00176     create_empty_tries();
00177 
00178     init_block_weights();
00179 
00180     return init_normalizer();
00181 }
00182 
00183 void CWeightedDegreeStringKernel::cleanup()
00184 {
00185     SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00186     delete_optimization();
00187 
00188     SG_FREE(block_weights);
00189     block_weights=NULL;
00190 
00191     if (tries!=NULL)
00192     {
00193         tries->destroy();
00194         SG_UNREF(tries);
00195         tries=NULL;
00196     }
00197 
00198     seq_length=0;
00199     tree_initialized = false;
00200 
00201     SG_UNREF(alphabet);
00202     alphabet=NULL;
00203 
00204     CKernel::cleanup();
00205 }
00206 
00207 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num)
00208 {
00209     if (tree_num<0)
00210         SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00211 
00212     delete_optimization();
00213 
00214     if (tree_num<0)
00215         SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00216 
00217     for (int32_t i=0; i<count; i++)
00218     {
00219         if (tree_num<0)
00220         {
00221             if ( (i % (count/10+1)) == 0)
00222                 SG_PROGRESS(i, 0, count);
00223 
00224             if (max_mismatch==0)
00225                 add_example_to_tree(IDX[i], alphas[i]) ;
00226             else
00227                 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00228 
00229             //SG_DEBUG( "number of used trie nodes: %i\n", tries.get_num_used_nodes()) ;
00230         }
00231         else
00232         {
00233             if (max_mismatch==0)
00234                 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00235             else
00236                 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00237         }
00238     }
00239 
00240     if (tree_num<0)
00241         SG_DONE();
00242 
00243     //tries.compact_nodes(NO_CHILD, 0, weights) ;
00244 
00245     set_is_initialized(true) ;
00246     return true ;
00247 }
00248 
00249 bool CWeightedDegreeStringKernel::delete_optimization()
00250 {
00251     if (get_is_initialized())
00252     {
00253         if (tries!=NULL)
00254             tries->delete_trees(max_mismatch==0);
00255         set_is_initialized(false);
00256         return true;
00257     }
00258 
00259     return false;
00260 }
00261 
00262 
00263 float64_t CWeightedDegreeStringKernel::compute_with_mismatch(
00264     char* avec, int32_t alen, char* bvec, int32_t blen)
00265 {
00266     float64_t sum = 0.0;
00267 
00268     for (int32_t i=0; i<alen; i++)
00269     {
00270         float64_t sumi = 0.0;
00271         int32_t mismatches=0;
00272 
00273         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00274         {
00275             if (avec[i+j]!=bvec[i+j])
00276             {
00277                 mismatches++ ;
00278                 if (mismatches>max_mismatch)
00279                     break ;
00280             } ;
00281             sumi += weights[j+degree*mismatches];
00282         }
00283         if (position_weights!=NULL)
00284             sum+=position_weights[i]*sumi ;
00285         else
00286             sum+=sumi ;
00287     }
00288     return sum ;
00289 }
00290 
00291 float64_t CWeightedDegreeStringKernel::compute_using_block(
00292     char* avec, int32_t alen, char* bvec, int32_t blen)
00293 {
00294     ASSERT(alen==blen);
00295 
00296     float64_t sum=0;
00297     int32_t match_len=-1;
00298 
00299     for (int32_t i=0; i<alen; i++)
00300     {
00301         if (avec[i]==bvec[i])
00302             match_len++;
00303         else
00304         {
00305             if (match_len>=0)
00306                 sum+=block_weights[match_len];
00307             match_len=-1;
00308         }
00309     }
00310 
00311     if (match_len>=0)
00312         sum+=block_weights[match_len];
00313 
00314     return sum;
00315 }
00316 
00317 float64_t CWeightedDegreeStringKernel::compute_without_mismatch(
00318     char* avec, int32_t alen, char* bvec, int32_t blen)
00319 {
00320     float64_t sum = 0.0;
00321 
00322     for (int32_t i=0; i<alen; i++)
00323     {
00324         float64_t sumi = 0.0;
00325 
00326         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00327         {
00328             if (avec[i+j]!=bvec[i+j])
00329                 break ;
00330             sumi += weights[j];
00331         }
00332         if (position_weights!=NULL)
00333             sum+=position_weights[i]*sumi ;
00334         else
00335             sum+=sumi ;
00336     }
00337     return sum ;
00338 }
00339 
00340 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix(
00341     char* avec, int32_t alen, char* bvec, int32_t blen)
00342 {
00343     float64_t sum = 0.0;
00344 
00345     for (int32_t i=0; i<alen; i++)
00346     {
00347         float64_t sumi=0.0;
00348         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00349         {
00350             if (avec[i+j]!=bvec[i+j])
00351                 break;
00352             sumi += weights[i*degree+j];
00353         }
00354         if (position_weights!=NULL)
00355             sum += position_weights[i]*sumi ;
00356         else
00357             sum += sumi ;
00358     }
00359 
00360     return sum ;
00361 }
00362 
00363 
00364 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b)
00365 {
00366     int32_t alen, blen;
00367     bool free_avec, free_bvec;
00368     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00369     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00370     float64_t result=0;
00371 
00372     if (max_mismatch==0 && length==0 && block_computation)
00373         result=compute_using_block(avec, alen, bvec, blen);
00374     else
00375     {
00376         if (max_mismatch>0)
00377             result=compute_with_mismatch(avec, alen, bvec, blen);
00378         else if (length==0)
00379             result=compute_without_mismatch(avec, alen, bvec, blen);
00380         else
00381             result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00382     }
00383     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00384     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00385 
00386     return result;
00387 }
00388 
00389 
00390 void CWeightedDegreeStringKernel::add_example_to_tree(
00391     int32_t idx, float64_t alpha)
00392 {
00393     ASSERT(alphabet);
00394     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00395 
00396     int32_t len=0;
00397     bool free_vec;
00398     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00399     ASSERT(max_mismatch==0);
00400     int32_t *vec=SG_MALLOC(int32_t, len);
00401 
00402     for (int32_t i=0; i<len; i++)
00403         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00404     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00405 
00406     if (length == 0 || max_mismatch > 0)
00407     {
00408         for (int32_t i=0; i<len; i++)
00409         {
00410             float64_t alpha_pw=alpha;
00411             /*if (position_weights!=NULL)
00412               alpha_pw *= position_weights[i] ;*/
00413             if (alpha_pw==0.0)
00414                 continue;
00415             ASSERT(tries);
00416             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00417         }
00418     }
00419     else
00420     {
00421         for (int32_t i=0; i<len; i++)
00422         {
00423             float64_t alpha_pw=alpha;
00424             /*if (position_weights!=NULL)
00425               alpha_pw = alpha*position_weights[i] ;*/
00426             if (alpha_pw==0.0)
00427                 continue ;
00428             ASSERT(tries);
00429             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00430         }
00431     }
00432     SG_FREE(vec);
00433     tree_initialized=true ;
00434 }
00435 
00436 void CWeightedDegreeStringKernel::add_example_to_single_tree(
00437     int32_t idx, float64_t alpha, int32_t tree_num)
00438 {
00439     ASSERT(alphabet);
00440     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00441 
00442     int32_t len;
00443     bool free_vec;
00444     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00445     ASSERT(max_mismatch==0);
00446     int32_t *vec = SG_MALLOC(int32_t, len);
00447 
00448     for (int32_t i=tree_num; i<tree_num+degree && i<len; i++)
00449         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00450     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00451 
00452 
00453     ASSERT(tries);
00454     if (alpha!=0.0)
00455         tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0));
00456 
00457     SG_FREE(vec);
00458     tree_initialized=true ;
00459 }
00460 
00461 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha)
00462 {
00463     ASSERT(tries);
00464     ASSERT(alphabet);
00465     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00466 
00467     int32_t len ;
00468     bool free_vec;
00469     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00470 
00471     int32_t *vec = SG_MALLOC(int32_t, len);
00472 
00473     for (int32_t i=0; i<len; i++)
00474         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00475     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00476 
00477     for (int32_t i=0; i<len; i++)
00478     {
00479         if (alpha!=0.0)
00480             tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights);
00481     }
00482 
00483     SG_FREE(vec);
00484     tree_initialized=true ;
00485 }
00486 
00487 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(
00488     int32_t idx, float64_t alpha, int32_t tree_num)
00489 {
00490     ASSERT(tries);
00491     ASSERT(alphabet);
00492     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00493 
00494     int32_t len=0;
00495     bool free_vec;
00496     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00497     int32_t *vec=SG_MALLOC(int32_t, len);
00498 
00499     for (int32_t i=tree_num; i<len && i<tree_num+degree; i++)
00500         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00501     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00502 
00503     if (alpha!=0.0)
00504     {
00505         tries->add_example_to_tree_mismatch_recursion(
00506             NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num,
00507             0, 0, max_mismatch, weights);
00508     }
00509 
00510     SG_FREE(vec);
00511     tree_initialized=true;
00512 }
00513 
00514 
00515 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx)
00516 {
00517     ASSERT(alphabet);
00518     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00519 
00520     int32_t len=0;
00521     bool free_vec;
00522     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00523     ASSERT(char_vec && len>0);
00524     int32_t *vec=SG_MALLOC(int32_t, len);
00525 
00526     for (int32_t i=0; i<len; i++)
00527         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00528     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00529 
00530     float64_t sum=0;
00531     ASSERT(tries);
00532     for (int32_t i=0; i<len; i++)
00533         sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00534 
00535     SG_FREE(vec);
00536     return normalizer->normalize_rhs(sum, idx);
00537 }
00538 
00539 void CWeightedDegreeStringKernel::compute_by_tree(
00540     int32_t idx, float64_t* LevelContrib)
00541 {
00542     ASSERT(alphabet);
00543     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00544 
00545     int32_t len ;
00546     bool free_vec;
00547     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00548 
00549     int32_t *vec = SG_MALLOC(int32_t, len);
00550 
00551     for (int32_t i=0; i<len; i++)
00552         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00553     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00554 
00555     ASSERT(tries);
00556     for (int32_t i=0; i<len; i++)
00557     {
00558         tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00559                 normalizer->normalize_rhs(1.0, idx),
00560                 mkl_stepsize, weights, (length!=0));
00561     }
00562 
00563     SG_FREE(vec);
00564 }
00565 
00566 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len)
00567 {
00568     ASSERT(tries);
00569     return tries->compute_abs_weights(len);
00570 }
00571 
00572 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00573 {
00574     ASSERT(degree>0);
00575     ASSERT(p_type==E_WD); 
00576 
00577     SG_FREE(weights);
00578     weights=SG_MALLOC(float64_t, degree);
00579     weights_degree=degree;
00580     weights_length=1;
00581 
00582     if (weights)
00583     {
00584         int32_t i;
00585         float64_t sum=0;
00586         for (i=0; i<degree; i++)
00587         {
00588             weights[i]=degree-i;
00589             sum+=weights[i];
00590         }
00591         for (i=0; i<degree; i++)
00592             weights[i]/=sum;
00593 
00594         for (i=0; i<degree; i++)
00595         {
00596             for (int32_t j=1; j<=max_mismatch; j++)
00597             {
00598                 if (j<i+1)
00599                 {
00600                     int32_t nk=CMath::nchoosek(i+1, j);
00601                     weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00602                 }
00603                 else
00604                     weights[i+j*degree]= 0;
00605             }
00606         }
00607 
00608         if (which_degree>=0)
00609         {
00610             ASSERT(which_degree<degree);
00611             for (i=0; i<degree; i++)
00612             {
00613                 if (i!=which_degree)
00614                     weights[i]=0;
00615                 else
00616                     weights[i]=1;
00617             }
00618         }
00619         return true;
00620     }
00621     else
00622         return false;
00623 }
00624 
00625 bool CWeightedDegreeStringKernel::set_weights(SGMatrix<float64_t> new_weights)
00626 {
00627     float64_t* ws=new_weights.matrix;
00628     int32_t d=new_weights.num_rows;
00629     int32_t len=new_weights.num_cols;
00630 
00631     if (d!=degree || len<0)
00632         SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree);
00633 
00634     degree=d;
00635     length=len;
00636 
00637     if (len <= 0)
00638         len=1;
00639 
00640     weights_degree=degree;
00641     weights_length=len+max_mismatch;
00642 
00643 
00644     SG_DEBUG("Creating weights of size %dx%d\n", weights_degree, weights_length);
00645     int32_t num_weights=weights_degree*weights_length;
00646     SG_FREE(weights);
00647     weights=SG_MALLOC(float64_t, num_weights);
00648 
00649     for (int32_t i=0; i<degree*len; i++)
00650         weights[i]=ws[i];
00651 
00652     return true;
00653 }
00654 
00655 bool CWeightedDegreeStringKernel::set_position_weights(
00656     float64_t* pws, int32_t len)
00657 {
00658     if (len==0)
00659     {
00660         SG_FREE(position_weights);
00661         position_weights=NULL;
00662         ASSERT(tries);
00663         tries->set_position_weights(position_weights);
00664     }
00665 
00666     if (seq_length!=len)
00667         SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00668 
00669     SG_FREE(position_weights);
00670     position_weights=SG_MALLOC(float64_t, len);
00671     position_weights_len=len;
00672     ASSERT(tries);
00673     tries->set_position_weights(position_weights);
00674 
00675     if (position_weights)
00676     {
00677         for (int32_t i=0; i<len; i++)
00678             position_weights[i]=pws[i];
00679         return true;
00680     }
00681     else
00682         return false;
00683 }
00684 
00685 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00686 {
00687     SG_FREE(block_weights);
00688     block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree));
00689 
00690     int32_t k;
00691     float64_t d=degree; // use float to evade rounding errors below
00692 
00693     for (k=0; k<degree; k++)
00694         block_weights[k]=
00695             (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
00696     for (k=degree; k<seq_length; k++)
00697         block_weights[k]=(-d+3*k+4)/3;
00698 
00699     return true;
00700 }
00701 
00702 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00703 {
00704     ASSERT(weights);
00705     SG_FREE(block_weights);
00706     block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree));
00707 
00708     int32_t i=0;
00709     block_weights[0]=weights[0];
00710     for (i=1; i<CMath::max(seq_length,degree); i++)
00711         block_weights[i]=0;
00712 
00713     for (i=1; i<CMath::max(seq_length,degree); i++)
00714     {
00715         block_weights[i]=block_weights[i-1];
00716 
00717         float64_t contrib=0;
00718         for (int32_t j=0; j<CMath::min(degree,i+1); j++)
00719             contrib+=weights[j];
00720 
00721         block_weights[i]+=contrib;
00722     }
00723     return true;
00724 }
00725 
00726 bool CWeightedDegreeStringKernel::init_block_weights_const()
00727 {
00728     SG_FREE(block_weights);
00729     block_weights=SG_MALLOC(float64_t, seq_length);
00730 
00731     for (int32_t i=1; i<seq_length+1 ; i++)
00732         block_weights[i-1]=1.0/seq_length;
00733     return true;
00734 }
00735 
00736 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00737 {
00738     SG_FREE(block_weights);
00739     block_weights=SG_MALLOC(float64_t, seq_length);
00740 
00741     for (int32_t i=1; i<seq_length+1 ; i++)
00742         block_weights[i-1]=degree*i;
00743 
00744     return true;
00745 }
00746 
00747 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00748 {
00749     SG_FREE(block_weights);
00750     block_weights=SG_MALLOC(float64_t, seq_length);
00751 
00752     for (int32_t i=1; i<degree+1 ; i++)
00753         block_weights[i-1]=((float64_t) i)*i;
00754 
00755     for (int32_t i=degree+1; i<seq_length+1 ; i++)
00756         block_weights[i-1]=i;
00757 
00758     return true;
00759 }
00760 
00761 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00762 {
00763     SG_FREE(block_weights);
00764     block_weights=SG_MALLOC(float64_t, seq_length);
00765 
00766     for (int32_t i=1; i<degree+1 ; i++)
00767         block_weights[i-1]=((float64_t) i)*i*i;
00768 
00769     for (int32_t i=degree+1; i<seq_length+1 ; i++)
00770         block_weights[i-1]=i;
00771     return true;
00772 }
00773 
00774 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00775 {
00776     SG_FREE(block_weights);
00777     block_weights=SG_MALLOC(float64_t, seq_length);
00778 
00779     for (int32_t i=1; i<degree+1 ; i++)
00780         block_weights[i-1]=exp(((float64_t) i/10.0));
00781 
00782     for (int32_t i=degree+1; i<seq_length+1 ; i++)
00783         block_weights[i-1]=i;
00784 
00785     return true;
00786 }
00787 
00788 bool CWeightedDegreeStringKernel::init_block_weights_log()
00789 {
00790     SG_FREE(block_weights);
00791     block_weights=SG_MALLOC(float64_t, seq_length);
00792 
00793     for (int32_t i=1; i<degree+1 ; i++)
00794         block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
00795 
00796     for (int32_t i=degree+1; i<seq_length+1 ; i++)
00797         block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
00798 
00799     return true;
00800 }
00801 
00802 bool CWeightedDegreeStringKernel::init_block_weights()
00803 {
00804     switch (type)
00805     {
00806         case E_WD:
00807             return init_block_weights_from_wd();
00808         case E_EXTERNAL:
00809             return init_block_weights_from_wd_external();
00810         case E_BLOCK_CONST:
00811             return init_block_weights_const();
00812         case E_BLOCK_LINEAR:
00813             return init_block_weights_linear();
00814         case E_BLOCK_SQPOLY:
00815             return init_block_weights_sqpoly();
00816         case E_BLOCK_CUBICPOLY:
00817             return init_block_weights_cubicpoly();
00818         case E_BLOCK_EXP:
00819             return init_block_weights_exp();
00820         case E_BLOCK_LOG:
00821             return init_block_weights_log();
00822     };
00823     return false;
00824 }
00825 
00826 
00827 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00828 {
00829     S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00830     int32_t j=params->j;
00831     CWeightedDegreeStringKernel* wd=params->kernel;
00832     CTrie<DNATrie>* tries=params->tries;
00833     float64_t* weights=params->weights;
00834     int32_t length=params->length;
00835     int32_t* vec=params->vec;
00836     float64_t* result=params->result;
00837     float64_t factor=params->factor;
00838     int32_t* vec_idx=params->vec_idx;
00839 
00840     CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
00841     CAlphabet* alpha=wd->alphabet;
00842 
00843     for (int32_t i=params->start; i<params->end; i++)
00844     {
00845         int32_t len=0;
00846         bool free_vec;
00847         char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
00848         for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00849             vec[k]=alpha->remap_to_bin(char_vec[k]);
00850         rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
00851 
00852         ASSERT(tries);
00853 
00854         result[i]+=factor*
00855             wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
00856     }
00857 
00858     SG_UNREF(rhs_feat);
00859 
00860     return NULL;
00861 }
00862 
00863 void CWeightedDegreeStringKernel::compute_batch(
00864     int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
00865     int32_t* IDX, float64_t* alphas, float64_t factor)
00866 {
00867     ASSERT(tries);
00868     ASSERT(alphabet);
00869     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00870     ASSERT(rhs);
00871     ASSERT(num_vec<=rhs->get_num_vectors());
00872     ASSERT(num_vec>0);
00873     ASSERT(vec_idx);
00874     ASSERT(result);
00875     create_empty_tries();
00876 
00877     int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
00878     ASSERT(num_feat>0);
00879     int32_t num_threads=parallel->get_num_threads();
00880     ASSERT(num_threads>0);
00881     int32_t* vec=SG_MALLOC(int32_t, num_threads*num_feat);
00882 
00883     if (num_threads < 2)
00884     {
00885 #ifdef CYGWIN
00886         for (int32_t j=0; j<num_feat; j++)
00887 #else
00888         CSignal::clear_cancel();
00889         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00890 #endif
00891         {
00892             init_optimization(num_suppvec, IDX, alphas, j);
00893             S_THREAD_PARAM params;
00894             params.vec=vec;
00895             params.result=result;
00896             params.weights=weights;
00897             params.kernel=this;
00898             params.tries=tries;
00899             params.factor=factor;
00900             params.j=j;
00901             params.start=0;
00902             params.end=num_vec;
00903             params.length=length;
00904             params.vec_idx=vec_idx;
00905             compute_batch_helper((void*) &params);
00906 
00907             SG_PROGRESS(j,0,num_feat);
00908         }
00909     }
00910 #ifndef WIN32
00911     else
00912     {
00913         CSignal::clear_cancel();
00914         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00915         {
00916             init_optimization(num_suppvec, IDX, alphas, j);
00917             pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00918             S_THREAD_PARAM* params = SG_MALLOC(S_THREAD_PARAM, num_threads);
00919             int32_t step= num_vec/num_threads;
00920             int32_t t;
00921 
00922             for (t=0; t<num_threads-1; t++)
00923             {
00924                 params[t].vec=&vec[num_feat*t];
00925                 params[t].result=result;
00926                 params[t].weights=weights;
00927                 params[t].kernel=this;
00928                 params[t].tries=tries;
00929                 params[t].factor=factor;
00930                 params[t].j=j;
00931                 params[t].start = t*step;
00932                 params[t].end = (t+1)*step;
00933                 params[t].length=length;
00934                 params[t].vec_idx=vec_idx;
00935                 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)&params[t]);
00936             }
00937             params[t].vec=&vec[num_feat*t];
00938             params[t].result=result;
00939             params[t].weights=weights;
00940             params[t].kernel=this;
00941             params[t].tries=tries;
00942             params[t].factor=factor;
00943             params[t].j=j;
00944             params[t].start=t*step;
00945             params[t].end=num_vec;
00946             params[t].length=length;
00947             params[t].vec_idx=vec_idx;
00948             compute_batch_helper((void*) &params[t]);
00949 
00950             for (t=0; t<num_threads-1; t++)
00951                 pthread_join(threads[t], NULL);
00952             SG_PROGRESS(j,0,num_feat);
00953 
00954             SG_FREE(params);
00955             SG_FREE(threads);
00956         }
00957     }
00958 #endif
00959 
00960     SG_FREE(vec);
00961 
00962     //really also free memory as this can be huge on testing especially when
00963     //using the combined kernel
00964     create_empty_tries();
00965 }
00966 
00967 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max)
00968 {
00969     if (type==E_EXTERNAL && max!=0)
00970         return false;
00971 
00972     max_mismatch=max;
00973 
00974     if (lhs!=NULL && rhs!=NULL)
00975         return init(lhs, rhs);
00976     else
00977         return true;
00978 }
00979 
00980 void CWeightedDegreeStringKernel::init()
00981 {
00982     weights=NULL;
00983     weights_degree=0;
00984     weights_length=0;
00985 
00986     position_weights=NULL;
00987     position_weights_len=0;
00988 
00989     weights_buffer=NULL;
00990     mkl_stepsize=1;
00991     degree=1;
00992     length=0;
00993 
00994     max_mismatch=0;
00995     seq_length=0;
00996 
00997     block_weights=NULL;
00998     block_computation=true;
00999     type=E_WD;
01000     which_degree=-1;
01001     tries=NULL;
01002 
01003     tree_initialized=false;
01004     alphabet=NULL;
01005 
01006     lhs=NULL;
01007     rhs=NULL;
01008 
01009     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
01010 
01011     set_normalizer(new CFirstElementKernelNormalizer());
01012 
01013     m_parameters->add_matrix(&weights, &weights_degree, &weights_length,
01014             "weights", "WD Kernel weights.");
01015     m_parameters->add_vector(&position_weights, &position_weights_len,
01016             "position_weights",
01017             "Weights per position.");
01018     m_parameters->add(&mkl_stepsize, "mkl_stepsize", "MKL step size.");
01019     m_parameters->add(&degree, "degree", "Order of WD kernel.");
01020     m_parameters->add(&max_mismatch, "max_mismatch",
01021             "Number of allowed mismatches.");
01022     m_parameters->add(&block_computation, "block_computation",
01023             "If block computation shall be used.");
01024     m_parameters->add((machine_int_t*) &type, "type",
01025             "WeightedDegree kernel type.");
01026     m_parameters->add(&which_degree, "which_degree",
01027             "Unqueal -1 if just a single degree is selected.");
01028     m_parameters->add((CSGObject**) &alphabet, "alphabet",
01029             "Alphabet of Features.");
01030 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation