SalzbergWordStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Gunnar Raetsch
00008  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/lib/common.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/kernel/SalzbergWordStringKernel.h>
00014 #include <shogun/features/Features.h>
00015 #include <shogun/features/StringFeatures.h>
00016 #include <shogun/features/Labels.h>
00017 #include <shogun/classifier/PluginEstimate.h>
00018 
00019 using namespace shogun;
00020 
00021 CSalzbergWordStringKernel::CSalzbergWordStringKernel()
00022 : CStringKernel<uint16_t>(0)
00023 {
00024     init();
00025 }
00026 
00027 CSalzbergWordStringKernel::CSalzbergWordStringKernel(int32_t size, CPluginEstimate* pie, CLabels* labels)
00028 : CStringKernel<uint16_t>(size)
00029 {
00030     init();
00031     estimate=pie;
00032 
00033     if (labels)
00034         set_prior_probs_from_labels(labels);
00035 }
00036 
00037 CSalzbergWordStringKernel::CSalzbergWordStringKernel(
00038     CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r,
00039     CPluginEstimate* pie, CLabels* labels)
00040 : CStringKernel<uint16_t>(10),estimate(pie)
00041 {
00042     init();
00043     estimate=pie;
00044 
00045     if (labels)
00046         set_prior_probs_from_labels(labels);
00047 
00048     init(l, r);
00049 }
00050 
00051 CSalzbergWordStringKernel::~CSalzbergWordStringKernel()
00052 {
00053     cleanup();
00054 }
00055 
00056 bool CSalzbergWordStringKernel::init(CFeatures* p_l, CFeatures* p_r)
00057 {
00058     CStringKernel<uint16_t>::init(p_l,p_r);
00059     CStringFeatures<uint16_t>* l=(CStringFeatures<uint16_t>*) p_l;
00060     ASSERT(l);
00061     CStringFeatures<uint16_t>* r=(CStringFeatures<uint16_t>*) p_r;
00062     ASSERT(r);
00063 
00064     int32_t i;
00065     initialized=false;
00066 
00067     if (sqrtdiag_lhs!=sqrtdiag_rhs)
00068         SG_FREE(sqrtdiag_rhs);
00069     sqrtdiag_rhs=NULL;
00070     SG_FREE(sqrtdiag_lhs);
00071     sqrtdiag_lhs=NULL;
00072     if (ld_mean_lhs!=ld_mean_rhs)
00073         SG_FREE(ld_mean_rhs);
00074     ld_mean_rhs=NULL;
00075     SG_FREE(ld_mean_lhs);
00076     ld_mean_lhs=NULL;
00077 
00078     sqrtdiag_lhs=SG_MALLOC(float64_t, l->get_num_vectors());
00079     ld_mean_lhs=SG_MALLOC(float64_t, l->get_num_vectors());
00080 
00081     for (i=0; i<l->get_num_vectors(); i++)
00082         sqrtdiag_lhs[i]=1;
00083 
00084     if (l==r)
00085     {
00086         sqrtdiag_rhs=sqrtdiag_lhs;
00087         ld_mean_rhs=ld_mean_lhs;
00088     }
00089     else
00090     {
00091         sqrtdiag_rhs=SG_MALLOC(float64_t, r->get_num_vectors());
00092         for (i=0; i<r->get_num_vectors(); i++)
00093             sqrtdiag_rhs[i]=1;
00094 
00095         ld_mean_rhs=SG_MALLOC(float64_t, r->get_num_vectors());
00096     }
00097 
00098     float64_t* l_ld_mean_lhs=ld_mean_lhs;
00099     float64_t* l_ld_mean_rhs=ld_mean_rhs;
00100 
00101     //from our knowledge first normalize variance to 1 and then norm=1 does the job
00102     if (!initialized)
00103     {
00104         int32_t num_vectors=l->get_num_vectors();
00105         num_symbols=(int32_t) l->get_num_symbols();
00106         int32_t llen=l->get_vector_length(0);
00107         int32_t rlen=r->get_vector_length(0);
00108         num_params=(int32_t) llen*l->get_num_symbols();
00109         int32_t num_params2=(int32_t) llen*l->get_num_symbols()+rlen*r->get_num_symbols();
00110         if ((!estimate) || (!estimate->check_models()))
00111         {
00112             SG_ERROR( "no estimate available\n");
00113             return false ;
00114         } ;
00115         if (num_params2!=estimate->get_num_params())
00116         {
00117             SG_ERROR( "number of parameters of estimate and feature representation do not match\n");
00118             return false ;
00119         } ;
00120 
00121         SG_FREE(variance);
00122         SG_FREE(mean);
00123         mean=SG_MALLOC(float64_t, num_params);
00124         ASSERT(mean);
00125         variance=SG_MALLOC(float64_t, num_params);
00126         ASSERT(variance);
00127 
00128         for (i=0; i<num_params; i++)
00129         {
00130             mean[i]=0;
00131             variance[i]=0;
00132         }
00133 
00134 
00135         // compute mean
00136         for (i=0; i<num_vectors; i++)
00137         {
00138             int32_t len;
00139             bool free_vec;
00140             uint16_t* vec=l->get_feature_vector(i, len, free_vec);
00141 
00142             for (int32_t j=0; j<len; j++)
00143             {
00144                 int32_t idx=compute_index(j, vec[j]);
00145                 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ;
00146                 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ;
00147                 float64_t value   = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00148 
00149                 mean[idx]   += value/num_vectors ;
00150             }
00151             l->free_feature_vector(vec, i, free_vec);
00152         }
00153 
00154         // compute variance
00155         for (i=0; i<num_vectors; i++)
00156         {
00157             int32_t len;
00158             bool free_vec;
00159             uint16_t* vec=l->get_feature_vector(i, len, free_vec);
00160 
00161             for (int32_t j=0; j<len; j++)
00162             {
00163                 for (int32_t k=0; k<4; k++)
00164                 {
00165                     int32_t idx=compute_index(j, k);
00166                     if (k!=vec[j])
00167                         variance[idx]+=mean[idx]*mean[idx]/num_vectors;
00168                     else
00169                     {
00170                         float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ;
00171                         float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ;
00172                         float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00173 
00174                         variance[idx] += CMath::sq(value-mean[idx])/num_vectors;
00175                     }
00176                 }
00177             }
00178             l->free_feature_vector(vec, i, free_vec);
00179         }
00180 
00181 
00182         // compute sum_i m_i^2/s_i^2
00183         sum_m2_s2=0 ;
00184         for (i=0; i<num_params; i++)
00185         {
00186             if (variance[i]<1e-14) // then it is likely to be numerical inaccuracy
00187                 variance[i]=1 ;
00188 
00189             //fprintf(stderr, "%i: mean=%1.2e  std=%1.2e\n", i, mean[i], std[i]) ;
00190             sum_m2_s2 += mean[i]*mean[i]/(variance[i]) ;
00191         } ;
00192     } 
00193 
00194     // compute sum of 
00195     //result -= feature*mean[a_idx]/variance[a_idx] ;
00196 
00197     for (i=0; i<l->get_num_vectors(); i++)
00198     {
00199         int32_t alen ;
00200         bool free_avec;
00201         uint16_t* avec=l->get_feature_vector(i, alen, free_avec);
00202         float64_t  result=0 ;
00203         for (int32_t j=0; j<alen; j++)
00204         {
00205             int32_t a_idx = compute_index(j, avec[j]) ;
00206             float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[j], j) ;
00207             float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[j], j) ;
00208             float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00209 
00210             if (variance[a_idx]!=0)
00211                 result-=value*mean[a_idx]/variance[a_idx];
00212         }
00213         ld_mean_lhs[i]=result ;
00214 
00215         l->free_feature_vector(avec, i, free_avec);
00216     }
00217 
00218     if (ld_mean_lhs!=ld_mean_rhs)
00219     {
00220         // compute sum of 
00221         //result -= feature*mean[b_idx]/variance[b_idx] ;
00222         for (i=0; i<r->get_num_vectors(); i++)
00223         {
00224             int32_t alen;
00225             bool free_avec;
00226             uint16_t* avec=r->get_feature_vector(i, alen, free_avec);
00227             float64_t  result=0;
00228 
00229             for (int32_t j=0; j<alen; j++)
00230             {
00231                 int32_t a_idx = compute_index(j, avec[j]) ;
00232                 float64_t theta_p=1/estimate->log_derivative_pos_obsolete(
00233                     avec[j], j) ;
00234                 float64_t theta_n=1/estimate->log_derivative_neg_obsolete(
00235                     avec[j], j) ;
00236                 float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n));
00237 
00238                 result -= value*mean[a_idx]/variance[a_idx] ;
00239             }
00240 
00241             ld_mean_rhs[i]=result;
00242             r->free_feature_vector(avec, i, free_avec);
00243         }
00244     }
00245 
00246     //warning hacky
00247     //
00248     this->lhs=l;
00249     this->rhs=l;
00250     ld_mean_lhs = l_ld_mean_lhs ;
00251     ld_mean_rhs = l_ld_mean_lhs ;
00252 
00253     //compute normalize to 1 values
00254     for (i=0; i<lhs->get_num_vectors(); i++)
00255     {
00256         sqrtdiag_lhs[i]=sqrt(compute(i,i));
00257 
00258         //trap divide by zero exception
00259         if (sqrtdiag_lhs[i]==0)
00260             sqrtdiag_lhs[i]=1e-16;
00261     }
00262 
00263     // if lhs is different from rhs (train/test data)
00264     // compute also the normalization for rhs
00265     if (sqrtdiag_lhs!=sqrtdiag_rhs)
00266     {
00267         this->lhs=r;
00268         this->rhs=r;
00269         ld_mean_lhs = l_ld_mean_rhs ;
00270         ld_mean_rhs = l_ld_mean_rhs ;
00271 
00272         //compute normalize to 1 values
00273         for (i=0; i<rhs->get_num_vectors(); i++)
00274         {
00275             sqrtdiag_rhs[i]=sqrt(compute(i,i));
00276 
00277             //trap divide by zero exception
00278             if (sqrtdiag_rhs[i]==0)
00279                 sqrtdiag_rhs[i]=1e-16;
00280         }
00281     }
00282 
00283     this->lhs=l;
00284     this->rhs=r;
00285     ld_mean_lhs = l_ld_mean_lhs ;
00286     ld_mean_rhs = l_ld_mean_rhs ;
00287 
00288     initialized = true ;
00289     return init_normalizer();
00290 }
00291 
00292 void CSalzbergWordStringKernel::cleanup()
00293 {
00294     SG_FREE(variance);
00295     variance=NULL;
00296 
00297     SG_FREE(mean);
00298     mean=NULL;
00299 
00300     if (sqrtdiag_lhs != sqrtdiag_rhs)
00301         SG_FREE(sqrtdiag_rhs);
00302     sqrtdiag_rhs=NULL;
00303 
00304     SG_FREE(sqrtdiag_lhs);
00305     sqrtdiag_lhs=NULL;
00306 
00307     if (ld_mean_lhs!=ld_mean_rhs)
00308         SG_FREE(ld_mean_rhs);
00309     ld_mean_rhs=NULL;
00310 
00311     SG_FREE(ld_mean_lhs);
00312     ld_mean_lhs=NULL;
00313 
00314     CKernel::cleanup();
00315 }
00316 
00317 float64_t CSalzbergWordStringKernel::compute(int32_t idx_a, int32_t idx_b)
00318 {
00319     int32_t alen, blen;
00320     bool free_avec, free_bvec;
00321     uint16_t* avec=((CStringFeatures<uint16_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00322     uint16_t* bvec=((CStringFeatures<uint16_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00323     // can only deal with strings of same length
00324     ASSERT(alen==blen);
00325 
00326     float64_t result = sum_m2_s2 ; // does not contain 0-th element
00327 
00328     for (int32_t i=0; i<alen; i++)
00329     {
00330         if (avec[i]==bvec[i])
00331         {
00332             int32_t a_idx = compute_index(i, avec[i]) ;
00333 
00334             float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[i], i) ;
00335             float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[i], i) ;
00336             float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00337 
00338             result   += value*value/variance[a_idx] ;
00339         }
00340     }
00341     result += ld_mean_lhs[idx_a] + ld_mean_rhs[idx_b] ;
00342 
00343     ((CStringFeatures<uint16_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00344     ((CStringFeatures<uint16_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00345 
00346     if (initialized)
00347         result /=  (sqrtdiag_lhs[idx_a]*sqrtdiag_rhs[idx_b]) ;
00348 
00349     return result;
00350 }
00351 
00352 void CSalzbergWordStringKernel::set_prior_probs_from_labels(CLabels* labels)
00353 {
00354     ASSERT(labels);
00355 
00356     int32_t num_pos=0, num_neg=0;
00357     for (int32_t i=0; i<labels->get_num_labels(); i++)
00358     {
00359         if (labels->get_int_label(i)==1)
00360             num_pos++;
00361         if (labels->get_int_label(i)==-1)
00362             num_neg++;
00363     }
00364 
00365     SG_INFO("priors: pos=%1.3f (%i)  neg=%1.3f (%i)\n",
00366         (float64_t) num_pos/(num_pos+num_neg), num_pos,
00367         (float64_t) num_neg/(num_pos+num_neg), num_neg);
00368 
00369     set_prior_probs(
00370         (float64_t)num_pos/(num_pos+num_neg),
00371         (float64_t)num_neg/(num_pos+num_neg));
00372 }
00373 
00374 void CSalzbergWordStringKernel::init()
00375 {
00376     estimate=NULL;
00377     mean=NULL;
00378     variance=NULL;
00379 
00380     sqrtdiag_lhs=NULL;
00381     sqrtdiag_rhs=NULL;
00382 
00383     ld_mean_lhs=NULL;
00384     ld_mean_rhs=NULL;
00385 
00386     num_params=0;
00387     num_symbols=0;
00388     sum_m2_s2=0;
00389     pos_prior=0.5;
00390 
00391     neg_prior=0.5;
00392     initialized=false;
00393 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation