00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/lib/common.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/kernel/SalzbergWordStringKernel.h>
00014 #include <shogun/features/Features.h>
00015 #include <shogun/features/StringFeatures.h>
00016 #include <shogun/features/Labels.h>
00017 #include <shogun/classifier/PluginEstimate.h>
00018
00019 using namespace shogun;
00020
00021 CSalzbergWordStringKernel::CSalzbergWordStringKernel()
00022 : CStringKernel<uint16_t>(0)
00023 {
00024 init();
00025 }
00026
00027 CSalzbergWordStringKernel::CSalzbergWordStringKernel(int32_t size, CPluginEstimate* pie, CLabels* labels)
00028 : CStringKernel<uint16_t>(size)
00029 {
00030 init();
00031 estimate=pie;
00032
00033 if (labels)
00034 set_prior_probs_from_labels(labels);
00035 }
00036
00037 CSalzbergWordStringKernel::CSalzbergWordStringKernel(
00038 CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r,
00039 CPluginEstimate* pie, CLabels* labels)
00040 : CStringKernel<uint16_t>(10),estimate(pie)
00041 {
00042 init();
00043 estimate=pie;
00044
00045 if (labels)
00046 set_prior_probs_from_labels(labels);
00047
00048 init(l, r);
00049 }
00050
00051 CSalzbergWordStringKernel::~CSalzbergWordStringKernel()
00052 {
00053 cleanup();
00054 }
00055
00056 bool CSalzbergWordStringKernel::init(CFeatures* p_l, CFeatures* p_r)
00057 {
00058 CStringKernel<uint16_t>::init(p_l,p_r);
00059 CStringFeatures<uint16_t>* l=(CStringFeatures<uint16_t>*) p_l;
00060 ASSERT(l);
00061 CStringFeatures<uint16_t>* r=(CStringFeatures<uint16_t>*) p_r;
00062 ASSERT(r);
00063
00064 int32_t i;
00065 initialized=false;
00066
00067 if (sqrtdiag_lhs!=sqrtdiag_rhs)
00068 SG_FREE(sqrtdiag_rhs);
00069 sqrtdiag_rhs=NULL;
00070 SG_FREE(sqrtdiag_lhs);
00071 sqrtdiag_lhs=NULL;
00072 if (ld_mean_lhs!=ld_mean_rhs)
00073 SG_FREE(ld_mean_rhs);
00074 ld_mean_rhs=NULL;
00075 SG_FREE(ld_mean_lhs);
00076 ld_mean_lhs=NULL;
00077
00078 sqrtdiag_lhs=SG_MALLOC(float64_t, l->get_num_vectors());
00079 ld_mean_lhs=SG_MALLOC(float64_t, l->get_num_vectors());
00080
00081 for (i=0; i<l->get_num_vectors(); i++)
00082 sqrtdiag_lhs[i]=1;
00083
00084 if (l==r)
00085 {
00086 sqrtdiag_rhs=sqrtdiag_lhs;
00087 ld_mean_rhs=ld_mean_lhs;
00088 }
00089 else
00090 {
00091 sqrtdiag_rhs=SG_MALLOC(float64_t, r->get_num_vectors());
00092 for (i=0; i<r->get_num_vectors(); i++)
00093 sqrtdiag_rhs[i]=1;
00094
00095 ld_mean_rhs=SG_MALLOC(float64_t, r->get_num_vectors());
00096 }
00097
00098 float64_t* l_ld_mean_lhs=ld_mean_lhs;
00099 float64_t* l_ld_mean_rhs=ld_mean_rhs;
00100
00101
00102 if (!initialized)
00103 {
00104 int32_t num_vectors=l->get_num_vectors();
00105 num_symbols=(int32_t) l->get_num_symbols();
00106 int32_t llen=l->get_vector_length(0);
00107 int32_t rlen=r->get_vector_length(0);
00108 num_params=(int32_t) llen*l->get_num_symbols();
00109 int32_t num_params2=(int32_t) llen*l->get_num_symbols()+rlen*r->get_num_symbols();
00110 if ((!estimate) || (!estimate->check_models()))
00111 {
00112 SG_ERROR( "no estimate available\n");
00113 return false ;
00114 } ;
00115 if (num_params2!=estimate->get_num_params())
00116 {
00117 SG_ERROR( "number of parameters of estimate and feature representation do not match\n");
00118 return false ;
00119 } ;
00120
00121 SG_FREE(variance);
00122 SG_FREE(mean);
00123 mean=SG_MALLOC(float64_t, num_params);
00124 ASSERT(mean);
00125 variance=SG_MALLOC(float64_t, num_params);
00126 ASSERT(variance);
00127
00128 for (i=0; i<num_params; i++)
00129 {
00130 mean[i]=0;
00131 variance[i]=0;
00132 }
00133
00134
00135
00136 for (i=0; i<num_vectors; i++)
00137 {
00138 int32_t len;
00139 bool free_vec;
00140 uint16_t* vec=l->get_feature_vector(i, len, free_vec);
00141
00142 for (int32_t j=0; j<len; j++)
00143 {
00144 int32_t idx=compute_index(j, vec[j]);
00145 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ;
00146 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ;
00147 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00148
00149 mean[idx] += value/num_vectors ;
00150 }
00151 l->free_feature_vector(vec, i, free_vec);
00152 }
00153
00154
00155 for (i=0; i<num_vectors; i++)
00156 {
00157 int32_t len;
00158 bool free_vec;
00159 uint16_t* vec=l->get_feature_vector(i, len, free_vec);
00160
00161 for (int32_t j=0; j<len; j++)
00162 {
00163 for (int32_t k=0; k<4; k++)
00164 {
00165 int32_t idx=compute_index(j, k);
00166 if (k!=vec[j])
00167 variance[idx]+=mean[idx]*mean[idx]/num_vectors;
00168 else
00169 {
00170 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ;
00171 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ;
00172 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00173
00174 variance[idx] += CMath::sq(value-mean[idx])/num_vectors;
00175 }
00176 }
00177 }
00178 l->free_feature_vector(vec, i, free_vec);
00179 }
00180
00181
00182
00183 sum_m2_s2=0 ;
00184 for (i=0; i<num_params; i++)
00185 {
00186 if (variance[i]<1e-14)
00187 variance[i]=1 ;
00188
00189
00190 sum_m2_s2 += mean[i]*mean[i]/(variance[i]) ;
00191 } ;
00192 }
00193
00194
00195
00196
00197 for (i=0; i<l->get_num_vectors(); i++)
00198 {
00199 int32_t alen ;
00200 bool free_avec;
00201 uint16_t* avec=l->get_feature_vector(i, alen, free_avec);
00202 float64_t result=0 ;
00203 for (int32_t j=0; j<alen; j++)
00204 {
00205 int32_t a_idx = compute_index(j, avec[j]) ;
00206 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[j], j) ;
00207 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[j], j) ;
00208 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00209
00210 if (variance[a_idx]!=0)
00211 result-=value*mean[a_idx]/variance[a_idx];
00212 }
00213 ld_mean_lhs[i]=result ;
00214
00215 l->free_feature_vector(avec, i, free_avec);
00216 }
00217
00218 if (ld_mean_lhs!=ld_mean_rhs)
00219 {
00220
00221
00222 for (i=0; i<r->get_num_vectors(); i++)
00223 {
00224 int32_t alen;
00225 bool free_avec;
00226 uint16_t* avec=r->get_feature_vector(i, alen, free_avec);
00227 float64_t result=0;
00228
00229 for (int32_t j=0; j<alen; j++)
00230 {
00231 int32_t a_idx = compute_index(j, avec[j]) ;
00232 float64_t theta_p=1/estimate->log_derivative_pos_obsolete(
00233 avec[j], j) ;
00234 float64_t theta_n=1/estimate->log_derivative_neg_obsolete(
00235 avec[j], j) ;
00236 float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n));
00237
00238 result -= value*mean[a_idx]/variance[a_idx] ;
00239 }
00240
00241 ld_mean_rhs[i]=result;
00242 r->free_feature_vector(avec, i, free_avec);
00243 }
00244 }
00245
00246
00247
00248 this->lhs=l;
00249 this->rhs=l;
00250 ld_mean_lhs = l_ld_mean_lhs ;
00251 ld_mean_rhs = l_ld_mean_lhs ;
00252
00253
00254 for (i=0; i<lhs->get_num_vectors(); i++)
00255 {
00256 sqrtdiag_lhs[i]=sqrt(compute(i,i));
00257
00258
00259 if (sqrtdiag_lhs[i]==0)
00260 sqrtdiag_lhs[i]=1e-16;
00261 }
00262
00263
00264
00265 if (sqrtdiag_lhs!=sqrtdiag_rhs)
00266 {
00267 this->lhs=r;
00268 this->rhs=r;
00269 ld_mean_lhs = l_ld_mean_rhs ;
00270 ld_mean_rhs = l_ld_mean_rhs ;
00271
00272
00273 for (i=0; i<rhs->get_num_vectors(); i++)
00274 {
00275 sqrtdiag_rhs[i]=sqrt(compute(i,i));
00276
00277
00278 if (sqrtdiag_rhs[i]==0)
00279 sqrtdiag_rhs[i]=1e-16;
00280 }
00281 }
00282
00283 this->lhs=l;
00284 this->rhs=r;
00285 ld_mean_lhs = l_ld_mean_lhs ;
00286 ld_mean_rhs = l_ld_mean_rhs ;
00287
00288 initialized = true ;
00289 return init_normalizer();
00290 }
00291
00292 void CSalzbergWordStringKernel::cleanup()
00293 {
00294 SG_FREE(variance);
00295 variance=NULL;
00296
00297 SG_FREE(mean);
00298 mean=NULL;
00299
00300 if (sqrtdiag_lhs != sqrtdiag_rhs)
00301 SG_FREE(sqrtdiag_rhs);
00302 sqrtdiag_rhs=NULL;
00303
00304 SG_FREE(sqrtdiag_lhs);
00305 sqrtdiag_lhs=NULL;
00306
00307 if (ld_mean_lhs!=ld_mean_rhs)
00308 SG_FREE(ld_mean_rhs);
00309 ld_mean_rhs=NULL;
00310
00311 SG_FREE(ld_mean_lhs);
00312 ld_mean_lhs=NULL;
00313
00314 CKernel::cleanup();
00315 }
00316
00317 float64_t CSalzbergWordStringKernel::compute(int32_t idx_a, int32_t idx_b)
00318 {
00319 int32_t alen, blen;
00320 bool free_avec, free_bvec;
00321 uint16_t* avec=((CStringFeatures<uint16_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00322 uint16_t* bvec=((CStringFeatures<uint16_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00323
00324 ASSERT(alen==blen);
00325
00326 float64_t result = sum_m2_s2 ;
00327
00328 for (int32_t i=0; i<alen; i++)
00329 {
00330 if (avec[i]==bvec[i])
00331 {
00332 int32_t a_idx = compute_index(i, avec[i]) ;
00333
00334 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[i], i) ;
00335 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[i], i) ;
00336 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00337
00338 result += value*value/variance[a_idx] ;
00339 }
00340 }
00341 result += ld_mean_lhs[idx_a] + ld_mean_rhs[idx_b] ;
00342
00343 ((CStringFeatures<uint16_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00344 ((CStringFeatures<uint16_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00345
00346 if (initialized)
00347 result /= (sqrtdiag_lhs[idx_a]*sqrtdiag_rhs[idx_b]) ;
00348
00349 return result;
00350 }
00351
00352 void CSalzbergWordStringKernel::set_prior_probs_from_labels(CLabels* labels)
00353 {
00354 ASSERT(labels);
00355
00356 int32_t num_pos=0, num_neg=0;
00357 for (int32_t i=0; i<labels->get_num_labels(); i++)
00358 {
00359 if (labels->get_int_label(i)==1)
00360 num_pos++;
00361 if (labels->get_int_label(i)==-1)
00362 num_neg++;
00363 }
00364
00365 SG_INFO("priors: pos=%1.3f (%i) neg=%1.3f (%i)\n",
00366 (float64_t) num_pos/(num_pos+num_neg), num_pos,
00367 (float64_t) num_neg/(num_pos+num_neg), num_neg);
00368
00369 set_prior_probs(
00370 (float64_t)num_pos/(num_pos+num_neg),
00371 (float64_t)num_neg/(num_pos+num_neg));
00372 }
00373
00374 void CSalzbergWordStringKernel::init()
00375 {
00376 estimate=NULL;
00377 mean=NULL;
00378 variance=NULL;
00379
00380 sqrtdiag_lhs=NULL;
00381 sqrtdiag_rhs=NULL;
00382
00383 ld_mean_lhs=NULL;
00384 ld_mean_rhs=NULL;
00385
00386 num_params=0;
00387 num_symbols=0;
00388 sum_m2_s2=0;
00389 pos_prior=0.5;
00390
00391 neg_prior=0.5;
00392 initialized=false;
00393 }