00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <shogun/statistics/LinearTimeMMD.h>
00011 #include <shogun/features/Features.h>
00012 #include <shogun/features/streaming/StreamingFeatures.h>
00013 #include <shogun/mathematics/Statistics.h>
00014 #include <shogun/features/CombinedFeatures.h>
00015 #include <shogun/kernel/CombinedKernel.h>
00016
00017 #include <shogun/lib/external/libqp.h>
00018
00019 using namespace shogun;
00020
00021 CLinearTimeMMD::CLinearTimeMMD() :
00022 CKernelTwoSampleTestStatistic()
00023 {
00024 init();
00025 }
00026
00027 CLinearTimeMMD::CLinearTimeMMD(CKernel* kernel, CStreamingFeatures* p,
00028 CStreamingFeatures* q, index_t m, index_t blocksize) :
00029 CKernelTwoSampleTestStatistic(kernel, NULL, m)
00030 {
00031 init();
00032
00033 m_streaming_p=p;
00034 SG_REF(m_streaming_p);
00035
00036 m_streaming_q=q;
00037 SG_REF(m_streaming_q);
00038
00039 m_blocksize=blocksize;
00040 }
00041
00042 CLinearTimeMMD::~CLinearTimeMMD()
00043 {
00044 SG_UNREF(m_streaming_p);
00045 SG_UNREF(m_streaming_q);
00046
00047
00048 }
00049
00050 void CLinearTimeMMD::init()
00051 {
00052 #ifdef HAVE_LAPACK
00053 SG_ADD(&m_opt_max_iterations, "opt_max_iterations", "Maximum number of "
00054 "iterations for qp solver", MS_NOT_AVAILABLE);
00055 SG_ADD(&m_opt_epsilon, "opt_epsilon", "Stopping criterion for qp solver",
00056 MS_NOT_AVAILABLE);
00057 SG_ADD(&m_opt_low_cut, "opt_low_cut", "Low cut value for optimization "
00058 "kernel weights", MS_NOT_AVAILABLE);
00059 SG_ADD(&m_opt_regularization_eps, "opt_regularization_eps", "Regularization"
00060 " value that is added to diagonal of Q matrix", MS_NOT_AVAILABLE);
00061
00062 m_opt_max_iterations=10000;
00063 m_opt_epsilon=10E-15;
00064 m_opt_low_cut=10E-7;
00065 m_opt_regularization_eps=0;
00066 #endif //HAVE_LAPACK
00067
00068 m_streaming_p=NULL;
00069 m_streaming_q=NULL;
00070 m_blocksize=10000;
00071
00072 SG_WARNING("%s::init(): register params!\n", get_name());
00073 }
00074
00075 void CLinearTimeMMD::compute_statistic_and_variance(
00076 float64_t& statistic, float64_t& variance)
00077 {
00078 SG_DEBUG("entering CLinearTimeMMD::compute_statistic_and_variance()\n");
00079
00080 REQUIRE(m_streaming_p, "%s::compute_statistic_and_variance: streaming "
00081 "features p required!\n", get_name());
00082 REQUIRE(m_streaming_q, "%s::compute_statistic_and_variance: streaming "
00083 "features q required!\n", get_name());
00084
00085 REQUIRE(m_kernel, "%s::compute_statistic_and_variance: kernel needed!\n",
00086 get_name());
00087
00088
00089
00090 index_t m_2=m_m/2;
00091
00092 SG_DEBUG("m_m=%d\n", m_m);
00093
00094
00095 float64_t mean=0;
00096 float64_t M2=0;
00097
00098
00099 float64_t current;
00100 float64_t delta;
00101
00102 index_t num_examples_processed=0;
00103 index_t term_counter=1;
00104 while (num_examples_processed<m_2)
00105 {
00106
00107 index_t num_this_run=CMath::min(m_blocksize, m_2-num_examples_processed);
00108 SG_DEBUG("processing %d more examples. %d so far processed. Blocksize "
00109 "is %d\n", num_this_run, num_examples_processed, m_blocksize);
00110
00111
00112 CFeatures* p1=m_streaming_p->get_streamed_features(num_this_run);
00113 CFeatures* p2=m_streaming_p->get_streamed_features(num_this_run);
00114 CFeatures* q1=m_streaming_q->get_streamed_features(num_this_run);
00115 CFeatures* q2=m_streaming_q->get_streamed_features(num_this_run);
00116 SG_REF(p1);
00117 SG_REF(p2);
00118 SG_REF(q1);
00119 SG_REF(q2);
00120
00121
00122 SG_DEBUG("processing kernel diagonal pp\n");
00123 m_kernel->init(p1, p2);
00124 SGVector<float64_t> pp=m_kernel->get_kernel_diagonal();
00125
00126 SG_DEBUG("processing kernel diagonal qq\n");
00127 m_kernel->init(q1, q2);
00128 SGVector<float64_t> qq=m_kernel->get_kernel_diagonal();
00129
00130 SG_DEBUG("processing kernel diagonal pq\n");
00131 m_kernel->init(p1, q2);
00132 SGVector<float64_t> pq=m_kernel->get_kernel_diagonal();
00133
00134 SG_DEBUG("processing kernel diagonal qp\n");
00135 m_kernel->init(q1, p2);
00136 SGVector<float64_t> qp=m_kernel->get_kernel_diagonal();
00137
00138
00139
00140 for (index_t i=0; i<num_this_run; ++i)
00141 {
00142
00143 current=pp[i]+qq[i]-pq[i]-qp[i];
00144
00145
00146 delta=current-mean;
00147 mean=mean+delta/term_counter++;
00148 M2=M2+delta*(current-mean);
00149
00150 SG_DEBUG("burst: current=%f, delta=%f, mean=%f, M2=%f\n",
00151 current, delta, mean, M2);
00152 }
00153
00154
00155 SG_UNREF(p1);
00156 SG_UNREF(p2);
00157 SG_UNREF(q1);
00158 SG_UNREF(q2);
00159
00160
00161 num_examples_processed+=num_this_run;
00162 }
00163 SG_DEBUG("Done compouting statistic, processed 2*%d examples.\n",
00164 num_examples_processed);
00165
00166
00167 statistic=mean;
00168 SG_DEBUG("statistic %f\n", statistic);
00169
00170
00171
00172
00173 variance=M2/(m_2-1)/m_2;
00174 SG_DEBUG("variance: %f\n", variance);
00175
00176 SG_DEBUG("leaving CLinearTimeMMD::compute_statistic_and_variance()\n");
00177 }
00178
00179 float64_t CLinearTimeMMD::compute_statistic()
00180 {
00181 float64_t statistic=0;
00182 float64_t variance=0;
00183
00184
00185 compute_statistic_and_variance(statistic, variance);
00186
00187 return statistic;
00188 }
00189
00190 float64_t CLinearTimeMMD::compute_variance_estimate()
00191 {
00192 float64_t statistic=0;
00193 float64_t variance=0;
00194
00195
00196 compute_statistic_and_variance(statistic, variance);
00197
00198 return variance;
00199 }
00200
00201 float64_t CLinearTimeMMD::compute_p_value(float64_t statistic)
00202 {
00203 float64_t result=0;
00204
00205 switch (m_null_approximation_method)
00206 {
00207 case MMD1_GAUSSIAN:
00208 {
00209
00210 float64_t std_dev=CMath::sqrt(compute_variance_estimate());
00211 result=1.0-CStatistics::normal_cdf(statistic, std_dev);
00212 }
00213 break;
00214
00215 default:
00216
00217 result=CKernelTwoSampleTestStatistic::compute_p_value(statistic);
00218 break;
00219 }
00220
00221 return result;
00222 }
00223
00224 float64_t CLinearTimeMMD::compute_threshold(float64_t alpha)
00225 {
00226 float64_t result=0;
00227
00228 switch (m_null_approximation_method)
00229 {
00230 case MMD1_GAUSSIAN:
00231 {
00232
00233 float64_t std_dev=CMath::sqrt(compute_variance_estimate());
00234 result=1.0-CStatistics::inverse_normal_cdf(1-alpha, 0, std_dev);
00235 }
00236 break;
00237
00238 default:
00239
00240 result=CKernelTwoSampleTestStatistic::compute_threshold(alpha);
00241 break;
00242 }
00243
00244 return result;
00245 }
00246
00247 float64_t CLinearTimeMMD::perform_test()
00248 {
00249 float64_t result=0;
00250
00251 switch (m_null_approximation_method)
00252 {
00253 case MMD1_GAUSSIAN:
00254 {
00255
00256 float64_t statistic;
00257 float64_t variance;
00258 compute_statistic_and_variance(statistic, variance);
00259
00260
00261 result=1.0-CStatistics::normal_cdf(statistic,
00262 CMath::sqrt(variance));
00263 }
00264 break;
00265
00266 default:
00267
00268 result=CTestStatistic::perform_test();
00269 break;
00270 }
00271
00272 return result;
00273 }
00274
00275 SGVector<float64_t> CLinearTimeMMD::bootstrap_null()
00276 {
00277 SGVector<float64_t> samples(m_bootstrap_iterations);
00278
00279
00280
00281
00282 CStreamingFeatures* p=m_streaming_p;
00283 CStreamingFeatures* q=m_streaming_q;
00284 SG_REF(p);
00285 SG_REF(q);
00286 for (index_t i=0; i<m_bootstrap_iterations; ++i)
00287 {
00288
00289 if (CMath::random(0,1))
00290 m_streaming_p=p;
00291 else
00292 m_streaming_p=q;
00293
00294 if (CMath::random(0,1))
00295 m_streaming_q=p;
00296 else
00297 m_streaming_q=q;
00298
00299
00300 samples[i]=compute_statistic();
00301 }
00302 m_streaming_p=p;
00303 m_streaming_q=q;
00304 SG_UNREF(p);
00305 SG_UNREF(q);
00306
00307 return samples;
00308 }
00309
00310 #ifdef HAVE_LAPACK
00311 void CLinearTimeMMD::optimize_kernel_weights()
00312 {
00315 if (m_kernel->get_kernel_type()!=K_COMBINED)
00316 {
00317 SG_ERROR("CLinearTimeMMD::optimize_kernel_weights(): Only possible "
00318 "with a combined kernel!\n");
00319 }
00320
00321 if (m_p_and_q->get_feature_class()!=C_COMBINED)
00322 {
00323 SG_ERROR("CLinearTimeMMD::optimize_kernel_weights(): Only possible "
00324 "with combined features!\n");
00325 }
00326
00327
00328 CCombinedFeatures* combined_p_and_q=
00329 dynamic_cast<CCombinedFeatures*>(m_p_and_q);
00330 CCombinedKernel* combined_kernel=dynamic_cast<CCombinedKernel*>(m_kernel);
00331 ASSERT(combined_p_and_q);
00332 ASSERT(combined_kernel);
00333
00334 if (combined_kernel->get_num_subkernels()!=
00335 combined_p_and_q->get_num_feature_obj())
00336 {
00337 SG_ERROR("CLinearTimeMMD::optimize_kernel_weights(): Only possible "
00338 "when number of sub-kernels (%d) equal number of sub-features "
00339 "(%d)\n", combined_kernel->get_num_subkernels(),
00340 combined_p_and_q->get_num_feature_obj());
00341 }
00342
00343
00344 m_kernel->init(m_p_and_q, m_p_and_q);
00345
00346
00347 index_t num_kernels=combined_kernel->get_num_subkernels();
00348 index_t m2=m_m/2;
00349
00350
00351 SGMatrix<float64_t> hs(m2, num_kernels);
00352
00353
00354 SGVector<float64_t> mmds(num_kernels);
00355
00356 float64_t pp;
00357 float64_t qq;
00358 float64_t pq;
00359 float64_t qp;
00360
00361 for (index_t i=0; i<num_kernels; ++i)
00362 {
00363 CKernel* current=combined_kernel->get_kernel(i);
00364 mmds[i]=0;
00365 for (index_t j=0; j<m2; ++j)
00366 {
00367 pp=current->kernel(j, m2+j);
00368 qq=current->kernel(m_m+j, m_m+m2+j);
00369 pq=current->kernel(j, m_m+m2+j);
00370 qp=current->kernel(m2+j, m_m+j);
00371 hs(j, i)=pp+qq-pq-qp;
00372 mmds[i]+=hs(j, i);
00373 }
00374
00375
00376 mmds[i]/=m2;
00377
00378 SG_UNREF(current);
00379 }
00380
00381
00382
00383 m_Q=CStatistics::covariance_matrix(hs, true);
00384
00385
00386
00387 if (m_opt_regularization_eps)
00388 {
00389 SG_DEBUG("regularizing matrix Q by adding %f to diagonal\n",
00390 m_opt_regularization_eps);
00391 for (index_t i=0; i<num_kernels; ++i)
00392 m_Q(i,i)+=m_opt_regularization_eps;
00393 }
00394
00395 if (sg_io->get_loglevel()==MSG_DEBUG)
00396 {
00397 m_Q.display_matrix("(evtl. regularized) Q");
00398 mmds.display_vector("mmds");
00399 }
00400
00401
00402 float64_t sum_mmds=0;
00403 for (index_t i=0; i<mmds.vlen; ++i)
00404 sum_mmds+=mmds[i];
00405
00406
00407
00408
00409
00410 SGVector<float64_t> Q_diag(num_kernels);
00411 SGVector<float64_t> f(num_kernels);
00412 SGVector<float64_t> lb(num_kernels);
00413 SGVector<float64_t> ub(num_kernels);
00414 SGVector<float64_t> x(num_kernels);
00415
00416
00417
00418 bool one_pos;
00419 for (index_t i=0; i<mmds.vlen; ++i)
00420 {
00421 if (mmds[i]>0)
00422 {
00423 SG_DEBUG("found at least one positive MMD\n");
00424 one_pos=true;
00425 break;
00426 }
00427 one_pos=false;
00428 }
00429
00430 if (!one_pos)
00431 {
00432 SG_WARNING("All mmd estimates are negative. This is techical possible,"
00433 " although extremely rare. Current problem might bad\n");
00434
00435
00436 for (index_t i=0; i<num_kernels*num_kernels; ++i)
00437 m_Q.matrix[i]=-m_Q.matrix[i];
00438 }
00439
00440
00441 for (index_t i=0; i<num_kernels; ++i)
00442 {
00443 Q_diag[i]=m_Q(i,i);
00444 f[i]=0;
00445 lb[i]=0;
00446 ub[i]=CMath::INFTY;
00447
00448
00449 x[i]=1.0/sum_mmds;
00450 }
00451
00452
00453 SG_DEBUG("starting libqp optimization\n");
00454 libqp_state_T qp_exitflag=libqp_gsmo_solver(&get_Q_col, Q_diag.vector,
00455 f.vector, mmds.vector,
00456 one_pos ? 1 : -1,
00457 lb.vector, ub.vector,
00458 x.vector, num_kernels, m_opt_max_iterations,
00459 m_opt_regularization_eps, &print_state);
00460
00461 SG_DEBUG("libqp returns: nIts=%d, exit_flag: %d\n", qp_exitflag.nIter,
00462 qp_exitflag.exitflag);
00463
00464
00465 float64_t sum_weights=0;
00466 for (index_t i=0; i<x.vlen; ++i)
00467 {
00468 if (x[i]<m_opt_low_cut)
00469 {
00470 SG_DEBUG("lowcut: weight[%i]=%f<%f; setting to zero\n", i, x[i],
00471 m_opt_low_cut);
00472 x[i]=0;
00473 }
00474
00475 sum_weights+=x[i];
00476 }
00477
00478
00479 for (index_t i=0; i<x.vlen; ++i)
00480 x[i]/=sum_weights;
00481
00482
00483 m_kernel->set_subkernel_weights(x);
00484 }
00485
00486 SGMatrix<float64_t> CLinearTimeMMD::m_Q=SGMatrix<float64_t>();
00487
00488 const float64_t* CLinearTimeMMD::get_Q_col(uint32_t i)
00489 {
00490 return &m_Q[m_Q.num_rows*i];
00491 }
00492
00493 void CLinearTimeMMD::print_state(libqp_state_T state)
00494 {
00495 SG_SDEBUG("libqp state: primal=%f\n", state.QP);
00496 }
00497
00498 #endif //HAVE_LAPACK
00499