SHOGUN: LinearTimeMMD.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2012 Heiko Strathmann
00008  */
00009 
00010 #include <shogun/statistics/LinearTimeMMD.h>
00011 #include <shogun/features/Features.h>
00012 #include <shogun/features/streaming/StreamingFeatures.h>
00013 #include <shogun/mathematics/Statistics.h>
00014 #include <shogun/features/CombinedFeatures.h>
00015 #include <shogun/kernel/CombinedKernel.h>
00016 
00017 #include <shogun/lib/external/libqp.h>
00018 
00019 using namespace shogun;
00020 
00021 CLinearTimeMMD::CLinearTimeMMD() :
00022         CKernelTwoSampleTestStatistic()
00023 {
00024     init();
00025 }
00026 
00027 CLinearTimeMMD::CLinearTimeMMD(CKernel* kernel, CStreamingFeatures* p,
00028         CStreamingFeatures* q, index_t m, index_t blocksize) :
00029         CKernelTwoSampleTestStatistic(kernel, NULL, m)
00030 {
00031     init();
00032 
00033     m_streaming_p=p;
00034     SG_REF(m_streaming_p);
00035 
00036     m_streaming_q=q;
00037     SG_REF(m_streaming_q);
00038 
00039     m_blocksize=blocksize;
00040 }
00041 
00042 CLinearTimeMMD::~CLinearTimeMMD()
00043 {
00044     SG_UNREF(m_streaming_p);
00045     SG_UNREF(m_streaming_q);
00046 
00047     /* m_kernel is SG_UNREFed in base desctructor */
00048 }
00049 
00050 void CLinearTimeMMD::init()
00051 {
00052 #ifdef HAVE_LAPACK
00053     SG_ADD(&m_opt_max_iterations, "opt_max_iterations", "Maximum number of "
00054             "iterations for qp solver", MS_NOT_AVAILABLE);
00055     SG_ADD(&m_opt_epsilon, "opt_epsilon", "Stopping criterion for qp solver",
00056             MS_NOT_AVAILABLE);
00057     SG_ADD(&m_opt_low_cut, "opt_low_cut", "Low cut value for optimization "
00058             "kernel weights", MS_NOT_AVAILABLE);
00059     SG_ADD(&m_opt_regularization_eps, "opt_regularization_eps", "Regularization"
00060             " value that is added to diagonal of Q matrix", MS_NOT_AVAILABLE);
00061 
00062     m_opt_max_iterations=10000;
00063     m_opt_epsilon=10E-15;
00064     m_opt_low_cut=10E-7;
00065     m_opt_regularization_eps=0;
00066 #endif //HAVE_LAPACK
00067 
00068     m_streaming_p=NULL;
00069     m_streaming_q=NULL;
00070     m_blocksize=10000;
00071 
00072     SG_WARNING("%s::init(): register params!\n", get_name());
00073 }
00074 
00075 void CLinearTimeMMD::compute_statistic_and_variance(
00076         float64_t& statistic, float64_t& variance)
00077 {
00078     SG_DEBUG("entering CLinearTimeMMD::compute_statistic_and_variance()\n");
00079 
00080     REQUIRE(m_streaming_p, "%s::compute_statistic_and_variance: streaming "
00081             "features p required!\n", get_name());
00082     REQUIRE(m_streaming_q, "%s::compute_statistic_and_variance: streaming "
00083             "features q required!\n", get_name());
00084 
00085     REQUIRE(m_kernel, "%s::compute_statistic_and_variance: kernel needed!\n",
00086             get_name());
00087 
00088     /* m is number of samples from each distribution, m_2 is half of it
00089      * using names from JLMR paper (see class documentation) */
00090     index_t m_2=m_m/2;
00091 
00092     SG_DEBUG("m_m=%d\n", m_m);
00093 
00094     /* these sums are needed to compute online statistic/variance */
00095     float64_t mean=0;
00096     float64_t M2=0;
00097 
00098     /* temp variable in the algorithm */
00099     float64_t current;
00100     float64_t delta;
00101 
00102     index_t num_examples_processed=0;
00103     index_t term_counter=1;
00104     while (num_examples_processed<m_2)
00105     {
00106         /* number of example to look at in this iteration */
00107         index_t num_this_run=CMath::min(m_blocksize, m_2-num_examples_processed);
00108         SG_DEBUG("processing %d more examples. %d so far processed. Blocksize "
00109                 "is %d\n", num_this_run, num_examples_processed, m_blocksize);
00110 
00111         /* stream data from both distributions */
00112         CFeatures* p1=m_streaming_p->get_streamed_features(num_this_run);
00113         CFeatures* p2=m_streaming_p->get_streamed_features(num_this_run);
00114         CFeatures* q1=m_streaming_q->get_streamed_features(num_this_run);
00115         CFeatures* q2=m_streaming_q->get_streamed_features(num_this_run);
00116         SG_REF(p1);
00117         SG_REF(p2);
00118         SG_REF(q1);
00119         SG_REF(q2);
00120 
00121         /* compute kernel matrix diagonals */
00122         SG_DEBUG("processing kernel diagonal pp\n");
00123         m_kernel->init(p1, p2);
00124         SGVector<float64_t> pp=m_kernel->get_kernel_diagonal();
00125 
00126         SG_DEBUG("processing kernel diagonal qq\n");
00127         m_kernel->init(q1, q2);
00128         SGVector<float64_t> qq=m_kernel->get_kernel_diagonal();
00129 
00130         SG_DEBUG("processing kernel diagonal pq\n");
00131         m_kernel->init(p1, q2);
00132         SGVector<float64_t> pq=m_kernel->get_kernel_diagonal();
00133 
00134         SG_DEBUG("processing kernel diagonal qp\n");
00135         m_kernel->init(q1, p2);
00136         SGVector<float64_t> qp=m_kernel->get_kernel_diagonal();
00137 
00138         /* update mean and variance using Knuth's online variance algorithm.
00139          * C.f. for example Wikipedia */
00140         for (index_t i=0; i<num_this_run; ++i)
00141         {
00142             /* compute sum of current h terms */
00143             current=pp[i]+qq[i]-pq[i]-qp[i];
00144 
00145             /* D. Knuth's online variance algorithm */
00146             delta=current-mean;
00147             mean=mean+delta/term_counter++;
00148             M2=M2+delta*(current-mean);
00149 
00150             SG_DEBUG("burst: current=%f, delta=%f, mean=%f, M2=%f\n",
00151                     current, delta, mean, M2);
00152         }
00153 
00154         /* clean up */
00155         SG_UNREF(p1);
00156         SG_UNREF(p2);
00157         SG_UNREF(q1);
00158         SG_UNREF(q2);
00159 
00160         /* add number of processed examples for this run */
00161         num_examples_processed+=num_this_run;
00162     }
00163     SG_DEBUG("Done compouting statistic, processed 2*%d examples.\n",
00164             num_examples_processed);
00165 
00166     /* mean of sum all traces is linear time mmd */
00167     statistic=mean;
00168     SG_DEBUG("statistic %f\n", statistic);
00169 
00170     /* variance of terms can be computed using mean (statistic).
00171      * Note that the variance needs to be divided by m_2 in order to get
00172      * variance of null-distribution */
00173     variance=M2/(m_2-1)/m_2;
00174     SG_DEBUG("variance: %f\n", variance);
00175 
00176     SG_DEBUG("leaving CLinearTimeMMD::compute_statistic_and_variance()\n");
00177 }
00178 
00179 float64_t CLinearTimeMMD::compute_statistic()
00180 {
00181     float64_t statistic=0;
00182     float64_t variance=0;
00183 
00184     /* use wrapper method */
00185     compute_statistic_and_variance(statistic, variance);
00186 
00187     return statistic;
00188 }
00189 
00190 float64_t CLinearTimeMMD::compute_variance_estimate()
00191 {
00192     float64_t statistic=0;
00193     float64_t variance=0;
00194 
00195     /* use wrapper method */
00196     compute_statistic_and_variance(statistic, variance);
00197 
00198     return variance;
00199 }
00200 
00201 float64_t CLinearTimeMMD::compute_p_value(float64_t statistic)
00202 {
00203     float64_t result=0;
00204 
00205     switch (m_null_approximation_method)
00206     {
00207     case MMD1_GAUSSIAN:
00208         {
00209             /* compute variance and use to estimate Gaussian distribution */
00210             float64_t std_dev=CMath::sqrt(compute_variance_estimate());
00211             result=1.0-CStatistics::normal_cdf(statistic, std_dev);
00212         }
00213         break;
00214 
00215     default:
00216         /* bootstrapping is handled here */
00217         result=CKernelTwoSampleTestStatistic::compute_p_value(statistic);
00218         break;
00219     }
00220 
00221     return result;
00222 }
00223 
00224 float64_t CLinearTimeMMD::compute_threshold(float64_t alpha)
00225 {
00226     float64_t result=0;
00227 
00228     switch (m_null_approximation_method)
00229     {
00230     case MMD1_GAUSSIAN:
00231         {
00232             /* compute variance and use to estimate Gaussian distribution */
00233             float64_t std_dev=CMath::sqrt(compute_variance_estimate());
00234             result=1.0-CStatistics::inverse_normal_cdf(1-alpha, 0, std_dev);
00235         }
00236         break;
00237 
00238     default:
00239         /* bootstrapping is handled here */
00240         result=CKernelTwoSampleTestStatistic::compute_threshold(alpha);
00241         break;
00242     }
00243 
00244     return result;
00245 }
00246 
00247 float64_t CLinearTimeMMD::perform_test()
00248 {
00249     float64_t result=0;
00250 
00251     switch (m_null_approximation_method)
00252     {
00253     case MMD1_GAUSSIAN:
00254         {
00255             /* compute statistic and variance in the same loop */
00256             float64_t statistic;
00257             float64_t variance;
00258             compute_statistic_and_variance(statistic, variance);
00259 
00260             /* estimate Gaussian distribution */
00261             result=1.0-CStatistics::normal_cdf(statistic,
00262                     CMath::sqrt(variance));
00263         }
00264         break;
00265 
00266     default:
00267         /* bootstrapping can be done separately in superclass */
00268         result=CTestStatistic::perform_test();
00269         break;
00270     }
00271 
00272     return result;
00273 }
00274 
00275 SGVector<float64_t> CLinearTimeMMD::bootstrap_null()
00276 {
00277     SGVector<float64_t> samples(m_bootstrap_iterations);
00278 
00279     /* instead of permutating samples, just samples new data all the time.
00280      * In order to merge p and q, simply randomly select p and q for each
00281      * feature object inernally */
00282     CStreamingFeatures* p=m_streaming_p;
00283     CStreamingFeatures* q=m_streaming_q;
00284     SG_REF(p);
00285     SG_REF(q);
00286     for (index_t i=0; i<m_bootstrap_iterations; ++i)
00287     {
00288         /* merge samples by randomly shuffling p and q */
00289         if (CMath::random(0,1))
00290             m_streaming_p=p;
00291         else
00292             m_streaming_p=q;
00293 
00294         if (CMath::random(0,1))
00295             m_streaming_q=p;
00296         else
00297             m_streaming_q=q;
00298 
00299         /* compute statistic for this permutation of mixed samples */
00300         samples[i]=compute_statistic();
00301     }
00302     m_streaming_p=p;
00303     m_streaming_q=q;
00304     SG_UNREF(p);
00305     SG_UNREF(q);
00306 
00307     return samples;
00308 }
00309 
00310 #ifdef HAVE_LAPACK
00311 void CLinearTimeMMD::optimize_kernel_weights()
00312 {
00315     if (m_kernel->get_kernel_type()!=K_COMBINED)
00316     {
00317         SG_ERROR("CLinearTimeMMD::optimize_kernel_weights(): Only possible "
00318                 "with a combined kernel!\n");
00319     }
00320 
00321     if (m_p_and_q->get_feature_class()!=C_COMBINED)
00322     {
00323         SG_ERROR("CLinearTimeMMD::optimize_kernel_weights(): Only possible "
00324                 "with combined features!\n");
00325     }
00326 
00327     /* think about casting and types of different kernels/features here */
00328     CCombinedFeatures* combined_p_and_q=
00329             dynamic_cast<CCombinedFeatures*>(m_p_and_q);
00330     CCombinedKernel* combined_kernel=dynamic_cast<CCombinedKernel*>(m_kernel);
00331     ASSERT(combined_p_and_q);
00332     ASSERT(combined_kernel);
00333 
00334     if (combined_kernel->get_num_subkernels()!=
00335             combined_p_and_q->get_num_feature_obj())
00336     {
00337         SG_ERROR("CLinearTimeMMD::optimize_kernel_weights(): Only possible "
00338                 "when number of sub-kernels (%d) equal number of sub-features "
00339                 "(%d)\n", combined_kernel->get_num_subkernels(),
00340                 combined_p_and_q->get_num_feature_obj());
00341     }
00342 
00343     /* init kernel with features */
00344     m_kernel->init(m_p_and_q, m_p_and_q);
00345 
00346     /* number of kernels and data */
00347     index_t num_kernels=combined_kernel->get_num_subkernels();
00348     index_t m2=m_m/2;
00349 
00350     /* matrix with all h entries for all kernels and data */
00351     SGMatrix<float64_t> hs(m2, num_kernels);
00352 
00353     /* mmds are needed and are means of columns of hs */
00354     SGVector<float64_t> mmds(num_kernels);
00355 
00356     float64_t pp;
00357     float64_t qq;
00358     float64_t pq;
00359     float64_t qp;
00360     /* compute all h entries */
00361     for (index_t i=0; i<num_kernels; ++i)
00362     {
00363         CKernel* current=combined_kernel->get_kernel(i);
00364         mmds[i]=0;
00365         for (index_t j=0; j<m2; ++j)
00366         {
00367             pp=current->kernel(j, m2+j);
00368             qq=current->kernel(m_m+j, m_m+m2+j);
00369             pq=current->kernel(j, m_m+m2+j);
00370             qp=current->kernel(m2+j, m_m+j);
00371             hs(j, i)=pp+qq-pq-qp;
00372             mmds[i]+=hs(j, i);
00373         }
00374 
00375         /* mmd is simply mean. This is the unbiased linear time estimate */
00376         mmds[i]/=m2;
00377 
00378         SG_UNREF(current);
00379     }
00380 
00381     /* compute covariance matrix of h vector, in place is safe now since h
00382      * is not needed anymore */
00383     m_Q=CStatistics::covariance_matrix(hs, true);
00384 
00385     /* evtl regularize to avoid numerical problems (ratio of MMD and std-dev
00386      * blows up when variance is small */
00387     if (m_opt_regularization_eps)
00388     {
00389         SG_DEBUG("regularizing matrix Q by adding %f to diagonal\n",
00390                 m_opt_regularization_eps);
00391         for (index_t i=0; i<num_kernels; ++i)
00392             m_Q(i,i)+=m_opt_regularization_eps;
00393     }
00394 
00395     if (sg_io->get_loglevel()==MSG_DEBUG)
00396     {
00397         m_Q.display_matrix("(evtl. regularized) Q");
00398         mmds.display_vector("mmds");
00399     }
00400 
00401     /* compute sum of mmds to generate feasible point for convex program */
00402     float64_t sum_mmds=0;
00403     for (index_t i=0; i<mmds.vlen; ++i)
00404         sum_mmds+=mmds[i];
00405 
00406     /* QP: 0.5*x'*Q*x + f'*x
00407      * subject to
00408      * mmds'*x = b
00409      * LB[i] <= x[i] <= UB[i]   for all i=1..n */
00410     SGVector<float64_t> Q_diag(num_kernels);
00411     SGVector<float64_t> f(num_kernels);
00412     SGVector<float64_t> lb(num_kernels);
00413     SGVector<float64_t> ub(num_kernels);
00414     SGVector<float64_t> x(num_kernels);
00415 
00416     /* init everything, there are two cases possible: i) at least one mmd is
00417      * is positive, ii) all mmds are negative */
00418     bool one_pos;
00419     for (index_t i=0; i<mmds.vlen; ++i)
00420     {
00421         if (mmds[i]>0)
00422         {
00423             SG_DEBUG("found at least one positive MMD\n");
00424             one_pos=true;
00425             break;
00426         }
00427         one_pos=false;
00428     }
00429 
00430     if (!one_pos)
00431     {
00432         SG_WARNING("All mmd estimates are negative. This is techical possible,"
00433                 " although extremely rare. Current problem might bad\n");
00434 
00435         /* if no element is positive, Q has to be replaced by -Q */
00436         for (index_t i=0; i<num_kernels*num_kernels; ++i)
00437             m_Q.matrix[i]=-m_Q.matrix[i];
00438     }
00439 
00440     /* init vectors */
00441     for (index_t i=0; i<num_kernels; ++i)
00442     {
00443         Q_diag[i]=m_Q(i,i);
00444         f[i]=0;
00445         lb[i]=0;
00446         ub[i]=CMath::INFTY;
00447 
00448         /* initial point has to be feasible, i.e. mmds'*x = b */
00449         x[i]=1.0/sum_mmds;
00450     }
00451 
00452     /* start libqp solver with desired parameters */
00453     SG_DEBUG("starting libqp optimization\n");
00454     libqp_state_T qp_exitflag=libqp_gsmo_solver(&get_Q_col, Q_diag.vector,
00455             f.vector, mmds.vector,
00456             one_pos ? 1 : -1,
00457             lb.vector, ub.vector,
00458             x.vector, num_kernels, m_opt_max_iterations,
00459             m_opt_regularization_eps, &print_state);
00460 
00461     SG_DEBUG("libqp returns: nIts=%d, exit_flag: %d\n", qp_exitflag.nIter,
00462             qp_exitflag.exitflag);
00463 
00464     /* set really small entries to zero and sum up for normalization */
00465     float64_t sum_weights=0;
00466     for (index_t i=0; i<x.vlen; ++i)
00467     {
00468         if (x[i]<m_opt_low_cut)
00469         {
00470             SG_DEBUG("lowcut: weight[%i]=%f<%f; setting to zero\n", i, x[i],
00471                     m_opt_low_cut);
00472             x[i]=0;
00473         }
00474 
00475         sum_weights+=x[i];
00476     }
00477 
00478     /* normalize (allowed since problem is scale invariant) */
00479     for (index_t i=0; i<x.vlen; ++i)
00480         x[i]/=sum_weights;
00481 
00482     /* set weights to kernel */
00483     m_kernel->set_subkernel_weights(x);
00484 }
00485 
00486 SGMatrix<float64_t> CLinearTimeMMD::m_Q=SGMatrix<float64_t>();
00487 
00488 const float64_t* CLinearTimeMMD::get_Q_col(uint32_t i)
00489 {
00490     return &m_Q[m_Q.num_rows*i];
00491 }
00492 
00493 void CLinearTimeMMD::print_state(libqp_state_T state)
00494 {
00495     SG_SDEBUG("libqp state: primal=%f\n", state.QP);
00496 }
00497 
00498 #endif //HAVE_LAPACK
00499