SHOGUN: QuadraticTimeMMD.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2012 Heiko Strathmann
00008  */
00009 
00010 #include <shogun/statistics/QuadraticTimeMMD.h>
00011 #include <shogun/features/Features.h>
00012 #include <shogun/mathematics/Statistics.h>
00013 #include <shogun/kernel/Kernel.h>
00014 
00015 using namespace shogun;
00016 
00017 CQuadraticTimeMMD::CQuadraticTimeMMD() : CKernelTwoSampleTestStatistic()
00018 {
00019     init();
00020 }
00021 
00022 CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p_and_q,
00023         index_t m) :
00024         CKernelTwoSampleTestStatistic(kernel, p_and_q, m)
00025 {
00026     init();
00027 
00028     if (p_and_q && m!=p_and_q->get_num_vectors()/2)
00029     {
00030         SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors "
00031                 "are currently possible\n");
00032     }
00033 }
00034 
00035 CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p,
00036         CFeatures* q) : CKernelTwoSampleTestStatistic(kernel, p, q)
00037 {
00038     init();
00039 
00040     if (p && q && p->get_num_vectors()!=q->get_num_vectors())
00041     {
00042         SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors "
00043                 "are currently possible\n");
00044     }
00045 }
00046 
00047 CQuadraticTimeMMD::~CQuadraticTimeMMD()
00048 {
00049 
00050 }
00051 
00052 void CQuadraticTimeMMD::init()
00053 {
00054     SG_ADD(&m_num_samples_spectrum, "num_samples_spectrum", "Number of samples"
00055             " for spectrum method null-distribution approximation",
00056             MS_NOT_AVAILABLE);
00057     SG_ADD(&m_num_eigenvalues_spectrum, "num_eigenvalues_spectrum", "Number of "
00058             " Eigenvalues for spectrum method null-distribution approximation",
00059             MS_NOT_AVAILABLE);
00060     SG_ADD((machine_int_t*)&m_statistic_type, "statistic_type",
00061             "Biased or unbiased MMD", MS_NOT_AVAILABLE);
00062 
00063     m_num_samples_spectrum=0;
00064     m_num_eigenvalues_spectrum=0;
00065     m_statistic_type=UNBIASED;
00066 }
00067 
00068 float64_t CQuadraticTimeMMD::compute_unbiased_statistic()
00069 {
00070     /* split computations into three terms from JLMR paper (see documentation )*/
00071     index_t m=m_m;
00072 
00073     /* init kernel with features */
00074     m_kernel->init(m_p_and_q, m_p_and_q);
00075 
00076     /* first term */
00077     float64_t first=0;
00078     for (index_t i=0; i<m; ++i)
00079     {
00080         /* ensure i!=j while adding up */
00081         for (index_t j=0; j<m; ++j)
00082             first+=i==j ? 0 : m_kernel->kernel(i,j);
00083     }
00084     first/=(m-1);
00085 
00086     /* second term */
00087     float64_t second=0;
00088     for (index_t i=m_m; i<m_m+m; ++i)
00089     {
00090         /* ensure i!=j while adding up */
00091         for (index_t j=m_m; j<m_m+m; ++j)
00092             second+=i==j ? 0 : m_kernel->kernel(i,j);
00093     }
00094     second/=(m-1);
00095 
00096     /* third term */
00097     float64_t third=0;
00098     for (index_t i=0; i<m; ++i)
00099     {
00100         for (index_t j=m_m; j<m_m+m; ++j)
00101             third+=m_kernel->kernel(i,j);
00102     }
00103     third*=2.0/m;
00104 
00105     return first+second-third;
00106 }
00107 
00108 float64_t CQuadraticTimeMMD::compute_biased_statistic()
00109 {
00110     /* split computations into three terms from JLMR paper (see documentation )*/
00111     index_t m=m_m;
00112 
00113     /* init kernel with features */
00114     m_kernel->init(m_p_and_q, m_p_and_q);
00115 
00116     /* first term */
00117     float64_t first=0;
00118     for (index_t i=0; i<m; ++i)
00119     {
00120         for (index_t j=0; j<m; ++j)
00121             first+=m_kernel->kernel(i,j);
00122     }
00123     first/=m;
00124 
00125     /* second term */
00126     float64_t second=0;
00127     for (index_t i=m_m; i<m_m+m; ++i)
00128     {
00129         for (index_t j=m_m; j<m_m+m; ++j)
00130             second+=m_kernel->kernel(i,j);
00131     }
00132     second/=m;
00133 
00134     /* third term */
00135     float64_t third=0;
00136     for (index_t i=0; i<m; ++i)
00137     {
00138         for (index_t j=m_m; j<m_m+m; ++j)
00139             third+=m_kernel->kernel(i,j);
00140     }
00141     third*=2.0/m;
00142 
00143     return first+second-third;
00144 }
00145 
00146 float64_t CQuadraticTimeMMD::compute_statistic()
00147 {
00148     if (!m_kernel)
00149         SG_ERROR("%s::compute_statistic(): No kernel specified!\n", get_name());
00150 
00151     float64_t result=0;
00152     switch (m_statistic_type)
00153     {
00154     case UNBIASED:
00155         result=compute_unbiased_statistic();
00156         break;
00157     case BIASED:
00158         result=compute_biased_statistic();
00159         break;
00160     default:
00161         SG_ERROR("CQuadraticTimeMMD::compute_statistic(): Unknown statistic "
00162                 "type!\n");
00163         break;
00164     }
00165 
00166     return result;
00167 }
00168 
00169 float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic)
00170 {
00171     float64_t result=0;
00172 
00173     switch (m_null_approximation_method)
00174     {
00175     case MMD2_SPECTRUM:
00176     {
00177 #ifdef HAVE_LAPACK
00178         /* get samples from null-distribution and compute p-value of statistic */
00179         SGVector<float64_t> null_samples=sample_null_spectrum(
00180                 m_num_samples_spectrum, m_num_eigenvalues_spectrum);
00181         CMath::qsort(null_samples);
00182         index_t pos=CMath::find_position_to_insert(null_samples, statistic);
00183         result=1.0-((float64_t)pos)/null_samples.vlen;
00184 #else // HAVE_LAPACK
00185         SG_ERROR("CQuadraticTimeMMD::compute_p_value(): Only possible if "
00186                 "shogun is compiled with LAPACK enabled\n");
00187 #endif // HAVE_LAPACK
00188         break;
00189     }
00190 
00191     case MMD2_GAMMA:
00192     {
00193         /* fit gamma and return cdf at statistic */
00194         SGVector<float64_t> params=fit_null_gamma();
00195         result=CStatistics::gamma_cdf(statistic, params[0], params[1]);
00196         break;
00197     }
00198 
00199     default:
00200         result=CKernelTwoSampleTestStatistic::compute_p_value(statistic);
00201         break;
00202     }
00203 
00204     return result;
00205 }
00206 
00207 float64_t CQuadraticTimeMMD::compute_threshold(float64_t alpha)
00208 {
00209     float64_t result=0;
00210 
00211     switch (m_null_approximation_method)
00212     {
00213     case MMD2_SPECTRUM:
00214     {
00215 #ifdef HAVE_LAPACK
00216         /* get samples from null-distribution and compute threshold */
00217         SGVector<float64_t> null_samples=sample_null_spectrum(
00218                 m_num_samples_spectrum, m_num_eigenvalues_spectrum);
00219         CMath::qsort(null_samples);
00220         result=null_samples[CMath::floor(null_samples.vlen*(1-alpha))];
00221 #else // HAVE_LAPACK
00222         SG_ERROR("CQuadraticTimeMMD::compute_threshold(): Only possible if "
00223                 "shogun is compiled with LAPACK enabled\n");
00224 #endif // HAVE_LAPACK
00225         break;
00226     }
00227 
00228     case MMD2_GAMMA:
00229     {
00230         /* fit gamma and return inverse cdf at alpha */
00231         SGVector<float64_t> params=fit_null_gamma();
00232         result=CStatistics::inverse_gamma_cdf(alpha, params[0], params[1]);
00233         break;
00234     }
00235 
00236     default:
00237         /* bootstrapping is handled here */
00238         result=CKernelTwoSampleTestStatistic::compute_threshold(alpha);
00239         break;
00240     }
00241 
00242     return result;
00243 }
00244 
00245 
00246 #ifdef HAVE_LAPACK
00247 SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
00248         index_t num_eigenvalues)
00249 {
00250     if (m_m!=m_p_and_q->get_num_vectors()/2)
00251     {
00252         SG_ERROR("%s::sample_null_spectrum(): Currently, only equal "
00253                 "sample sizes are supported\n", get_name());
00254     }
00255 
00256     if (num_samples<=2)
00257     {
00258         SG_ERROR("%s::sample_null_spectrum(): Number of samples has to be at"
00259                 " least 2, better in the hundrets", get_name());
00260     }
00261 
00262     if (num_eigenvalues>2*m_m-1)
00263     {
00264         SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too large\n",
00265                 get_name());
00266     }
00267 
00268     if (num_eigenvalues<1)
00269     {
00270         SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too small\n",
00271                 get_name());
00272     }
00273 
00274     /* evtl. warn user not to use wrong statistic type */
00275     if (m_statistic_type!=BIASED)
00276     {
00277         SG_WARNING("%s::sample_null_spectrum(): Note: provided statistic has "
00278                 "to be BIASED. Please ensure that! To get rid of warning,"
00279                 "call %s::set_statistic_type(BIASED)\n", get_name(),
00280                 get_name());
00281     }
00282 
00283     /* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
00284      * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
00285      * works since X and Y are concatenated here */
00286     m_kernel->init(m_p_and_q, m_p_and_q);
00287     SGMatrix<float64_t> K=m_kernel->get_kernel_matrix();
00288 
00289     /* center matrix K=H*K*H */
00290     K.center();
00291 
00292     /* compute eigenvalues and select num_eigenvalues largest ones */
00293     SGVector<float64_t> eigenvalues=
00294             SGMatrix<float64_t>::compute_eigenvectors(K);
00295     SGVector<float64_t> largest_ev(num_eigenvalues);
00296 
00297     /* take largest EV, scale by 1/2/m on the fly and take abs value*/
00298     for (index_t i=0; i<num_eigenvalues; ++i)
00299         largest_ev[i]=CMath::abs(
00300                 1.0/2/m_m*eigenvalues[eigenvalues.vlen-1-i]);
00301 
00302     /* finally, sample from null distribution */
00303     SGVector<float64_t> null_samples(num_samples);
00304     for (index_t i=0; i<num_samples; ++i)
00305     {
00306         /* 2*sum(kEigs.*(randn(length(kEigs),1)).^2); */
00307         null_samples[i]=0;
00308         for (index_t j=0; j<largest_ev.vlen; ++j)
00309             null_samples[i]+=largest_ev[j]*CMath::pow(CMath::randn_double(), 2);
00310 
00311         null_samples[i]*=2;
00312     }
00313 
00314     return null_samples;
00315 }
00316 #endif // HAVE_LAPACK
00317 
00318 SGVector<float64_t> CQuadraticTimeMMD::fit_null_gamma()
00319 {
00320     SG_WARNING("CQuadraticTimeMMD::fit_null_gamma(): TODO: combine with "
00321             "compute_statistic to be more efficient!\n");
00322 
00323     if (m_m!=m_p_and_q->get_num_vectors()/2)
00324     {
00325         SG_ERROR("%s::compute_p_value_gamma(): Currently, only equal "
00326                 "sample sizes are supported\n", get_name());
00327     }
00328 
00329     /* evtl. warn user not to use wrong statistic type */
00330     if (m_statistic_type!=BIASED)
00331     {
00332         SG_WARNING("%s::compute_p_value(): Note: provided statistic has "
00333                 "to be BIASED. Please ensure that! To get rid of warning,"
00334                 "call %s::set_statistic_type(BIASED)\n", get_name(),
00335                 get_name());
00336     }
00337 
00338     /* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
00339      * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
00340      * works since X and Y are concatenated here */
00341     m_kernel->init(m_p_and_q, m_p_and_q);
00342 
00343     /* compute mean under H0 of MMD, which is
00344      * meanMMD  = 2/m * ( 1  - 1/m*sum(diag(KL))  );
00345      * in MATLAB.
00346      * Remove diagonals on the fly */
00347     float64_t mean_mmd=0;
00348     for (index_t i=0; i<m_m; ++i)
00349     {
00350         /* virtual KL matrix is in upper right corner of SHOGUN K matrix
00351          * so this sums the diagonal of the matrix between X and Y*/
00352         mean_mmd+=m_kernel->kernel(i, m_m+i);
00353     }
00354     mean_mmd=2.0/m_m*(1.0-1.0/m_m*mean_mmd);
00355 
00356     /* compute variance under H0 of MMD, which is
00357      * varMMD = 2/m/(m-1) * 1/m/(m-1) * sum(sum( (K + L - KL - KL').^2 ));
00358      * in MATLAB, so sum up all elements */
00359     float64_t var_mmd=0;
00360     for (index_t i=0; i<m_m; ++i)
00361     {
00362         for (index_t j=0; j<m_m; ++j)
00363         {
00364             /* dont add diagonal of all pairs of imaginary kernel matrices */
00365             if (i==j || m_m+i==j || m_m+j==i)
00366                 continue;
00367 
00368             float64_t to_add=m_kernel->kernel(i, j);
00369             to_add+=m_kernel->kernel(m_m+i, m_m+j);
00370             to_add-=m_kernel->kernel(i, m_m+j);
00371             to_add-=m_kernel->kernel(m_m+i, j);
00372             var_mmd+=CMath::pow(to_add, 2);
00373         }
00374     }
00375     var_mmd*=2.0/m_m/(m_m-1)*1.0/m_m/(m_m-1);
00376 
00377     /* parameters for gamma distribution */
00378     float64_t a=CMath::pow(mean_mmd, 2)/var_mmd;
00379     float64_t b=var_mmd*m_m / mean_mmd;
00380 
00381     SGVector<float64_t> result(2);
00382     result[0]=a;
00383     result[1]=b;
00384 
00385     return result;
00386 }
00387 
00388 void CQuadraticTimeMMD::set_num_samples_sepctrum(index_t
00389         num_samples_spectrum)
00390 {
00391     m_num_samples_spectrum=num_samples_spectrum;
00392 }
00393 
00394 void CQuadraticTimeMMD::set_num_eigenvalues_spectrum(
00395         index_t num_eigenvalues_spectrum)
00396 {
00397     m_num_eigenvalues_spectrum=num_eigenvalues_spectrum;
00398 }
00399 
00400 void CQuadraticTimeMMD::set_statistic_type(EQuadraticMMDType
00401         statistic_type)
00402 {
00403     m_statistic_type=statistic_type;
00404 }