00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <shogun/statistics/QuadraticTimeMMD.h>
00011 #include <shogun/features/Features.h>
00012 #include <shogun/mathematics/Statistics.h>
00013 #include <shogun/kernel/Kernel.h>
00014
00015 using namespace shogun;
00016
00017 CQuadraticTimeMMD::CQuadraticTimeMMD() : CKernelTwoSampleTestStatistic()
00018 {
00019 init();
00020 }
00021
00022 CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p_and_q,
00023 index_t m) :
00024 CKernelTwoSampleTestStatistic(kernel, p_and_q, m)
00025 {
00026 init();
00027
00028 if (p_and_q && m!=p_and_q->get_num_vectors()/2)
00029 {
00030 SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors "
00031 "are currently possible\n");
00032 }
00033 }
00034
00035 CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p,
00036 CFeatures* q) : CKernelTwoSampleTestStatistic(kernel, p, q)
00037 {
00038 init();
00039
00040 if (p && q && p->get_num_vectors()!=q->get_num_vectors())
00041 {
00042 SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors "
00043 "are currently possible\n");
00044 }
00045 }
00046
00047 CQuadraticTimeMMD::~CQuadraticTimeMMD()
00048 {
00049
00050 }
00051
00052 void CQuadraticTimeMMD::init()
00053 {
00054 SG_ADD(&m_num_samples_spectrum, "num_samples_spectrum", "Number of samples"
00055 " for spectrum method null-distribution approximation",
00056 MS_NOT_AVAILABLE);
00057 SG_ADD(&m_num_eigenvalues_spectrum, "num_eigenvalues_spectrum", "Number of "
00058 " Eigenvalues for spectrum method null-distribution approximation",
00059 MS_NOT_AVAILABLE);
00060 SG_ADD((machine_int_t*)&m_statistic_type, "statistic_type",
00061 "Biased or unbiased MMD", MS_NOT_AVAILABLE);
00062
00063 m_num_samples_spectrum=0;
00064 m_num_eigenvalues_spectrum=0;
00065 m_statistic_type=UNBIASED;
00066 }
00067
00068 float64_t CQuadraticTimeMMD::compute_unbiased_statistic()
00069 {
00070
00071 index_t m=m_m;
00072
00073
00074 m_kernel->init(m_p_and_q, m_p_and_q);
00075
00076
00077 float64_t first=0;
00078 for (index_t i=0; i<m; ++i)
00079 {
00080
00081 for (index_t j=0; j<m; ++j)
00082 first+=i==j ? 0 : m_kernel->kernel(i,j);
00083 }
00084 first/=(m-1);
00085
00086
00087 float64_t second=0;
00088 for (index_t i=m_m; i<m_m+m; ++i)
00089 {
00090
00091 for (index_t j=m_m; j<m_m+m; ++j)
00092 second+=i==j ? 0 : m_kernel->kernel(i,j);
00093 }
00094 second/=(m-1);
00095
00096
00097 float64_t third=0;
00098 for (index_t i=0; i<m; ++i)
00099 {
00100 for (index_t j=m_m; j<m_m+m; ++j)
00101 third+=m_kernel->kernel(i,j);
00102 }
00103 third*=2.0/m;
00104
00105 return first+second-third;
00106 }
00107
00108 float64_t CQuadraticTimeMMD::compute_biased_statistic()
00109 {
00110
00111 index_t m=m_m;
00112
00113
00114 m_kernel->init(m_p_and_q, m_p_and_q);
00115
00116
00117 float64_t first=0;
00118 for (index_t i=0; i<m; ++i)
00119 {
00120 for (index_t j=0; j<m; ++j)
00121 first+=m_kernel->kernel(i,j);
00122 }
00123 first/=m;
00124
00125
00126 float64_t second=0;
00127 for (index_t i=m_m; i<m_m+m; ++i)
00128 {
00129 for (index_t j=m_m; j<m_m+m; ++j)
00130 second+=m_kernel->kernel(i,j);
00131 }
00132 second/=m;
00133
00134
00135 float64_t third=0;
00136 for (index_t i=0; i<m; ++i)
00137 {
00138 for (index_t j=m_m; j<m_m+m; ++j)
00139 third+=m_kernel->kernel(i,j);
00140 }
00141 third*=2.0/m;
00142
00143 return first+second-third;
00144 }
00145
00146 float64_t CQuadraticTimeMMD::compute_statistic()
00147 {
00148 if (!m_kernel)
00149 SG_ERROR("%s::compute_statistic(): No kernel specified!\n", get_name());
00150
00151 float64_t result=0;
00152 switch (m_statistic_type)
00153 {
00154 case UNBIASED:
00155 result=compute_unbiased_statistic();
00156 break;
00157 case BIASED:
00158 result=compute_biased_statistic();
00159 break;
00160 default:
00161 SG_ERROR("CQuadraticTimeMMD::compute_statistic(): Unknown statistic "
00162 "type!\n");
00163 break;
00164 }
00165
00166 return result;
00167 }
00168
00169 float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic)
00170 {
00171 float64_t result=0;
00172
00173 switch (m_null_approximation_method)
00174 {
00175 case MMD2_SPECTRUM:
00176 {
00177 #ifdef HAVE_LAPACK
00178
00179 SGVector<float64_t> null_samples=sample_null_spectrum(
00180 m_num_samples_spectrum, m_num_eigenvalues_spectrum);
00181 CMath::qsort(null_samples);
00182 index_t pos=CMath::find_position_to_insert(null_samples, statistic);
00183 result=1.0-((float64_t)pos)/null_samples.vlen;
00184 #else // HAVE_LAPACK
00185 SG_ERROR("CQuadraticTimeMMD::compute_p_value(): Only possible if "
00186 "shogun is compiled with LAPACK enabled\n");
00187 #endif // HAVE_LAPACK
00188 break;
00189 }
00190
00191 case MMD2_GAMMA:
00192 {
00193
00194 SGVector<float64_t> params=fit_null_gamma();
00195 result=CStatistics::gamma_cdf(statistic, params[0], params[1]);
00196 break;
00197 }
00198
00199 default:
00200 result=CKernelTwoSampleTestStatistic::compute_p_value(statistic);
00201 break;
00202 }
00203
00204 return result;
00205 }
00206
00207 float64_t CQuadraticTimeMMD::compute_threshold(float64_t alpha)
00208 {
00209 float64_t result=0;
00210
00211 switch (m_null_approximation_method)
00212 {
00213 case MMD2_SPECTRUM:
00214 {
00215 #ifdef HAVE_LAPACK
00216
00217 SGVector<float64_t> null_samples=sample_null_spectrum(
00218 m_num_samples_spectrum, m_num_eigenvalues_spectrum);
00219 CMath::qsort(null_samples);
00220 result=null_samples[CMath::floor(null_samples.vlen*(1-alpha))];
00221 #else // HAVE_LAPACK
00222 SG_ERROR("CQuadraticTimeMMD::compute_threshold(): Only possible if "
00223 "shogun is compiled with LAPACK enabled\n");
00224 #endif // HAVE_LAPACK
00225 break;
00226 }
00227
00228 case MMD2_GAMMA:
00229 {
00230
00231 SGVector<float64_t> params=fit_null_gamma();
00232 result=CStatistics::inverse_gamma_cdf(alpha, params[0], params[1]);
00233 break;
00234 }
00235
00236 default:
00237
00238 result=CKernelTwoSampleTestStatistic::compute_threshold(alpha);
00239 break;
00240 }
00241
00242 return result;
00243 }
00244
00245
00246 #ifdef HAVE_LAPACK
00247 SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
00248 index_t num_eigenvalues)
00249 {
00250 if (m_m!=m_p_and_q->get_num_vectors()/2)
00251 {
00252 SG_ERROR("%s::sample_null_spectrum(): Currently, only equal "
00253 "sample sizes are supported\n", get_name());
00254 }
00255
00256 if (num_samples<=2)
00257 {
00258 SG_ERROR("%s::sample_null_spectrum(): Number of samples has to be at"
00259 " least 2, better in the hundrets", get_name());
00260 }
00261
00262 if (num_eigenvalues>2*m_m-1)
00263 {
00264 SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too large\n",
00265 get_name());
00266 }
00267
00268 if (num_eigenvalues<1)
00269 {
00270 SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too small\n",
00271 get_name());
00272 }
00273
00274
00275 if (m_statistic_type!=BIASED)
00276 {
00277 SG_WARNING("%s::sample_null_spectrum(): Note: provided statistic has "
00278 "to be BIASED. Please ensure that! To get rid of warning,"
00279 "call %s::set_statistic_type(BIASED)\n", get_name(),
00280 get_name());
00281 }
00282
00283
00284
00285
00286 m_kernel->init(m_p_and_q, m_p_and_q);
00287 SGMatrix<float64_t> K=m_kernel->get_kernel_matrix();
00288
00289
00290 K.center();
00291
00292
00293 SGVector<float64_t> eigenvalues=
00294 SGMatrix<float64_t>::compute_eigenvectors(K);
00295 SGVector<float64_t> largest_ev(num_eigenvalues);
00296
00297
00298 for (index_t i=0; i<num_eigenvalues; ++i)
00299 largest_ev[i]=CMath::abs(
00300 1.0/2/m_m*eigenvalues[eigenvalues.vlen-1-i]);
00301
00302
00303 SGVector<float64_t> null_samples(num_samples);
00304 for (index_t i=0; i<num_samples; ++i)
00305 {
00306
00307 null_samples[i]=0;
00308 for (index_t j=0; j<largest_ev.vlen; ++j)
00309 null_samples[i]+=largest_ev[j]*CMath::pow(CMath::randn_double(), 2);
00310
00311 null_samples[i]*=2;
00312 }
00313
00314 return null_samples;
00315 }
00316 #endif // HAVE_LAPACK
00317
00318 SGVector<float64_t> CQuadraticTimeMMD::fit_null_gamma()
00319 {
00320 SG_WARNING("CQuadraticTimeMMD::fit_null_gamma(): TODO: combine with "
00321 "compute_statistic to be more efficient!\n");
00322
00323 if (m_m!=m_p_and_q->get_num_vectors()/2)
00324 {
00325 SG_ERROR("%s::compute_p_value_gamma(): Currently, only equal "
00326 "sample sizes are supported\n", get_name());
00327 }
00328
00329
00330 if (m_statistic_type!=BIASED)
00331 {
00332 SG_WARNING("%s::compute_p_value(): Note: provided statistic has "
00333 "to be BIASED. Please ensure that! To get rid of warning,"
00334 "call %s::set_statistic_type(BIASED)\n", get_name(),
00335 get_name());
00336 }
00337
00338
00339
00340
00341 m_kernel->init(m_p_and_q, m_p_and_q);
00342
00343
00344
00345
00346
00347 float64_t mean_mmd=0;
00348 for (index_t i=0; i<m_m; ++i)
00349 {
00350
00351
00352 mean_mmd+=m_kernel->kernel(i, m_m+i);
00353 }
00354 mean_mmd=2.0/m_m*(1.0-1.0/m_m*mean_mmd);
00355
00356
00357
00358
00359 float64_t var_mmd=0;
00360 for (index_t i=0; i<m_m; ++i)
00361 {
00362 for (index_t j=0; j<m_m; ++j)
00363 {
00364
00365 if (i==j || m_m+i==j || m_m+j==i)
00366 continue;
00367
00368 float64_t to_add=m_kernel->kernel(i, j);
00369 to_add+=m_kernel->kernel(m_m+i, m_m+j);
00370 to_add-=m_kernel->kernel(i, m_m+j);
00371 to_add-=m_kernel->kernel(m_m+i, j);
00372 var_mmd+=CMath::pow(to_add, 2);
00373 }
00374 }
00375 var_mmd*=2.0/m_m/(m_m-1)*1.0/m_m/(m_m-1);
00376
00377
00378 float64_t a=CMath::pow(mean_mmd, 2)/var_mmd;
00379 float64_t b=var_mmd*m_m / mean_mmd;
00380
00381 SGVector<float64_t> result(2);
00382 result[0]=a;
00383 result[1]=b;
00384
00385 return result;
00386 }
00387
00388 void CQuadraticTimeMMD::set_num_samples_sepctrum(index_t
00389 num_samples_spectrum)
00390 {
00391 m_num_samples_spectrum=num_samples_spectrum;
00392 }
00393
00394 void CQuadraticTimeMMD::set_num_eigenvalues_spectrum(
00395 index_t num_eigenvalues_spectrum)
00396 {
00397 m_num_eigenvalues_spectrum=num_eigenvalues_spectrum;
00398 }
00399
00400 void CQuadraticTimeMMD::set_statistic_type(EQuadraticMMDType
00401 statistic_type)
00402 {
00403 m_statistic_type=statistic_type;
00404 }