SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
QuadraticTimeMMD.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2012-2013 Heiko Strathmann
8  */
9 
13 #include <shogun/kernel/Kernel.h>
16 
17 using namespace shogun;
18 
20 {
21  init();
22 }
23 
25  index_t m) :
26  CKernelTwoSampleTestStatistic(kernel, p_and_q, m)
27 {
28  init();
29 
30  if (p_and_q && m!=p_and_q->get_num_vectors()/2)
31  {
32  SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors "
33  "are currently possible\n");
34  }
35 }
36 
38  CFeatures* q) : CKernelTwoSampleTestStatistic(kernel, p, q)
39 {
40  init();
41 
42  if (p && q && p->get_num_vectors()!=q->get_num_vectors())
43  {
44  SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors "
45  "are currently possible\n");
46  }
47 }
48 
50  CKernelTwoSampleTestStatistic(custom_kernel, NULL, m)
51 {
52  init();
53 }
54 
56 {
57 
58 }
59 
60 void CQuadraticTimeMMD::init()
61 {
62  SG_ADD(&m_num_samples_spectrum, "num_samples_spectrum", "Number of samples"
63  " for spectrum method null-distribution approximation",
65  SG_ADD(&m_num_eigenvalues_spectrum, "num_eigenvalues_spectrum", "Number of "
66  " Eigenvalues for spectrum method null-distribution approximation",
68  SG_ADD((machine_int_t*)&m_statistic_type, "statistic_type",
69  "Biased or unbiased MMD", MS_NOT_AVAILABLE);
70 
74 }
75 
77 {
78  /* split computations into three terms from JLMR paper (see documentation )*/
79  index_t m=m_m;
80 
81  /* init kernel with features */
83 
84  /* first term */
85  float64_t first=0;
86  for (index_t i=0; i<m; ++i)
87  {
88  /* ensure i!=j while adding up */
89  for (index_t j=0; j<m; ++j)
90  first+=i==j ? 0 : m_kernel->kernel(i,j);
91  }
92  first/=(m-1);
93 
94  /* second term */
95  float64_t second=0;
96  for (index_t i=m_m; i<m_m+m; ++i)
97  {
98  /* ensure i!=j while adding up */
99  for (index_t j=m_m; j<m_m+m; ++j)
100  second+=i==j ? 0 : m_kernel->kernel(i,j);
101  }
102  second/=(m-1);
103 
104  /* third term */
105  float64_t third=0;
106  for (index_t i=0; i<m; ++i)
107  {
108  for (index_t j=m_m; j<m_m+m; ++j)
109  third+=m_kernel->kernel(i,j);
110  }
111  third*=2.0/m;
112 
113  return first+second-third;
114 }
115 
117 {
118  /* split computations into three terms from JLMR paper (see documentation )*/
119  index_t m=m_m;
120 
121  /* init kernel with features */
123 
124  /* first term */
125  float64_t first=0;
126  for (index_t i=0; i<m; ++i)
127  {
128  for (index_t j=0; j<m; ++j)
129  first+=m_kernel->kernel(i,j);
130  }
131  first/=m;
132 
133  /* second term */
134  float64_t second=0;
135  for (index_t i=m_m; i<m_m+m; ++i)
136  {
137  for (index_t j=m_m; j<m_m+m; ++j)
138  second+=m_kernel->kernel(i,j);
139  }
140  second/=m;
141 
142  /* third term */
143  float64_t third=0;
144  for (index_t i=0; i<m; ++i)
145  {
146  for (index_t j=m_m; j<m_m+m; ++j)
147  third+=m_kernel->kernel(i,j);
148  }
149  third*=2.0/m;
150 
151  return first+second-third;
152 }
153 
155 {
156  if (!m_kernel)
157  SG_ERROR("%s::compute_statistic(): No kernel specified!\n", get_name())
158 
159  float64_t result=0;
160  switch (m_statistic_type)
161  {
162  case UNBIASED:
164  break;
165  case BIASED:
166  result=compute_biased_statistic();
167  break;
168  default:
169  SG_ERROR("CQuadraticTimeMMD::compute_statistic(): Unknown statistic "
170  "type!\n");
171  break;
172  }
173 
174  return result;
175 }
176 
178 {
179  float64_t result=0;
180 
182  {
183  case MMD2_SPECTRUM:
184  {
185 #ifdef HAVE_LAPACK
186  /* get samples from null-distribution and compute p-value of statistic */
189  null_samples.qsort();
190  index_t pos=null_samples.find_position_to_insert(statistic);
191  result=1.0-((float64_t)pos)/null_samples.vlen;
192 #else // HAVE_LAPACK
193  SG_ERROR("CQuadraticTimeMMD::compute_p_value(): Only possible if "
194  "shogun is compiled with LAPACK enabled\n");
195 #endif // HAVE_LAPACK
196  break;
197  }
198 
199  case MMD2_GAMMA:
200  {
201  /* fit gamma and return cdf at statistic */
203  result=CStatistics::gamma_cdf(statistic, params[0], params[1]);
204  break;
205  }
206 
207  default:
209  break;
210  }
211 
212  return result;
213 }
214 
216  bool multiple_kernels)
217 {
218  SGVector<float64_t> mmds;
219  if (!multiple_kernels)
220  {
221  /* just one mmd result */
222  mmds=SGVector<float64_t>(1);
223  mmds[0]=compute_statistic();
224  }
225  else
226  {
228  "%s::compute_statistic: multiple kernels specified,"
229  "but underlying kernel is not of type K_COMBINED\n", get_name());
230 
231  /* cast and allocate memory for results */
233  SG_REF(combined);
234  mmds=SGVector<float64_t>(combined->get_num_subkernels());
235 
236  /* iterate through all kernels and compute statistic */
237  /* TODO this might be done in parallel */
238  for (index_t i=0; i<mmds.vlen; ++i)
239  {
240  CKernel* current=combined->get_kernel(i);
241  /* temporarily replace underlying kernel and compute statistic */
242  m_kernel=current;
243  mmds[i]=compute_statistic();
244 
245  SG_UNREF(current);
246  }
247 
248  /* restore combined kernel */
249  m_kernel=combined;
250  SG_UNREF(combined);
251  }
252 
253  return mmds;
254 }
255 
257 {
258  float64_t result=0;
259 
261  {
262  case MMD2_SPECTRUM:
263  {
264 #ifdef HAVE_LAPACK
265  /* get samples from null-distribution and compute threshold */
268  null_samples.qsort();
269  result=null_samples[CMath::floor(null_samples.vlen*(1-alpha))];
270 #else // HAVE_LAPACK
271  SG_ERROR("CQuadraticTimeMMD::compute_threshold(): Only possible if "
272  "shogun is compiled with LAPACK enabled\n");
273 #endif // HAVE_LAPACK
274  break;
275  }
276 
277  case MMD2_GAMMA:
278  {
279  /* fit gamma and return inverse cdf at alpha */
281  result=CStatistics::inverse_gamma_cdf(alpha, params[0], params[1]);
282  break;
283  }
284 
285  default:
286  /* bootstrapping is handled here */
288  break;
289  }
290 
291  return result;
292 }
293 
294 
295 #ifdef HAVE_LAPACK
297  index_t num_eigenvalues)
298 {
299  REQUIRE(m_kernel, "%s::sample_null_spectrum(%d, %d): No kernel set!\n",
300  get_name(), num_samples, num_eigenvalues);
302  "%s::sample_null_spectrum(%d, %d): No features set and no custom "
303  "kernel in use!\n", get_name(), num_samples, num_eigenvalues);
304 
305  index_t num_data;
307  num_data=m_kernel->get_num_vec_rhs();
308  else
309  num_data=m_p_and_q->get_num_vectors();
310 
311  if (m_m!=num_data/2)
312  {
313  SG_ERROR("%s::sample_null_spectrum(): Currently, only equal "
314  "sample sizes are supported\n", get_name());
315  }
316 
317  if (num_samples<=2)
318  {
319  SG_ERROR("%s::sample_null_spectrum(): Number of samples has to be at"
320  " least 2, better in the hundreds", get_name());
321  }
322 
323  if (num_eigenvalues>2*m_m-1)
324  {
325  SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too large\n",
326  get_name());
327  }
328 
329  if (num_eigenvalues<1)
330  {
331  SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too small\n",
332  get_name());
333  }
334 
335  /* evtl. warn user not to use wrong statistic type */
337  {
338  SG_WARNING("%s::sample_null_spectrum(): Note: provided statistic has "
339  "to be BIASED. Please ensure that! To get rid of warning,"
340  "call %s::set_statistic_type(BIASED)\n", get_name(),
341  get_name());
342  }
343 
344  /* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
345  * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
346  * works since X and Y are concatenated here */
349 
350  /* center matrix K=H*K*H */
351  K.center();
352 
353  /* compute eigenvalues and select num_eigenvalues largest ones */
354  SGVector<float64_t> eigenvalues=
356  SGVector<float64_t> largest_ev(num_eigenvalues);
357 
358  /* take largest EV, scale by 1/2/m on the fly and take abs value*/
359  for (index_t i=0; i<num_eigenvalues; ++i)
360  largest_ev[i]=CMath::abs(
361  1.0/2/m_m*eigenvalues[eigenvalues.vlen-1-i]);
362 
363  /* finally, sample from null distribution */
364  SGVector<float64_t> null_samples(num_samples);
365  for (index_t i=0; i<num_samples; ++i)
366  {
367  /* 2*sum(kEigs.*(randn(length(kEigs),1)).^2); */
368  null_samples[i]=0;
369  for (index_t j=0; j<largest_ev.vlen; ++j)
370  null_samples[i]+=largest_ev[j]*CMath::pow(CMath::randn_double(), 2);
371 
372  null_samples[i]*=2;
373  }
374 
375  return null_samples;
376 }
377 #endif // HAVE_LAPACK
378 
380 {
381  REQUIRE(m_kernel, "%s::fit_null_gamma(): No kernel set!\n", get_name());
383  "%s::fit_null_gamma(): No features set and no custom kernel in"
384  " use!\n", get_name());
385 
386  index_t num_data;
388  num_data=m_kernel->get_num_vec_rhs();
389  else
390  num_data=m_p_and_q->get_num_vectors();
391 
392  if (m_m!=num_data/2)
393  {
394  SG_ERROR("%s::compute_p_value_gamma(): Currently, only equal "
395  "sample sizes are supported\n", get_name());
396  }
397 
398  /* evtl. warn user not to use wrong statistic type */
400  {
401  SG_WARNING("%s::compute_p_value(): Note: provided statistic has "
402  "to be BIASED. Please ensure that! To get rid of warning,"
403  "call %s::set_statistic_type(BIASED)\n", get_name(),
404  get_name());
405  }
406 
407  /* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
408  * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
409  * works since X and Y are concatenated here */
411 
412  /* compute mean under H0 of MMD, which is
413  * meanMMD = 2/m * ( 1 - 1/m*sum(diag(KL)) );
414  * in MATLAB.
415  * Remove diagonals on the fly */
416  float64_t mean_mmd=0;
417  for (index_t i=0; i<m_m; ++i)
418  {
419  /* virtual KL matrix is in upper right corner of SHOGUN K matrix
420  * so this sums the diagonal of the matrix between X and Y*/
421  mean_mmd+=m_kernel->kernel(i, m_m+i);
422  }
423  mean_mmd=2.0/m_m*(1.0-1.0/m_m*mean_mmd);
424 
425  /* compute variance under H0 of MMD, which is
426  * varMMD = 2/m/(m-1) * 1/m/(m-1) * sum(sum( (K + L - KL - KL').^2 ));
427  * in MATLAB, so sum up all elements */
428  float64_t var_mmd=0;
429  for (index_t i=0; i<m_m; ++i)
430  {
431  for (index_t j=0; j<m_m; ++j)
432  {
433  /* dont add diagonal of all pairs of imaginary kernel matrices */
434  if (i==j || m_m+i==j || m_m+j==i)
435  continue;
436 
437  float64_t to_add=m_kernel->kernel(i, j);
438  to_add+=m_kernel->kernel(m_m+i, m_m+j);
439  to_add-=m_kernel->kernel(i, m_m+j);
440  to_add-=m_kernel->kernel(m_m+i, j);
441  var_mmd+=CMath::pow(to_add, 2);
442  }
443  }
444  var_mmd*=2.0/m_m/(m_m-1)*1.0/m_m/(m_m-1);
445 
446  /* parameters for gamma distribution */
447  float64_t a=CMath::pow(mean_mmd, 2)/var_mmd;
448  float64_t b=var_mmd*m_m / mean_mmd;
449 
450  SGVector<float64_t> result(2);
451  result[0]=a;
452  result[1]=b;
453 
454  return result;
455 }
456 
458  num_samples_spectrum)
459 {
460  m_num_samples_spectrum=num_samples_spectrum;
461 }
462 
464  index_t num_eigenvalues_spectrum)
465 {
466  m_num_eigenvalues_spectrum=num_eigenvalues_spectrum;
467 }
468 
470  statistic_type)
471 {
472  m_statistic_type=statistic_type;
473 }

SHOGUN Machine Learning Toolbox - Documentation