SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
QuadraticTimeMMD.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2012 Heiko Strathmann
8  */
9 
13 #include <shogun/kernel/Kernel.h>
14 
15 using namespace shogun;
16 
18 {
19  init();
20 }
21 
23  index_t q_start) :
24  CKernelTwoSampleTestStatistic(kernel, p_and_q, q_start)
25 {
26  init();
27 
28  if (p_and_q && q_start!=p_and_q->get_num_vectors()/2)
29  {
30  SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors "
31  "are currently possible\n");
32  }
33 }
34 
36  CFeatures* q) : CKernelTwoSampleTestStatistic(kernel, p, q)
37 {
38  init();
39 
40  if (p && q && p->get_num_vectors()!=q->get_num_vectors())
41  {
42  SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors "
43  "are currently possible\n");
44  }
45 }
46 
48 {
49 
50 }
51 
52 void CQuadraticTimeMMD::init()
53 {
54  SG_ADD(&m_num_samples_spectrum, "num_samples_spectrum", "Number of samples"
55  " for spectrum method null-distribution approximation",
57  SG_ADD(&m_num_eigenvalues_spectrum, "num_eigenvalues_spectrum", "Number of "
58  " Eigenvalues for spectrum method null-distribution approximation",
60  SG_ADD((machine_int_t*)&m_statistic_type, "statistic_type",
61  "Biased or unbiased MMD", MS_NOT_AVAILABLE);
62 
66 }
67 
69 {
70  /* split computations into three terms from JLMR paper (see documentation )*/
72 
73  /* init kernel with features */
75 
76  /* first term */
77  float64_t first=0;
78  for (index_t i=0; i<m; ++i)
79  {
80  /* ensure i!=j while adding up */
81  for (index_t j=0; j<m; ++j)
82  first+=i==j ? 0 : m_kernel->kernel(i,j);
83  }
84  first/=(m-1);
85 
86  /* second term */
87  float64_t second=0;
88  for (index_t i=m_q_start; i<m_q_start+m; ++i)
89  {
90  /* ensure i!=j while adding up */
91  for (index_t j=m_q_start; j<m_q_start+m; ++j)
92  second+=i==j ? 0 : m_kernel->kernel(i,j);
93  }
94  second/=(m-1);
95 
96  /* third term */
97  float64_t third=0;
98  for (index_t i=0; i<m; ++i)
99  {
100  for (index_t j=m_q_start; j<m_q_start+m; ++j)
101  third+=m_kernel->kernel(i,j);
102  }
103  third*=2.0/m;
104 
105  return first+second-third;
106 }
107 
109 {
110  /* split computations into three terms from JLMR paper (see documentation )*/
111  index_t m=m_q_start;
112 
113  /* init kernel with features */
115 
116  /* first term */
117  float64_t first=0;
118  for (index_t i=0; i<m; ++i)
119  {
120  for (index_t j=0; j<m; ++j)
121  first+=m_kernel->kernel(i,j);
122  }
123  first/=m;
124 
125  /* second term */
126  float64_t second=0;
127  for (index_t i=m_q_start; i<m_q_start+m; ++i)
128  {
129  for (index_t j=m_q_start; j<m_q_start+m; ++j)
130  second+=m_kernel->kernel(i,j);
131  }
132  second/=m;
133 
134  /* third term */
135  float64_t third=0;
136  for (index_t i=0; i<m; ++i)
137  {
138  for (index_t j=m_q_start; j<m_q_start+m; ++j)
139  third+=m_kernel->kernel(i,j);
140  }
141  third*=2.0/m;
142 
143  return first+second-third;
144 }
145 
147 {
148  if (!m_kernel)
149  SG_ERROR("%s::compute_statistic(): No kernel specified!\n", get_name());
150 
151  float64_t result=0;
152  switch (m_statistic_type)
153  {
154  case UNBIASED:
156  break;
157  case BIASED:
158  result=compute_biased_statistic();
159  break;
160  default:
161  SG_ERROR("CQuadraticTimeMMD::compute_statistic(): Unknown statistic "
162  "type!\n");
163  break;
164  }
165 
166  return result;
167 }
168 
170 {
171  float64_t result=0;
172 
174  {
175  case MMD2_SPECTRUM:
176  {
177 #ifdef HAVE_LAPACK
178  /* get samples from null-distribution and compute p-value of statistic */
181  CMath::qsort(null_samples);
182  index_t pos=CMath::find_position_to_insert(null_samples, statistic);
183  result=1.0-((float64_t)pos)/null_samples.vlen;
184 #else // HAVE_LAPACK
185  SG_ERROR("CQuadraticTimeMMD::compute_p_value(): Only possible if "
186  "shogun is compiled with LAPACK enabled\n");
187 #endif // HAVE_LAPACK
188  break;
189  }
190 
191  case MMD2_GAMMA:
192  {
193  /* fit gamma and return cdf at statistic */
195  result=CStatistics::gamma_cdf(statistic, params[0], params[1]);
196  break;
197  }
198 
199  default:
201  break;
202  }
203 
204  return result;
205 }
206 
208 {
209  float64_t result=0;
210 
212  {
213  case MMD2_SPECTRUM:
214  {
215 #ifdef HAVE_LAPACK
216  /* get samples from null-distribution and compute threshold */
219  CMath::qsort(null_samples);
220  result=null_samples[CMath::floor(null_samples.vlen*(1-alpha))];
221 #else // HAVE_LAPACK
222  SG_ERROR("CQuadraticTimeMMD::compute_threshold(): Only possible if "
223  "shogun is compiled with LAPACK enabled\n");
224 #endif // HAVE_LAPACK
225  break;
226  }
227 
228  case MMD2_GAMMA:
229  {
230  /* fit gamma and return inverse cdf at alpha */
232  result=CStatistics::inverse_gamma_cdf(alpha, params[0], params[1]);
233  break;
234  }
235 
236  default:
237  /* bootstrapping is handled here */
239  break;
240  }
241 
242  return result;
243 }
244 
245 
246 #ifdef HAVE_LAPACK
248  index_t num_eigenvalues)
249 {
251  {
252  SG_ERROR("%s::sample_null_spectrum(): Currently, only equal "
253  "sample sizes are supported\n", get_name());
254  }
255 
256  if (num_samples<=2)
257  {
258  SG_ERROR("%s::sample_null_spectrum(): Number of samples has to be at"
259  " least 2, better in the hundrets", get_name());
260  }
261 
262  if (num_eigenvalues>2*m_q_start-1)
263  {
264  SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too large\n",
265  get_name());
266  }
267 
268  if (num_eigenvalues<1)
269  {
270  SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too small\n",
271  get_name());
272  }
273 
274  /* evtl. warn user not to use wrong statistic type */
276  {
277  SG_WARNING("%s::sample_null_spectrum(): Note: provided statistic has "
278  "to be BIASED. Please ensure that! To get rid of warning,"
279  "call %s::set_statistic_type(BIASED)\n", get_name(),
280  get_name());
281  }
282 
283  /* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
284  * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
285  * works since X and Y are concatenated here */
288 
289  /* center matrix K=H*K*H */
290  K.center();
291 
292  /* compute eigenvalues and select num_eigenvalues largest ones */
293  SGVector<float64_t> eigenvalues=
295  SGVector<float64_t> largest_ev(num_eigenvalues);
296 
297  /* take largest EV, scale by 1/2/m on the fly and take abs value*/
298  for (index_t i=0; i<num_eigenvalues; ++i)
299  largest_ev[i]=CMath::abs(
300  1.0/2/m_q_start*eigenvalues[eigenvalues.vlen-1-i]);
301 
302  /* finally, sample from null distribution */
303  SGVector<float64_t> null_samples(num_samples);
304  for (index_t i=0; i<num_samples; ++i)
305  {
306  /* 2*sum(kEigs.*(randn(length(kEigs),1)).^2); */
307  null_samples[i]=0;
308  for (index_t j=0; j<largest_ev.vlen; ++j)
309  null_samples[i]+=largest_ev[j]*CMath::pow(CMath::randn_double(), 2);
310 
311  null_samples[i]*=2;
312  }
313 
314  return null_samples;
315 }
316 #endif // HAVE_LAPACK
317 
319 {
321  {
322  SG_ERROR("%s::compute_p_value_gamma(): Currently, only equal "
323  "sample sizes are supported\n", get_name());
324  }
325 
326  /* evtl. warn user not to use wrong statistic type */
328  {
329  SG_WARNING("%s::compute_p_value(): Note: provided statistic has "
330  "to be BIASED. Please ensure that! To get rid of warning,"
331  "call %s::set_statistic_type(BIASED)\n", get_name(),
332  get_name());
333  }
334 
335  /* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
336  * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
337  * works since X and Y are concatenated here */
339 
340  /* compute mean under H0 of MMD, which is
341  * meanMMD = 2/m * ( 1 - 1/m*sum(diag(KL)) );
342  * in MATLAB.
343  * Remove diagonals on the fly */
344  float64_t mean_mmd=0;
345  for (index_t i=0; i<m_q_start; ++i)
346  {
347  /* virtual KL matrix is in upper right corner of SHOGUN K matrix
348  * so this sums the diagonal of the matrix between X and Y*/
349  mean_mmd+=m_kernel->kernel(i, m_q_start+i);
350  }
351  mean_mmd=2.0/m_q_start*(1.0-1.0/m_q_start*mean_mmd);
352 
353  /* compute variance under H0 of MMD, which is
354  * varMMD = 2/m/(m-1) * 1/m/(m-1) * sum(sum( (K + L - KL - KL').^2 ));
355  * in MATLAB, so sum up all elements */
356  float64_t var_mmd=0;
357  for (index_t i=0; i<m_q_start; ++i)
358  {
359  for (index_t j=0; j<m_q_start; ++j)
360  {
361  /* dont add diagonal of all pairs of imaginary kernel matrices */
362  if (i==j || m_q_start+i==j || m_q_start+j==i)
363  continue;
364 
365  float64_t to_add=m_kernel->kernel(i, j);
366  to_add+=m_kernel->kernel(m_q_start+i, m_q_start+j);
367  to_add-=m_kernel->kernel(i, m_q_start+j);
368  to_add-=m_kernel->kernel(m_q_start+i, j);
369  var_mmd+=CMath::pow(to_add, 2);
370  }
371  }
372  var_mmd*=2.0/m_q_start/(m_q_start-1)*1.0/m_q_start/(m_q_start-1);
373 
374  /* parameters for gamma distribution */
375  float64_t a=CMath::pow(mean_mmd, 2)/var_mmd;
376  float64_t b=var_mmd*m_q_start / mean_mmd;
377 
378  SGVector<float64_t> result(2);
379  result[0]=a;
380  result[1]=b;
381 
382  return result;
383 }
384 
386  num_samples_spectrum)
387 {
388  m_num_samples_spectrum=num_samples_spectrum;
389 }
390 
392  index_t num_eigenvalues_spectrum)
393 {
394  m_num_eigenvalues_spectrum=num_eigenvalues_spectrum;
395 }
396 
398  statistic_type)
399 {
400  m_statistic_type=statistic_type;
401 }

SHOGUN Machine Learning Toolbox - Documentation