00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2012 Heiko Strathmann 00008 */ 00009 00010 #ifndef __QUADRACTIMEMMD_H_ 00011 #define __QUADRACTIMEMMD_H_ 00012 00013 #include <shogun/statistics/KernelTwoSampleTestStatistic.h> 00014 00015 namespace shogun 00016 { 00017 00018 class CFeatures; 00019 class CKernel; 00020 00022 enum EQuadraticMMDType 00023 { 00024 BIASED, UNBIASED 00025 }; 00026 00084 class CQuadraticTimeMMD : public CKernelTwoSampleTestStatistic 00085 { 00086 public: 00087 CQuadraticTimeMMD(); 00088 00099 CQuadraticTimeMMD(CKernel* kernel, CFeatures* p_and_q, index_t m); 00100 00112 CQuadraticTimeMMD(CKernel* kernel, CFeatures* p, CFeatures* q); 00113 00114 virtual ~CQuadraticTimeMMD(); 00115 00122 virtual float64_t compute_statistic(); 00123 00135 virtual float64_t compute_p_value(float64_t statistic); 00136 00147 virtual float64_t compute_threshold(float64_t alpha); 00148 00149 virtual const char* get_name() const 00150 { 00151 return "QuadraticTimeMMD"; 00152 }; 00153 00154 #ifdef HAVE_LAPACK 00155 /* returns a set of samples of an estimate of the null distribution 00156 * using the Eigen-spectrum of the centered kernel matrix of the merged 00157 * samples of p and q. May be used to compute p_value (easy) 00158 * 00159 * kernel matrix needs to be stored in memory 00160 * 00161 * Note that the provided statistic HAS to be the biased version 00162 * (see paper for details). Note that m*Null-distribution is returned, 00163 * which is fine since the statistic is also m*MMD: 00164 * 00165 * Works well if the kernel matrix is NOT diagonal dominant. 00166 * See Gretton, A., Fukumizu, K., & Harchaoui, Z. (2011). 00167 * A fast, consistent kernel two-sample test. 00168 * 00169 * @param num_samples number of samples to draw 00170 * @param num_eigenvalues number of eigenvalues to use to draw samples 00171 * Maximum number of 2m-1 where m is the size of both sets of samples. 00172 * It is usually safe to use a smaller number since they decay very 00173 * fast, however, a conservative approach would be to use all (-1 does 00174 * this). See paper for details. 00175 * @return samples from the estimated null distribution 00176 */ 00177 SGVector<float64_t> sample_null_spectrum(index_t num_samples, 00178 index_t num_eigenvalues); 00179 #endif // HAVE_LAPACK 00180 00187 void set_num_samples_sepctrum(index_t num_samples_spectrum); 00188 00195 void set_num_eigenvalues_spectrum(index_t num_eigenvalues_spectrum); 00196 00198 void set_statistic_type(EQuadraticMMDType statistic_type); 00199 00220 SGVector<float64_t> fit_null_gamma(); 00221 00222 protected: 00224 virtual float64_t compute_unbiased_statistic(); 00225 00227 virtual float64_t compute_biased_statistic(); 00228 00229 private: 00230 void init(); 00231 00232 protected: 00234 index_t m_num_samples_spectrum; 00235 00237 index_t m_num_eigenvalues_spectrum; 00238 00240 EQuadraticMMDType m_statistic_type; 00241 }; 00242 00243 } 00244 00245 #endif /* __QUADRACTIMEMMD_H_ */