00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <shogun/statistics/HSIC.h>
00011 #include <shogun/features/Features.h>
00012 #include <shogun/mathematics/Statistics.h>
00013 #include <shogun/kernel/Kernel.h>
00014 #include <shogun/kernel/CustomKernel.h>
00015
00016 using namespace shogun;
00017
00018 CHSIC::CHSIC() :
00019 CKernelIndependenceTestStatistic()
00020 {
00021 init();
00022 }
00023
00024 CHSIC::CHSIC(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p_and_q,
00025 index_t m) :
00026 CKernelIndependenceTestStatistic(kernel_p, kernel_q, m_p_and_q, m)
00027 {
00028 if (p_and_q && p_and_q->get_num_vectors()/2!=m)
00029 {
00030 SG_ERROR("%s: Only features with equal number of vectors are currently "
00031 "possible\n", get_name());
00032 }
00033
00034 init();
00035 }
00036
00037 CHSIC::CHSIC(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p,
00038 CFeatures* q) :
00039 CKernelIndependenceTestStatistic(kernel_p, kernel_q, p, q)
00040 {
00041 if (p && q && p->get_num_vectors()!=q->get_num_vectors())
00042 {
00043 SG_ERROR("%s: Only features with equal number of vectors are currently "
00044 "possible\n", get_name());
00045 }
00046
00047 init();
00048 }
00049
00050 CHSIC::~CHSIC()
00051 {
00052
00053 }
00054
00055 void CHSIC::init()
00056 {
00057
00058 }
00059
00060 float64_t CHSIC::compute_statistic()
00061 {
00062 REQUIRE(m_kernel_p && m_kernel_q, "%s::fit_null_gamma(): No or only one "
00063 "kernel specified!\n", get_name());
00064
00065 REQUIRE(m_p_and_q, "%s::compute_statistic: features needed!\n", get_name());
00066
00067
00068 SGMatrix<float64_t> K=get_kernel_matrix_K();
00069 SGMatrix<float64_t> L=get_kernel_matrix_L();
00070
00071
00072 K.center();
00073
00074
00075 index_t m=m_m;
00076 float64_t result=0;
00077 for (index_t i=0; i<m; ++i)
00078 {
00079 for (index_t j=0; j<m; ++j)
00080 result+=K(j, i)*L(i, j);
00081 }
00082
00083
00084 result/=m;
00085
00086 return result;
00087 }
00088
00089 float64_t CHSIC::compute_p_value(float64_t statistic)
00090 {
00091 float64_t result=0;
00092 switch (m_null_approximation_method)
00093 {
00094 case HSIC_GAMMA:
00095 {
00096
00097 SGVector<float64_t> params=fit_null_gamma();
00098 result=CStatistics::gamma_cdf(statistic, params[0], params[1]);
00099 break;
00100 }
00101
00102 default:
00103
00104 result=CTwoDistributionsTestStatistic::compute_p_value(statistic);
00105 break;
00106 }
00107
00108 return result;
00109 }
00110
00111 float64_t CHSIC::compute_threshold(float64_t alpha)
00112 {
00113 float64_t result=0;
00114 switch (m_null_approximation_method)
00115 {
00116 case HSIC_GAMMA:
00117 {
00118
00119 SGVector<float64_t> params=fit_null_gamma();
00120 result=CStatistics::inverse_gamma_cdf(alpha, params[0], params[1]);
00121 break;
00122 }
00123
00124 default:
00125
00126 result=CTwoDistributionsTestStatistic::compute_threshold(alpha);
00127 break;
00128 }
00129
00130 return result;
00131 }
00132
00133 SGVector<float64_t> CHSIC::fit_null_gamma()
00134 {
00135 REQUIRE(m_kernel_p && m_kernel_q, "%s::fit_null_gamma(): No or only one "
00136 "kernel specified!\n", get_name());
00137
00138 REQUIRE(m_p_and_q, "%s::fit_null_gamma: features needed!\n", get_name());
00139
00140 index_t m=m_m;
00141
00142
00143 SGMatrix<float64_t> K=get_kernel_matrix_K();
00144 SGMatrix<float64_t> L=get_kernel_matrix_L();
00145
00146
00147 float64_t trace_K=0;
00148 float64_t trace_L=0;
00149 float64_t sum_K=0;
00150 float64_t sum_L=0;
00151 for (index_t i=0; i<m; ++i)
00152 {
00153 trace_K+=K(i,i);
00154 trace_L+=L(i,i);
00155 for (index_t j=0; j<m; ++j)
00156 {
00157 sum_K+=K(i,j);
00158 sum_L+=L(i,j);
00159 }
00160 }
00161 SG_DEBUG("sum_K: %f, sum_L: %f, trace_K: %f, trace_L: %f\n", sum_K, sum_L,
00162 trace_K, trace_L);
00163
00164
00165 K.center();
00166 L.center();
00167
00168
00169 float64_t trace=0;
00170 for (index_t i=0; i<m; ++i)
00171 trace+=CMath::pow(K(i,i)*L(i,i), 2);
00172
00173 trace/=36.0;
00174 SG_DEBUG("trace %f\n", trace);
00175
00176
00177 float64_t sum=0;
00178 for (index_t i=0; i<m; ++i)
00179 {
00180 for (index_t j=0; j<m; ++j)
00181 sum+=CMath::pow(K(i,j)*L(i,j), 2);
00182 }
00183 sum/=36.0;
00184 SG_DEBUG("sum %f\n", sum);
00185
00186
00187
00188 float64_t var_hsic=1.0/m/(m-1)*(sum-trace);
00189 SG_DEBUG("1.0/m/(m-1)*(sum-trace): %f\n", var_hsic);
00190
00191
00192
00193 var_hsic=72.0*(m-4)*(m-5)/m/(m-1)/(m-2)/(m-3)*var_hsic;
00194 SG_DEBUG("var_hsic: %f\n", var_hsic);
00195
00196
00197
00198 float64_t mu_x=1.0/m/(m-1)*(sum_K-trace_K);
00199 float64_t mu_y=1.0/m/(m-1)*(sum_L-trace_L);
00200 SG_DEBUG("mu_x: %f, mu_y: %f\n", mu_x, mu_y);
00201
00202
00203 float64_t m_hsic=1.0/m*(1+mu_x*mu_y-mu_x-mu_y);
00204 SG_DEBUG("m_hsic: %f\n", m_hsic);
00205
00206
00207 float64_t a=CMath::pow(m_hsic, 2)/var_hsic;
00208 float64_t b=var_hsic*m/m_hsic;
00209 SG_DEBUG("a: %f, b: %f\n", a, b);
00210
00211 SGVector<float64_t> result(2);
00212 result[0]=a;
00213 result[1]=b;
00214
00215 SG_DEBUG(("leaving %s::fit_null_gamma()\n"), get_name());
00216 return result;
00217 }
00218
00219 SGMatrix<float64_t> CHSIC::get_kernel_matrix_K()
00220 {
00221 SGMatrix<float64_t> K;
00222
00223
00224 SGVector<index_t> subset(m_m);
00225 subset.range_fill();
00226
00227
00228 if (m_kernel_p->get_kernel_type()==K_CUSTOM)
00229 {
00230
00231 CCustomKernel* custom_kernel_p=(CCustomKernel*)m_kernel_p;
00232 custom_kernel_p->add_row_subset(subset);
00233 custom_kernel_p->add_col_subset(subset);
00234
00235 K=custom_kernel_p->get_kernel_matrix();
00236
00237 custom_kernel_p->remove_row_subset();
00238 custom_kernel_p->remove_col_subset();
00239 }
00240 else
00241 {
00242 m_p_and_q->add_subset(subset);
00243 m_kernel_p->init(m_p_and_q, m_p_and_q);
00244 K=m_kernel_p->get_kernel_matrix();
00245 m_p_and_q->remove_subset();
00246
00247
00248 m_kernel_p->init(m_p_and_q, m_p_and_q);
00249 }
00250
00251 return K;
00252 }
00253
00254 SGMatrix<float64_t> CHSIC::get_kernel_matrix_L()
00255 {
00256 SGMatrix<float64_t> L;
00257
00258
00259 SGVector<index_t> subset(m_m);
00260 subset.range_fill();
00261 subset.add(m_m);
00262
00263
00264 if (m_kernel_q->get_kernel_type()==K_CUSTOM)
00265 {
00266
00267 CCustomKernel* custom_kernel_q=(CCustomKernel*)m_kernel_q;
00268 custom_kernel_q->add_row_subset(subset);
00269 custom_kernel_q->add_col_subset(subset);
00270
00271 L=custom_kernel_q->get_kernel_matrix();
00272
00273 custom_kernel_q->remove_row_subset();
00274 custom_kernel_q->remove_col_subset();
00275 }
00276 else
00277 {
00278 m_p_and_q->add_subset(subset);
00279 m_kernel_q->init(m_p_and_q, m_p_and_q);
00280 L=m_kernel_q->get_kernel_matrix();
00281 m_p_and_q->remove_subset();
00282
00283
00284 m_kernel_q->init(m_p_and_q, m_p_and_q);
00285 }
00286
00287 return L;
00288 }
00289
00290 SGVector<float64_t> CHSIC::bootstrap_null()
00291 {
00292 SG_DEBUG("entering CHSIC::bootstrap_null()\n");
00293
00294
00295
00296
00297
00298 CKernel* kernel_p=m_kernel_p;
00299 CKernel* kernel_q=m_kernel_q;
00300
00301
00302 m_kernel_p->init(m_p_and_q, m_p_and_q);
00303 m_kernel_q->init(m_p_and_q, m_p_and_q);
00304
00305
00306 CCustomKernel* precomputed_p=new CCustomKernel(m_kernel_p);
00307 CCustomKernel* precomputed_q=new CCustomKernel(m_kernel_q);
00308 SG_REF(precomputed_p);
00309 SG_REF(precomputed_q);
00310
00311
00312 m_kernel_p=precomputed_p;
00313 m_kernel_q=precomputed_q;
00314
00315
00316 SGVector<float64_t> null_samples=
00317 CKernelIndependenceTestStatistic::bootstrap_null();
00318
00319
00320 m_kernel_p=kernel_p;
00321 m_kernel_q=kernel_q;
00322
00323 SG_UNREF(precomputed_p);
00324 SG_UNREF(precomputed_q);
00325
00326
00327 SG_DEBUG("leaving CHSIC::bootstrap_null()\n");
00328 return null_samples;
00329 }