TwoDistributionsTestStatistic.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2012 Heiko Strathmann
00008  */
00009 
00010 #include <shogun/statistics/TwoDistributionsTestStatistic.h>
00011 #include <shogun/features/Features.h>
00012 
00013 using namespace shogun;
00014 
00015 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic() :
00016         CTestStatistic()
00017 {
00018     init();
00019 }
00020 
00021 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic(
00022         CFeatures* p_and_q,
00023         index_t m) : CTestStatistic()
00024 {
00025     init();
00026 
00027     m_p_and_q=p_and_q;
00028     SG_REF(m_p_and_q);
00029 
00030     m_m=m;
00031 }
00032 
00033 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic(
00034         CFeatures* p, CFeatures* q) :
00035         CTestStatistic()
00036 {
00037     init();
00038 
00039     m_p_and_q=p->create_merged_copy(q);
00040     SG_REF(m_p_and_q);
00041 
00042     m_m=p->get_num_vectors();
00043 }
00044 
00045 CTwoDistributionsTestStatistic::~CTwoDistributionsTestStatistic()
00046 {
00047     SG_UNREF(m_p_and_q);
00048 }
00049 
00050 void CTwoDistributionsTestStatistic::init()
00051 {
00052     SG_ADD((CSGObject**)&m_p_and_q, "p_and_q", "Concatenated samples p and q",
00053             MS_NOT_AVAILABLE);
00054     SG_ADD(&m_m, "m", "Index of first sample of q",
00055             MS_NOT_AVAILABLE);
00056 
00057     m_p_and_q=NULL;
00058     m_m=0;
00059 }
00060 
00061 SGVector<float64_t> CTwoDistributionsTestStatistic::bootstrap_null()
00062 {
00063     SG_DEBUG("entering CTwoDistributionsTestStatistic::bootstrap_null()\n");
00064 
00065     REQUIRE(m_p_and_q, "CTwoDistributionsTestStatistic::bootstrap_null(): "
00066             "No appended features p and q!\n");
00067 
00068     /* compute bootstrap statistics for null distribution */
00069     SGVector<float64_t> results(m_bootstrap_iterations);
00070 
00071     /* memory for index permutations, (would slow down loop) */
00072     SGVector<index_t> ind_permutation(2*m_m);
00073     ind_permutation.range_fill();
00074     m_p_and_q->add_subset(ind_permutation);
00075 
00076     for (index_t i=0; i<m_bootstrap_iterations; ++i)
00077     {
00078         /* idea: merge features of p and q, shuffle, and compute statistic.
00079          * This is done using subsets here */
00080 
00081         /* create index permutation and add as subset. This will mix samples
00082          * from p and q */
00083         SGVector<int32_t>::permute_vector(ind_permutation);
00084 
00085         /* compute statistic for this permutation of mixed samples */
00086         results[i]=compute_statistic();
00087     }
00088 
00089     /* clean up */
00090     m_p_and_q->remove_subset();
00091 
00092     SG_DEBUG("leaving CTwoDistributionsTestStatistic::bootstrap_null()\n");
00093     return results;
00094 }
00095 
00096 float64_t CTwoDistributionsTestStatistic::compute_p_value(
00097         float64_t statistic)
00098 {
00099     float64_t result=0;
00100 
00101     if (m_null_approximation_method==BOOTSTRAP)
00102     {
00103         /* bootstrap a bunch of MMD values from null distribution */
00104         SGVector<float64_t> values=bootstrap_null();
00105 
00106         /* find out percentile of parameter "statistic" in null distribution */
00107         CMath::qsort(values);
00108         float64_t i=CMath::find_position_to_insert(values, statistic);
00109 
00110         /* return corresponding p-value */
00111         result=1.0-i/values.vlen;
00112     }
00113     else
00114     {
00115         SG_ERROR("CTwoDistributionsTestStatistics::compute_p_value(): Unknown"
00116                 "method to approximate null distribution!\n");
00117     }
00118 
00119     return result;
00120 }
00121 
00122 float64_t CTwoDistributionsTestStatistic::compute_threshold(
00123         float64_t alpha)
00124 {
00125     float64_t result=0;
00126 
00127     if (m_null_approximation_method==BOOTSTRAP)
00128     {
00129         /* bootstrap a bunch of MMD values from null distribution */
00130         SGVector<float64_t> values=bootstrap_null();
00131 
00132         /* return value of (1-alpha) quantile */
00133         result=values[CMath::floor(values.vlen*(1-alpha))];
00134     }
00135     else
00136     {
00137         SG_ERROR("CTwoDistributionsTestStatistics::compute_threshold():"
00138                 "Unknown method to approximate null distribution!\n");
00139     }
00140 
00141     return result;
00142 }
00143 
00144 void CTwoDistributionsTestStatistic::set_p_and_q(CFeatures* p_and_q)
00145 {
00146     /* ref before unref to avoid problems when instances are equal */
00147     SG_REF(p_and_q);
00148     SG_UNREF(m_p_and_q);
00149     m_p_and_q=p_and_q;
00150 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation