SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
MMDKernelSelectionMedian.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Heiko Strathmann
8  */
9 
18 
19 
20 using namespace shogun;
21 
24 {
25  init();
26 }
27 
29  CKernelTwoSampleTest* mmd, index_t num_data_distance) :
31 {
32  /* assert that a combined kernel is used */
33  CKernel* kernel=mmd->get_kernel();
34  CFeatures* lhs=kernel->get_lhs();
35  CFeatures* rhs=kernel->get_rhs();
36  REQUIRE(kernel, "%s::%s(): No kernel set!\n", get_name(), get_name());
37  REQUIRE(kernel->get_kernel_type()==K_COMBINED, "%s::%s(): Requires "
38  "CombinedKernel as kernel. Yours is %s", get_name(), get_name(),
39  kernel->get_name());
40 
41  /* assert that all subkernels are Gaussian kernels */
42  CCombinedKernel* combined=(CCombinedKernel*)kernel;
43 
44  for (index_t k_idx=0; k_idx<combined->get_num_kernels(); k_idx++)
45  {
46  CKernel* subkernel=combined->get_kernel(k_idx);
47  REQUIRE(kernel, "%s::%s(): Subkernel (%d) of current kernel is not"
48  " of type GaussianKernel\n", get_name(), get_name(), k_idx);
49  SG_UNREF(subkernel);
50  }
51 
52  /* assert 64 bit dense features since EuclideanDistance can only handle
53  * those */
55  {
56  CFeatures* features=((CQuadraticTimeMMD*)m_estimator)->get_p_and_q();
57  REQUIRE(features->get_feature_class()==C_DENSE &&
58  features->get_feature_type()==F_DREAL, "%s::select_kernel(): "
59  "Only 64 bit float dense features allowed, these are \"%s\""
60  " and of type %d\n",
61  get_name(), features->get_name(), features->get_feature_type());
62  SG_UNREF(features);
63  }
65  {
66  CStreamingFeatures* p=((CLinearTimeMMD*)m_estimator)->get_streaming_p();
67  CStreamingFeatures* q=((CLinearTimeMMD*)m_estimator)->get_streaming_q();
69  p->get_feature_type()==F_DREAL, "%s::select_kernel(): "
70  "Only 64 bit float streaming dense features allowed, these (p) "
71  "are \"%s\" and of type %d\n",
72  get_name(), p->get_name(), p->get_feature_type());
73 
75  p->get_feature_type()==F_DREAL, "%s::select_kernel(): "
76  "Only 64 bit float streaming dense features allowed, these (q) "
77  "are \"%s\" and of type %d\n",
78  get_name(), q->get_name(), q->get_feature_type());
79  SG_UNREF(p);
80  SG_UNREF(q);
81  }
82 
83  SG_UNREF(kernel);
84  SG_UNREF(lhs);
85  SG_UNREF(rhs);
86 
87  init();
88 
89  m_num_data_distance=num_data_distance;
90 }
91 
93 {
94 }
95 
96 void CMMDKernelSelectionMedian::init()
97 {
98  SG_ADD(&m_num_data_distance, "m_num_data_distance", "Number of elements to "
99  "to compute median distance on", MS_NOT_AVAILABLE);
100 
101  /* this is a sensible value */
102  m_num_data_distance=1000;
103 }
104 
106 {
107  SG_ERROR("%s::compute_measures(): Not implemented. Use select_kernel() "
108  "method!\n", get_name());
109  return SGVector<float64_t>();
110 }
111 
113 {
114  /* number of data for distace */
116 
117  SGMatrix<float64_t> dists;
118 
119  /* compute all pairwise distances, depends which mmd statistic is used */
121  {
122  /* fixed data, create merged copy of a random subset */
123 
124  /* create vector with that correspond to the num_data first points of
125  * each distribution, remember data is stored jointly */
126  SGVector<index_t> subset(num_data*2);
128  for (index_t i=0; i<num_data; ++i)
129  {
130  /* num_data samples from each half of joint sample */
131  subset[i]=i;
132  subset[i+num_data]=i+m;
133  }
134 
135  /* add subset and compute pairwise distances */
137  CFeatures* features=quad_mmd->get_p_and_q();
138  features->add_subset(subset);
139 
140  /* cast is safe, see constructor */
141  CDenseFeatures<float64_t>* dense_features=
142  (CDenseFeatures<float64_t>*) features;
143 
144  CEuclideanDistance* distance=new CEuclideanDistance(dense_features,
145  dense_features);
146  dists=distance->get_distance_matrix();
147  features->remove_subset();
148  SG_UNREF(distance);
149  SG_UNREF(features);
150  }
152  {
153  /* just stream the desired number of points */
155 
156  CStreamingFeatures* p=linear_mmd->get_streaming_p();
157  CStreamingFeatures* q=linear_mmd->get_streaming_q();
158 
159  /* cast is safe, see constructor */
161  p->get_streamed_features(num_data);
163  q->get_streamed_features(num_data);
164 
165  /* for safety */
166  SG_REF(p_streamed);
167  SG_REF(q_streamed);
168 
169  /* create merged feature object */
171  p_streamed->create_merged_copy(q_streamed);
172 
173  /* compute pairwise distances */
174  CEuclideanDistance* distance=new CEuclideanDistance(merged, merged);
175  dists=distance->get_distance_matrix();
176 
177  /* clean up */
178  SG_UNREF(distance);
179  SG_UNREF(p_streamed);
180  SG_UNREF(q_streamed);
181  SG_UNREF(p);
182  SG_UNREF(q);
183  }
184 
185  /* create a vector where the zeros have been removed, use upper triangle
186  * only since distances are symmetric */
187  SGVector<float64_t> dist_vec(dists.num_rows*(dists.num_rows-1)/2);
188  index_t write_idx=0;
189  for (index_t i=0; i<dists.num_rows; ++i)
190  {
191  for (index_t j=i+1; j<dists.num_rows; ++j)
192  dist_vec[write_idx++]=dists(i,j);
193  }
194 
195  /* now we have distance matrix, compute median, allow to modify matrix */
196  float64_t median_distance=CStatistics::median(dist_vec, true);
197  SG_DEBUG("median_distance: %f\n", median_distance);
198 
199  /* shogun has no square and factor two in its kernel width, MATLAB does
200  * median_width = sqrt(0.5*median_distance), we do this */
201  float64_t shogun_sigma=median_distance;
202  SG_DEBUG("kernel width (shogun): %f\n", shogun_sigma);
203 
204  /* now of all kernels, find the one which has its width closest
205  * Cast is safe due to constructor of MMDKernelSelection class */
207  float64_t min_distance=CMath::MAX_REAL_NUMBER;
208  CKernel* min_kernel=NULL;
210  for (index_t i=0; i<combined->get_num_subkernels(); ++i)
211  {
212  CKernel* current=combined->get_kernel(i);
213  REQUIRE(current->get_kernel_type()==K_GAUSSIAN, "%s::select_kernel(): "
214  "%d-th kernel is not a Gaussian but \"%s\"!\n", get_name(), i,
215  current->get_name());
216 
217  /* check if width is closer to median width */
218  distance=CMath::abs(((CGaussianKernel*)current)->get_width()-
219  shogun_sigma);
220 
221  if (distance<min_distance)
222  {
223  min_distance=distance;
224  min_kernel=current;
225  }
226 
227  /* next kernel */
228  SG_UNREF(current);
229  }
230  SG_UNREF(combined);
231 
232  /* returned referenced kernel */
233  SG_REF(min_kernel);
234  return min_kernel;
235 }
virtual const char * get_name() const =0
float distance(CJLCoverTreePoint p1, CJLCoverTreePoint p2, float64_t upper_bound)
virtual CStreamingFeatures * get_streaming_q()
virtual CStreamingFeatures * get_streaming_p()
int32_t index_t
Definition: common.h:62
virtual CFeatures * get_streamed_features(index_t num_elements)
CFeatures * get_rhs()
Definition: Kernel.h:510
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
Base class for kernel selection for MMD-based two-sample test statistic implementations. Provides abstract methods for selecting kernels and computing criteria or kernel weights for the implemented method. In order to implement new methods for kernel selection, simply write a new implementation of this class.
static float64_t median(SGVector< float64_t > values, bool modify=false, bool in_place=false)
Definition: Statistics.cpp:40
Kernel two sample test base class. Provides an interface for performing a two-sample test using a ker...
#define SG_REF(x)
Definition: SGObject.h:51
virtual SGVector< float64_t > compute_measures()
This class implements the quadratic time Maximum Mean Statistic as described in [1]. The MMD is the distance of two probability distributions and in a RKHS which we denote by .
virtual const char * get_name() const
CKernel * get_kernel(int32_t idx)
double float64_t
Definition: common.h:50
virtual EFeatureClass get_feature_class() const =0
The Combined kernel is used to combine a number of kernels into a single CombinedKernel object by lin...
CFeatures * create_merged_copy(CList *other)
The well known Gaussian kernel (swiss army knife for SVMs) computed on CDotFeatures.
CKernelTwoSampleTest * m_estimator
#define SG_UNREF(x)
Definition: SGObject.h:52
#define SG_DEBUG(...)
Definition: SGIO.h:107
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void remove_subset()
Definition: Features.cpp:322
virtual EKernelType get_kernel_type()=0
The class Features is the base class of all feature objects.
Definition: Features.h:68
static T min(T a, T b)
Definition: Math.h:157
Streaming features are features which are used for online algorithms.
This class implements the linear time Maximum Mean Statistic as described in [1] for streaming data (...
Definition: LinearTimeMMD.h:66
virtual EStatisticType get_statistic_type() const =0
The Kernel base class.
Definition: Kernel.h:158
SGMatrix< float64_t > get_distance_matrix()
Definition: Distance.h:127
#define SG_ADD(...)
Definition: SGObject.h:81
virtual CFeatures * get_p_and_q()
class EuclideanDistance
static const float64_t MAX_REAL_NUMBER
Definition: Math.h:2061
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:310
virtual EFeatureType get_feature_type() const =0
static T abs(T a)
Definition: Math.h:179
CFeatures * get_lhs()
Definition: Kernel.h:504

SHOGUN Machine Learning Toolbox - Documentation