SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
MMDKernelSelectionMedian.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Heiko Strathmann
8  */
9 
18 
19 
20 using namespace shogun;
21 
24 {
25  init();
26 }
27 
29  CKernelTwoSampleTest* mmd, index_t num_data_distance) :
31 {
32  /* assert that a combined kernel is used */
33  CKernel* kernel=mmd->get_kernel();
34  CFeatures* lhs=kernel->get_lhs();
35  CFeatures* rhs=kernel->get_rhs();
36  REQUIRE(kernel, "%s::%s(): No kernel set!\n", get_name(), get_name());
37  REQUIRE(kernel->get_kernel_type()==K_COMBINED, "%s::%s(): Requires "
38  "CombinedKernel as kernel. Yours is %s", get_name(), get_name(),
39  kernel->get_name());
40 
41  /* assert that all subkernels are Gaussian kernels */
42  CCombinedKernel* combined=(CCombinedKernel*)kernel;
43 
44  for (index_t k_idx=0; k_idx<combined->get_num_kernels(); k_idx++)
45  {
46  CKernel* subkernel=combined->get_kernel(k_idx);
47  REQUIRE(kernel, "%s::%s(): Subkernel (%d) of current kernel is not"
48  " of type GaussianKernel\n", get_name(), get_name(), k_idx);
49  SG_UNREF(subkernel);
50  }
51 
52  /* assert 64 bit dense features since EuclideanDistance can only handle
53  * those */
55  {
56  CFeatures* features=((CQuadraticTimeMMD*)m_estimator)->get_p_and_q();
57  REQUIRE(features->get_feature_class()==C_DENSE &&
58  features->get_feature_type()==F_DREAL, "%s::select_kernel(): "
59  "Only 64 bit float dense features allowed, these are \"%s\""
60  " and of type %d\n",
61  get_name(), features->get_name(), features->get_feature_type());
62  SG_UNREF(features);
63  }
65  {
66  CStreamingFeatures* p=((CLinearTimeMMD*)m_estimator)->get_streaming_p();
67  CStreamingFeatures* q=((CLinearTimeMMD*)m_estimator)->get_streaming_q();
69  p->get_feature_type()==F_DREAL, "%s::select_kernel(): "
70  "Only 64 bit float streaming dense features allowed, these (p) "
71  "are \"%s\" and of type %d\n",
72  get_name(), p->get_name(), p->get_feature_type());
73 
75  p->get_feature_type()==F_DREAL, "%s::select_kernel(): "
76  "Only 64 bit float streaming dense features allowed, these (q) "
77  "are \"%s\" and of type %d\n",
78  get_name(), q->get_name(), q->get_feature_type());
79  SG_UNREF(p);
80  SG_UNREF(q);
81  }
82 
83  SG_UNREF(kernel);
84  SG_UNREF(lhs);
85  SG_UNREF(rhs);
86 
87  init();
88 
89  m_num_data_distance=num_data_distance;
90 }
91 
93 {
94 }
95 
96 void CMMDKernelSelectionMedian::init()
97 {
98  SG_ADD(&m_num_data_distance, "m_num_data_distance", "Number of elements to "
99  "to compute median distance on", MS_NOT_AVAILABLE);
100 
101  /* this is a sensible value */
102  m_num_data_distance=1000;
103 }
104 
106 {
107  SG_ERROR("%s::compute_measures(): Not implemented. Use select_kernel() "
108  "method!\n", get_name());
109  return SGVector<float64_t>();
110 }
111 
113 {
114  /* number of data for distace */
116 
117  SGMatrix<float64_t> dists;
118 
119  /* compute all pairwise distances, depends which mmd statistic is used */
121  {
122  /* fixed data, create merged copy of a random subset */
123 
124  /* create vector with that correspond to the num_data first points of
125  * each distribution, remember data is stored jointly */
126  SGVector<index_t> subset(num_data*2);
128  for (index_t i=0; i<num_data; ++i)
129  {
130  /* num_data samples from each half of joint sample */
131  subset[i]=i;
132  subset[i+num_data]=i+m;
133  }
134 
135  /* add subset and compute pairwise distances */
137  CFeatures* features=quad_mmd->get_p_and_q();
138  features->add_subset(subset);
139 
140  /* cast is safe, see constructor */
141  CDenseFeatures<float64_t>* dense_features=
142  (CDenseFeatures<float64_t>*) features;
143 
144  CEuclideanDistance* distance=new CEuclideanDistance(dense_features,
145  dense_features);
146  dists=distance->get_distance_matrix();
147  features->remove_subset();
148  SG_UNREF(distance);
149  SG_UNREF(features);
150  }
152  {
153  /* just stream the desired number of points */
155 
156  CStreamingFeatures* p=linear_mmd->get_streaming_p();
157  CStreamingFeatures* q=linear_mmd->get_streaming_q();
158 
159  /* cast is safe, see constructor */
161  p->get_streamed_features(num_data);
163  q->get_streamed_features(num_data);
164 
165  /* for safety */
166  SG_REF(p_streamed);
167  SG_REF(q_streamed);
168 
169  /* create merged feature object */
171  p_streamed->create_merged_copy(q_streamed);
172 
173  /* compute pairwise distances */
174  CEuclideanDistance* distance=new CEuclideanDistance(merged, merged);
175  dists=distance->get_distance_matrix();
176 
177  /* clean up */
178  SG_UNREF(distance);
179  SG_UNREF(p_streamed);
180  SG_UNREF(q_streamed);
181  SG_UNREF(p);
182  SG_UNREF(q);
183  }
184 
185  /* create a vector where the zeros have been removed, use upper triangle
186  * only since distances are symmetric */
187  SGVector<float64_t> dist_vec(dists.num_rows*(dists.num_rows-1)/2);
188  index_t write_idx=0;
189  for (index_t i=0; i<dists.num_rows; ++i)
190  {
191  for (index_t j=i+1; j<dists.num_rows; ++j)
192  dist_vec[write_idx++]=dists(i,j);
193  }
194 
195  /* now we have distance matrix, compute median, allow to modify matrix */
196  CMath::qsort<float64_t>(dist_vec.vector, dist_vec.vlen);
197  float64_t median_distance=dist_vec[dist_vec.vlen/2];
198  SG_DEBUG("median_distance: %f\n", median_distance);
199 
200  /* shogun has no square and factor two in its kernel width, MATLAB does
201  * median_width = sqrt(0.5*median_distance), we do this */
202  float64_t shogun_sigma=median_distance;
203  SG_DEBUG("kernel width (shogun): %f\n", shogun_sigma);
204 
205  /* now of all kernels, find the one which has its width closest
206  * Cast is safe due to constructor of MMDKernelSelection class */
208  float64_t min_distance=CMath::MAX_REAL_NUMBER;
209  CKernel* min_kernel=NULL;
211  for (index_t i=0; i<combined->get_num_subkernels(); ++i)
212  {
213  CKernel* current=combined->get_kernel(i);
214  REQUIRE(current->get_kernel_type()==K_GAUSSIAN, "%s::select_kernel(): "
215  "%d-th kernel is not a Gaussian but \"%s\"!\n", get_name(), i,
216  current->get_name());
217 
218  /* check if width is closer to median width */
219  distance=CMath::abs(((CGaussianKernel*)current)->get_width()-
220  shogun_sigma);
221 
222  if (distance<min_distance)
223  {
224  min_distance=distance;
225  min_kernel=current;
226  }
227 
228  /* next kernel */
229  SG_UNREF(current);
230  }
231  SG_UNREF(combined);
232 
233  /* returned referenced kernel */
234  SG_REF(min_kernel);
235  return min_kernel;
236 }
virtual const char * get_name() const =0
float distance(CJLCoverTreePoint p1, CJLCoverTreePoint p2, float64_t upper_bound)
virtual CStreamingFeatures * get_streaming_q()
virtual CStreamingFeatures * get_streaming_p()
int32_t index_t
Definition: common.h:62
virtual CFeatures * get_streamed_features(index_t num_elements)
CFeatures * get_rhs()
Definition: Kernel.h:511
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
Base class for kernel selection for MMD-based two-sample test statistic implementations. Provides abstract methods for selecting kernels and computing criteria or kernel weights for the implemented method. In order to implement new methods for kernel selection, simply write a new implementation of this class.
Kernel two sample test base class. Provides an interface for performing a two-sample test using a ker...
#define SG_REF(x)
Definition: SGObject.h:51
virtual SGVector< float64_t > compute_measures()
This class implements the quadratic time Maximum Mean Statistic as described in [1]. The MMD is the distance of two probability distributions and in a RKHS which we denote by .
index_t vlen
Definition: SGVector.h:492
virtual const char * get_name() const
CKernel * get_kernel(int32_t idx)
double float64_t
Definition: common.h:50
virtual EFeatureClass get_feature_class() const =0
The Combined kernel is used to combine a number of kernels into a single CombinedKernel object by lin...
CFeatures * create_merged_copy(CList *other)
The well known Gaussian kernel (swiss army knife for SVMs) computed on CDotFeatures.
CKernelTwoSampleTest * m_estimator
#define SG_UNREF(x)
Definition: SGObject.h:52
#define SG_DEBUG(...)
Definition: SGIO.h:107
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void remove_subset()
Definition: Features.cpp:322
virtual EKernelType get_kernel_type()=0
The class Features is the base class of all feature objects.
Definition: Features.h:68
static T min(T a, T b)
Definition: Math.h:157
Streaming features are features which are used for online algorithms.
This class implements the linear time Maximum Mean Statistic as described in [1] for streaming data (...
Definition: LinearTimeMMD.h:66
virtual EStatisticType get_statistic_type() const =0
The Kernel base class.
Definition: Kernel.h:159
SGMatrix< float64_t > get_distance_matrix()
Definition: Distance.h:156
#define SG_ADD(...)
Definition: SGObject.h:81
virtual CFeatures * get_p_and_q()
class EuclideanDistance
static const float64_t MAX_REAL_NUMBER
Definition: Math.h:2061
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:310
virtual EFeatureType get_feature_type() const =0
static T abs(T a)
Definition: Math.h:179
CFeatures * get_lhs()
Definition: Kernel.h:505

SHOGUN Machine Learning Toolbox - Documentation