SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MMDKernelSelectionMedian.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Heiko Strathmann
8  */
9 
18 
19 
20 using namespace shogun;
21 
24 {
25  init();
26 }
27 
29  CKernelTwoSampleTestStatistic* mmd, index_t num_data_distance) :
31 {
32  /* assert that a combined kernel is used */
33  CKernel* kernel=mmd->get_kernel();
34  CFeatures* lhs=kernel->get_lhs();
35  CFeatures* rhs=kernel->get_rhs();
36  REQUIRE(kernel, "%s::%s(): No kernel set!\n", get_name(), get_name());
37  REQUIRE(kernel->get_kernel_type()==K_COMBINED, "%s::%s(): Requires "
38  "CombinedKernel as kernel. Yours is %s", get_name(), get_name(),
39  kernel->get_name());
40 
41  /* assert that all subkernels are Gaussian kernels */
42  CCombinedKernel* combined=(CCombinedKernel*)kernel;
43 
44  for (index_t k_idx=0; k_idx<combined->get_num_kernels(); k_idx++)
45  {
46  CKernel* subkernel=combined->get_kernel(k_idx);
47  REQUIRE(kernel, "%s::%s(): Subkernel (%d) of current kernel is not"
48  " of type GaussianKernel\n", get_name(), get_name(), k_idx);
49  SG_UNREF(subkernel);
50  }
51 
52  /* assert 64 bit dense features since EuclideanDistance can only handle
53  * those */
55  {
56  CFeatures* features=((CQuadraticTimeMMD*)m_mmd)->get_p_and_q();
57  REQUIRE(features->get_feature_class()==C_DENSE &&
58  features->get_feature_type()==F_DREAL, "%s::select_kernel(): "
59  "Only 64 bit float dense features allowed, these are \"%s\""
60  " and of type %d\n",
61  get_name(), features->get_name(), features->get_feature_type());
62  SG_UNREF(features);
63  }
65  {
66  CStreamingFeatures* p=((CLinearTimeMMD*)m_mmd)->get_streaming_p();
67  CStreamingFeatures* q=((CLinearTimeMMD*)m_mmd)->get_streaming_q();
69  p->get_feature_type()==F_DREAL, "%s::select_kernel(): "
70  "Only 64 bit float streaming dense features allowed, these (p) "
71  "are \"%s\" and of type %d\n",
72  get_name(), p->get_name(), p->get_feature_type());
73 
75  p->get_feature_type()==F_DREAL, "%s::select_kernel(): "
76  "Only 64 bit float streaming dense features allowed, these (q) "
77  "are \"%s\" and of type %d\n",
78  get_name(), q->get_name(), q->get_feature_type());
79  SG_UNREF(p);
80  SG_UNREF(q);
81  }
82 
83  SG_UNREF(kernel);
84  SG_UNREF(lhs);
85  SG_UNREF(rhs);
86 
87  init();
88 
89  m_num_data_distance=num_data_distance;
90 }
91 
93 {
94 }
95 
96 void CMMDKernelSelectionMedian::init()
97 {
98  SG_ADD(&m_num_data_distance, "m_num_data_distance", "Number of elements to "
99  "to compute median distance on", MS_NOT_AVAILABLE);
100 
101  /* this is a sensible value */
102  m_num_data_distance=1000;
103 }
104 
106 {
107  SG_ERROR("%s::compute_measures(): Not implemented. Use select_kernel() "
108  "method!\n", get_name());
109  return SGVector<float64_t>();
110 }
111 
113 {
114  /* number of data for distace */
116 
117  SGMatrix<float64_t> dists;
118 
119  /* compute all pairwise distances, depends which mmd statistic is used */
121  {
122  /* fixed data, create merged copy of a random subset */
123 
124  /* create vector with that correspond to the num_data first points of
125  * each distribution, remember data is stored jointly */
126  SGVector<index_t> subset(num_data*2);
127  index_t m=m_mmd->get_m();
128  for (index_t i=0; i<num_data; ++i)
129  {
130  /* num_data samples from each half of joint sample */
131  subset[i]=i;
132  subset[i+num_data]=i+m;
133  }
134 
135  /* add subset and compute pairwise distances */
137  CFeatures* features=quad_mmd->get_p_and_q();
138  features->add_subset(subset);
139 
140  /* cast is safe, see constructor */
141  CDenseFeatures<float64_t>* dense_features=
142  (CDenseFeatures<float64_t>*) features;
143 
144  CEuclideanDistance* distance=new CEuclideanDistance(dense_features,
145  dense_features);
146  dists=distance->get_distance_matrix();
147  features->remove_subset();
148  SG_UNREF(distance);
149  SG_UNREF(features);
150  }
152  {
153  /* just stream the desired number of points */
154  CLinearTimeMMD* linear_mmd=(CLinearTimeMMD*)m_mmd;
155 
156  CStreamingFeatures* p=linear_mmd->get_streaming_p();
157  CStreamingFeatures* q=linear_mmd->get_streaming_q();
158 
159  /* cast is safe, see constructor */
161  p->get_streamed_features(num_data);
163  q->get_streamed_features(num_data);
164 
165  /* for safety */
166  SG_REF(p_streamed);
167  SG_REF(q_streamed);
168 
169  /* create merged feature object */
171  p_streamed->create_merged_copy(q_streamed);
172 
173  /* compute pairwise distances */
174  CEuclideanDistance* distance=new CEuclideanDistance(merged, merged);
175  dists=distance->get_distance_matrix();
176 
177  /* clean up */
178  SG_UNREF(distance);
179  SG_UNREF(p_streamed);
180  SG_UNREF(q_streamed);
181  SG_UNREF(p);
182  SG_UNREF(q);
183  }
184 
185  /* create a vector where the zeros have been removed, use upper triangle
186  * only since distances are symmetric */
187  SGVector<float64_t> dist_vec(dists.num_rows*(dists.num_rows-1)/2);
188  index_t write_idx=0;
189  for (index_t i=0; i<dists.num_rows; ++i)
190  {
191  for (index_t j=i+1; j<dists.num_rows; ++j)
192  dist_vec[write_idx++]=dists(i,j);
193  }
194 
195  /* now we have distance matrix, compute median, allow to modify matrix */
196  float64_t median_distance=CStatistics::median(dist_vec, true);
197  SG_DEBUG("median_distance: %f\n", median_distance);
198 
199  /* shogun has no square and factor two in its kernel width, MATLAB does
200  * median_width = sqrt(0.5*median_distance), we do this */
201  float64_t shogun_sigma=median_distance;
202  SG_DEBUG("kernel width (shogun): %f\n", shogun_sigma);
203 
204  /* now of all kernels, find the one which has its width closest
205  * Cast is safe due to constructor of MMDKernelSelection class */
207  float64_t min_distance=CMath::MAX_REAL_NUMBER;
208  CKernel* min_kernel=NULL;
210  for (index_t i=0; i<combined->get_num_subkernels(); ++i)
211  {
212  CKernel* current=combined->get_kernel(i);
213  REQUIRE(current->get_kernel_type()==K_GAUSSIAN, "%s::select_kernel(): "
214  "%d-th kernel is not a Gaussian but \"%s\"!\n", get_name(), i,
215  current->get_name());
216 
217  /* check if width is closer to median width */
218  distance=CMath::abs(((CGaussianKernel*)current)->get_width()-
219  shogun_sigma);
220 
221  if (distance<min_distance)
222  {
223  min_distance=distance;
224  min_kernel=current;
225  }
226 
227  /* next kernel */
228  SG_UNREF(current);
229  }
230  SG_UNREF(combined);
231 
232  /* returned referenced kernel */
233  SG_REF(min_kernel);
234  return min_kernel;
235 }

SHOGUN Machine Learning Toolbox - Documentation