SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
StreamingMMD.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2012-2013 Heiko Strathmann
4  * Written (w) 2014 Soumyajit De
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice, this
11  * list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  * this list of conditions and the following disclaimer in the documentation
14  * and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  * The views and conclusions contained in the software and documentation are those
28  * of the authors and should not be interpreted as representing official policies,
29  * either expressed or implied, of the Shogun Development Team.
30  */
31 
36 #include <shogun/lib/List.h>
37 
38 using namespace shogun;
39 
41 {
42  init();
43 }
44 
46  CStreamingFeatures* q, index_t m, index_t blocksize) :
47  CKernelTwoSampleTest(kernel, NULL, m)
48 {
49  init();
50 
51  m_streaming_p=p;
53 
54  m_streaming_q=q;
56 
57  m_blocksize=blocksize;
58 }
59 
61 {
64 
65  /* m_kernel is SG_UNREFed in base destructor */
66 }
67 
/* Registers the members m_streaming_p, m_streaming_q, m_blocksize and
 * m_simulate_h0 via SG_ADD and then assigns their defaults: both streaming
 * feature pointers NULL, a blocksize of 10000 and simulate_h0 disabled.
 *
 * The description registered for m_streaming_q previously read
 * "Streaming features p" (copy-paste of the line above); it now names q.
 *
 * NOTE(review): the trailing argument line of each SG_ADD call is not
 * visible in this listing and is left exactly as found. */
68 void CStreamingMMD::init()
69 {
70  SG_ADD((CSGObject**)&m_streaming_p, "streaming_p", "Streaming features p",
72  SG_ADD((CSGObject**)&m_streaming_q, "streaming_q", "Streaming features q",
74  SG_ADD(&m_blocksize, "blocksize", "Number of elements processed at once",
76  SG_ADD(&m_simulate_h0, "simulate_h0", "Whether p and q are mixed",
78 
79  m_streaming_p=NULL;
80  m_streaming_q=NULL;
81  m_blocksize=10000;
82  m_simulate_h0=false;
83 }
84 
86 {
87  /* use wrapper method and compute for single kernel */
88  SGVector<float64_t> statistic;
89  SGVector<float64_t> variance;
90  compute_statistic_and_variance(statistic, variance, false);
91 
92  return statistic[0];
93 }
94 
96 {
97  /* make sure multiple_kernels flag is used only with a combined kernel */
98  REQUIRE(!multiple_kernels || m_kernel->get_kernel_type()==K_COMBINED,
99  "multiple kernels specified, but underlying kernel is not of type "
100  "K_COMBINED\n");
101 
102  SGVector<float64_t> statistic;
103  SGVector<float64_t> variance;
104  compute_statistic_and_variance(statistic, variance, multiple_kernels);
105 
106  return statistic;
107 }
108 
110 {
111  /* use wrapper method and compute for single kernel */
112  SGVector<float64_t> statistic;
113  SGVector<float64_t> variance;
114  compute_statistic_and_variance(statistic, variance, false);
115 
116  return variance[0];
117 }
118 
120 {
121  float64_t result=0;
122 
124  {
125  case MMD1_GAUSSIAN:
126  {
127  /* compute variance and use to estimate Gaussian distribution */
129  result=1.0-CStatistics::normal_cdf(statistic, std_dev);
130  }
131  break;
132 
133  default:
134  /* sampling null is handled here */
135  result=CKernelTwoSampleTest::compute_p_value(statistic);
136  break;
137  }
138 
139  return result;
140 }
141 
143 {
144  float64_t result=0;
145 
147  {
148  case MMD1_GAUSSIAN:
149  {
150  /* compute variance and use to estimate Gaussian distribution */
152  result=1.0-CStatistics::inverse_normal_cdf(1-alpha, 0, std_dev);
153  }
154  break;
155 
156  default:
157  /* sampling null is handled here */
159  break;
160  }
161 
162  return result;
163 }
164 
166 {
167  float64_t result=0;
168 
170  {
171  case MMD1_GAUSSIAN:
172  {
173  /* compute variance and use to estimate Gaussian distribution, use
174  * wrapper method and compute for single kernel */
175  SGVector<float64_t> statistic;
176  SGVector<float64_t> variance;
177  compute_statistic_and_variance(statistic, variance, false);
178 
179  /* estimate Gaussian distribution */
180  result=1.0-CStatistics::normal_cdf(statistic[0],
181  CMath::sqrt(variance[0]));
182  }
183  break;
184 
185  default:
186  /* sampling null can be done separately in superclass */
188  break;
189  }
190 
191  return result;
192 }
193 
195 {
197 
198  /* instead of permuting samples, just sample new data every time. */
201  SG_REF(p);
202  SG_REF(q);
203 
204  bool old=m_simulate_h0;
205  set_simulate_h0(true);
206  for (index_t i=0; i<m_num_null_samples; ++i)
207  {
208  /* compute statistic for this permutation of mixed samples */
209  samples[i]=compute_statistic();
210  }
211  set_simulate_h0(old);
212  m_streaming_p=p;
213  m_streaming_q=q;
214  SG_UNREF(p);
215  SG_UNREF(q);
216 
217  return samples;
218 }
219 
221  index_t num_this_run)
222 {
223  SG_DEBUG("entering!\n");
224 
225  /* the list of blocks of data to be returned; the delete_data flag is
226  * turned on, which SG_REFs the elements when appended or returned. */
227  CList* data=new CList(true);
228 
229  SG_DEBUG("streaming %d blocks from p of blocksize %d!\n", num_blocks,
230  num_this_run);
231 
232  /* stream num_blocks blocks of data from p */
233  for (index_t i=0; i<num_blocks; ++i)
234  {
236  data->append_element(block);
237  }
238 
239  SG_DEBUG("streaming %d blocks from q of blocksize %d!\n", num_blocks,
240  num_this_run);
241 
242  /* stream num_blocks blocks of data from q */
243  for (index_t i=0; i<num_blocks; ++i)
244  {
246  data->append_element(block);
247  }
248 
249  /* check whether h0 should be simulated and permute if so */
250  if (m_simulate_h0)
251  {
252  /* create merged copy of all feature instances to permute */
253  SG_DEBUG("merging and premuting features!\n");
254 
255  /* use the first element to merge rest of the data into */
256  CFeatures* first=(CFeatures*)data->get_first_element();
257 
258  /* this delete_element() call doesn't deallocate the first element; it
259  * just removes it from the list and does a SG_UNREF. It is not deleted
260  * because get_first_element() does a SG_REF before returning, so we need
261  * to take care of its destruction manually via SG_UNREF later on */
262  data->delete_element();
263 
264  CFeatures* merged=first->create_merged_copy(data);
265 
266  /* now we can get rid of unnecessary feature objects */
267  SG_UNREF(first);
268  data->delete_all_elements();
269 
270  /* permute */
271  SGVector<index_t> inds(merged->get_num_vectors());
272  inds.range_fill();
273  CMath::permute(inds);
274  merged->add_subset(inds);
275 
276  /* copy back */
277  SGVector<index_t> copy(num_this_run);
278  copy.range_fill();
279  for (index_t i=0; i<2*num_blocks; ++i)
280  {
281  CFeatures* current=merged->copy_subset(copy);
282  data->append_element(current);
283  /* SG_UNREF'ing since copy_subset does a SG_REF, this is
284  * safe since the object is already SG_REF'ed inside the list */
285  SG_UNREF(current);
286 
287  if (i<2*num_blocks-1)
288  copy.add(num_this_run);
289  }
290 
291  /* clean up */
292  SG_UNREF(merged);
293  }
294 
295  SG_REF(data);
296 
297  SG_DEBUG("leaving!\n");
298  return data;
299 }
300 
302 {
303  SG_ERROR("Method not implemented since linear time mmd is based on "
304  "streaming features\n");
305 }
306 
308 {
309  SG_ERROR("Method not implemented since linear time mmd is based on "
310  "streaming features\n");
311  return NULL;
312 }
313 
315 {
317  return m_streaming_p;
318 }
319 
321 {
323  return m_streaming_q;
324 }
325 
void range_fill(T start=0)
Definition: SGVector.cpp:173
static void permute(SGVector< T > v, CRandom *rand=NULL)
Definition: Math.h:1144
virtual float64_t compute_threshold(float64_t alpha)
virtual void compute_statistic_and_variance(SGVector< float64_t > &statistic, SGVector< float64_t > &variance, bool multiple_kernels=false)=0
virtual float64_t compute_threshold(float64_t alpha)
virtual float64_t compute_p_value(float64_t statistic)
virtual CStreamingFeatures * get_streaming_q()
virtual CStreamingFeatures * get_streaming_p()
int32_t index_t
Definition: common.h:62
static float64_t inverse_normal_cdf(float64_t y0)
virtual CFeatures * get_streamed_features(index_t num_elements)
virtual int32_t get_num_vectors() const =0
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
void set_simulate_h0(bool simulate_h0)
Definition: StreamingMMD.h:263
Kernel two sample test base class. Provides an interface for performing a two-sample test using a ker...
CSGObject * delete_element()
Definition: List.h:502
virtual SGVector< float64_t > sample_null()
#define SG_REF(x)
Definition: SGObject.h:51
virtual float64_t compute_p_value(float64_t statistic)
virtual float64_t compute_variance_estimate()
CStreamingFeatures * m_streaming_q
Definition: StreamingMMD.h:293
virtual CFeatures * create_merged_copy(CList *others)
Definition: Features.h:235
CSGObject * get_first_element()
Definition: List.h:151
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:112
double float64_t
Definition: common.h:50
virtual CFeatures * get_p_and_q()
#define SG_UNREF(x)
Definition: SGObject.h:52
#define SG_DEBUG(...)
Definition: SGIO.h:107
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
virtual EKernelType get_kernel_type()=0
virtual CFeatures * copy_subset(SGVector< index_t > indices)
Definition: Features.cpp:340
The class Features is the base class of all feature objects.
Definition: Features.h:68
bool append_element(CSGObject *data)
Definition: List.h:331
Streaming features are features which are used for online algorithms.
static float64_t normal_cdf(float64_t x, float64_t std_dev=1)
ENullApproximationMethod m_null_approximation_method
virtual float64_t perform_test()
The Kernel base class.
Definition: Kernel.h:158
CStreamingFeatures * m_streaming_p
Definition: StreamingMMD.h:290
#define SG_ADD(...)
Definition: SGObject.h:81
static float32_t sqrt(float32_t x)
Definition: Math.h:459
void delete_all_elements()
Definition: List.h:118
virtual void set_p_and_q(CFeatures *p_and_q)
CList * stream_data_blocks(index_t num_blocks, index_t num_this_run)
virtual float64_t perform_test()
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:310
Class List implements a doubly connected list for low-level-objects.
Definition: List.h:84
Block< Matrix > block(Matrix matrix, index_t row_begin, index_t col_begin, index_t row_size, index_t col_size)
Definition: Block.h:102
virtual float64_t compute_statistic()
void add(const SGVector< T > x)
Definition: SGVector.cpp:281

SHOGUN Machine Learning Toolbox - Documentation