SHOGUN  4.2.0
LinearTimeMMD.cpp
/*
 * Copyright (c) The Shogun Machine Learning Toolbox
 * Written (w) 2012-2013 Heiko Strathmann
 * Written (w) 2014 Soumyajit De
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are those
 * of the authors and should not be interpreted as representing official policies,
 * either expressed or implied, of the Shogun Development Team.
 */

#include <shogun/statistics/LinearTimeMMD.h>
#include <shogun/features/streaming/StreamingFeatures.h>
#include <shogun/kernel/CombinedKernel.h>

#include <shogun/lib/List.h>

#include <shogun/lib/external/libqp.h>

using namespace shogun;

CLinearTimeMMD::CLinearTimeMMD()
{
}

CLinearTimeMMD::CLinearTimeMMD(CKernel* kernel, CStreamingFeatures* p,
		CStreamingFeatures* q, index_t m, index_t blocksize)
	: CStreamingMMD(kernel, p, q, m, blocksize)
{
}

CLinearTimeMMD::~CLinearTimeMMD()
{
}

void CLinearTimeMMD::compute_squared_mmd(CKernel* kernel, CList* data,
		SGVector<float64_t>& current, SGVector<float64_t>& pp,
		SGVector<float64_t>& qq, SGVector<float64_t>& pq,
		SGVector<float64_t>& qp, index_t num_this_run)
{
	SG_DEBUG("entering!\n");

	REQUIRE(data->get_num_elements()==4, "Wrong number of blocks!\n");

	/* cast is safe, the list is passed inside the class;
	 * features will be SG_REF'ed once again by these get methods */
	CFeatures* p1=(CFeatures*)data->get_first_element();
	CFeatures* p2=(CFeatures*)data->get_next_element();
	CFeatures* q1=(CFeatures*)data->get_next_element();
	CFeatures* q2=(CFeatures*)data->get_next_element();

	SG_DEBUG("computing MMD values for current kernel!\n");

	/* compute kernel matrix diagonals */
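	/* only the diagonal entries k(x_i, y_i) are needed: the linear time
	 * estimator pairs up streamed samples instead of evaluating a full
	 * kernel matrix, which is what keeps the cost linear in the number of
	 * streamed examples */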
	kernel->init(p1, p2);
	kernel->get_kernel_diagonal(pp);

	kernel->init(q1, q2);
	kernel->get_kernel_diagonal(qq);

	kernel->init(p1, q2);
	kernel->get_kernel_diagonal(pq);

	kernel->init(q1, p2);
	kernel->get_kernel_diagonal(qp);

	/* cleanup */
	SG_UNREF(p1);
	SG_UNREF(p2);
	SG_UNREF(q1);
	SG_UNREF(q2);

	/* compute current h terms for current kernel */

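	/* each term is the linear time MMD h-statistic
	 * h_i = k(p1_i, p2_i) + k(q1_i, q2_i) - k(p1_i, q2_i) - k(q1_i, p2_i),
	 * an unbiased estimate of MMD^2 computed from two independent samples
	 * of each distribution */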
	for (index_t i=0; i<num_this_run; ++i)
		current[i]=pp[i]+qq[i]-pq[i]-qp[i];

	SG_DEBUG("leaving!\n");
}

SGVector<float64_t> CLinearTimeMMD::compute_squared_mmd(CKernel* kernel,
		CList* data, index_t num_this_run)
{
	/* wrapper method used for convenience for using preallocated memory */
	SGVector<float64_t> current(num_this_run);
	SGVector<float64_t> pp(num_this_run);
	SGVector<float64_t> qq(num_this_run);
	SGVector<float64_t> pq(num_this_run);
	SGVector<float64_t> qp(num_this_run);
	compute_squared_mmd(kernel, data, current, pp, qq, pq, qp, num_this_run);
	return current;
}

void CLinearTimeMMD::compute_statistic_and_variance(
		SGVector<float64_t>& statistic, SGVector<float64_t>& variance,
		bool multiple_kernels)
{
	SG_DEBUG("entering!\n")

	REQUIRE(m_streaming_p, "streaming features p required!\n");
	REQUIRE(m_streaming_q, "streaming features q required!\n");

	REQUIRE(m_kernel, "kernel needed!\n");

	/* make sure multiple_kernels flag is used only with a combined kernel */
	REQUIRE(!multiple_kernels || m_kernel->get_kernel_type()==K_COMBINED,
			"multiple kernels specified, but underlying kernel is not of type "
			"K_COMBINED\n");

	/* m is the number of samples from each distribution, m_2 is half of it,
	 * using names from the JMLR paper (see class documentation) */
	index_t m_2=m_m/2;
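	/* the quantity accumulated below is the linear time estimate
	 * MMD_l^2 = (1/m_2) * sum_{i=1}^{m_2} h_i,
	 * where each h_i uses two fresh samples from p and two from q, so one
	 * pass over m samples per distribution suffices */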

	SG_DEBUG("m_m=%d\n", m_m)

	/* find out whether single or multiple kernels (cast is safe, check above) */
	index_t num_kernels=1;
	if (multiple_kernels)
	{
		num_kernels=((CCombinedKernel*)m_kernel)->get_num_subkernels();
		SG_DEBUG("computing MMD and variance for %d sub-kernels\n",
				num_kernels);
	}

	/* allocate memory for results if vectors are empty */
	if (!statistic.vector)
		statistic=SGVector<float64_t>(num_kernels);

	if (!variance.vector)
		variance=SGVector<float64_t>(num_kernels);

	/* ensure right dimensions */
	REQUIRE(statistic.vlen==num_kernels,
			"statistic vector size (%d) does not match number of kernels (%d)\n",
			statistic.vlen, num_kernels);

	REQUIRE(variance.vlen==num_kernels,
			"variance vector size (%d) does not match number of kernels (%d)\n",
			variance.vlen, num_kernels);

	/* temp variable in the algorithm */
	float64_t delta;

	/* initialise statistic and variance since they are cumulative */
	statistic.zero();
	variance.zero();

	/* needed for online mean and variance */
	SGVector<index_t> term_counters(num_kernels);
	term_counters.set_const(1);

	/* term counter to compute online mean and variance */
	index_t num_examples_processed=0;
	while (num_examples_processed<m_2)
	{
		/* number of examples to look at in this iteration */
		index_t num_this_run=CMath::min(m_blocksize,
				CMath::max(0, m_2-num_examples_processed));
		SG_DEBUG("processing %d more examples. %d so far processed. Blocksize "
				"is %d\n", num_this_run, num_examples_processed, m_blocksize);

		/* stream 2 data blocks from each distribution */
		CList* data=stream_data_blocks(2, num_this_run);

		/* if multiple kernels are used, compute all of them on streamed data;
		 * if the multiple kernels flag is false, the kernel loop below is
		 * executed only once */
		CKernel* kernel=m_kernel;
		if (multiple_kernels)
			SG_DEBUG("using multiple kernels\n");

		/* iterate through all kernels for this data */

		for (index_t i=0; i<num_kernels; ++i)
		{
			/* if multiple kernels should be computed, set next kernel */
			if (multiple_kernels)
				kernel=((CCombinedKernel*)m_kernel)->get_kernel(i);

			/* compute linear time MMD values */
			SGVector<float64_t> current=compute_squared_mmd(kernel, data,
					num_this_run);

			/* single variances for all kernels. Update mean and variance
			 * using Knuth's online variance algorithm.
			 * C.f. for example Wikipedia */
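			/* with n = term_counters[i] and x = current[j], the updates are
			 *   delta  = x - mean
			 *   mean  += delta / n
			 *   M2    += delta * (x - updated mean)
			 * statistic[i] holds the running mean of the h-terms and
			 * variance[i] the running sum of squared deviations M2 */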
			for (index_t j=0; j<num_this_run; ++j)
			{
				/* D. Knuth's online variance algorithm for current kernel */
				delta=current[j]-statistic[i];
				statistic[i]+=delta/term_counters[i]++;
				variance[i]+=delta*(current[j]-statistic[i]);

				SG_DEBUG("burst: current=%f, delta=%f, statistic=%f, "
						"variance=%f, kernel_idx=%d\n", current[j], delta,
						statistic[i], variance[i], i);
			}

			if (multiple_kernels)
				SG_UNREF(kernel);
		}

		/* clean up streamed data, this frees the feature objects */
		SG_UNREF(data);

		/* add number of processed examples for this run */
		num_examples_processed+=num_this_run;
	}
	SG_DEBUG("Done computing statistic, processed 2*%d examples.\n",
			num_examples_processed);

	/* the mean of all h-terms is the linear time MMD; display entries for all
	 * kernels in debug mode */
	if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
		statistic.display_vector("statistics");

	/* variance of terms can be computed using the mean (statistic).
	 * Note that the variance needs to be divided by m_2 in order to get the
	 * variance of the null-distribution */
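	/* variance[i] currently holds the running sum of squared deviations M2;
	 * M2/(m_2-1) is the sample variance of the h-terms, and one more division
	 * by m_2 yields the variance of their mean, i.e. of the statistic */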
	for (index_t i=0; i<num_kernels; ++i)
		variance[i]=variance[i]/(m_2-1)/m_2;

	if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
		variance.display_vector("variances");

	SG_DEBUG("leaving!\n")
}

void CLinearTimeMMD::compute_statistic_and_Q(
		SGVector<float64_t>& statistic, SGMatrix<float64_t>& Q)
{
	SG_DEBUG("entering!\n")

	REQUIRE(m_streaming_p, "streaming features p required!\n");
	REQUIRE(m_streaming_q, "streaming features q required!\n");

	REQUIRE(m_kernel, "kernel needed!\n");

	/* this method requires a combined kernel */
	REQUIRE(m_kernel->get_kernel_type()==K_COMBINED,
			"underlying kernel is not of type K_COMBINED\n");

	/* cast combined kernel */
	CCombinedKernel* combined=(CCombinedKernel*)m_kernel;

	/* m is the number of samples from each distribution, m_4 is a quarter of it */
	REQUIRE(m_m>=4, "Need at least m>=4\n");
	index_t m_4=m_m/4;

	SG_DEBUG("m_m=%d\n", m_m)

	/* find out whether single or multiple kernels (cast is safe, check above) */
	index_t num_kernels=combined->get_num_subkernels();
	REQUIRE(num_kernels>0, "At least one kernel is needed\n");

	/* allocate memory for results if vectors are empty */
	if (!statistic.vector)
		statistic=SGVector<float64_t>(num_kernels);

	if (!Q.matrix)
		Q=SGMatrix<float64_t>(num_kernels, num_kernels);

	/* ensure right dimensions */
	REQUIRE(statistic.vlen==num_kernels,
			"statistic vector size (%d) does not match number of kernels (%d)\n",
			statistic.vlen, num_kernels);

	REQUIRE(Q.num_rows==num_kernels,
			"Q number of rows (%d) does not match number of kernels (%d)\n",
			Q.num_rows, num_kernels);

	REQUIRE(Q.num_cols==num_kernels,
			"Q number of columns (%d) does not match number of kernels (%d)\n",
			Q.num_cols, num_kernels);

	/* initialise statistic and Q since they are cumulative */
	statistic.zero();
	Q.zero();

	/* produce two kernel lists to iterate doubly nested */
	CList* list_i=new CList();
	CList* list_j=new CList();

	for (index_t k_idx=0; k_idx<combined->get_num_kernels(); k_idx++)
	{
		CKernel* kernel = combined->get_kernel(k_idx);
		list_i->append_element(kernel);
		list_j->append_element(kernel);
		SG_UNREF(kernel);
	}

	/* needed for online mean and variance */
	SGVector<index_t> term_counters_statistic(num_kernels);
	SGMatrix<index_t> term_counters_Q(num_kernels, num_kernels);
	term_counters_statistic.set_const(1);
	term_counters_Q.set_const(1);

	index_t num_examples_processed=0;
	while (num_examples_processed<m_4)
	{
		/* number of examples to look at in this iteration */
		index_t num_this_run=CMath::min(m_blocksize,
				CMath::max(0, m_4-num_examples_processed));
		SG_DEBUG("processing %d more examples. %d so far processed. Blocksize "
				"is %d\n", num_this_run, num_examples_processed, m_blocksize);

		/* stream 4 data blocks from each distribution */
		CList* data=stream_data_blocks(4, num_this_run);

		/* create two sets of data, a and b, from alternating blocks */
		CList* data_a=new CList(true);
		CList* data_b=new CList(true);

		/* take care of refcounts */
		int32_t num_elements=data->get_num_elements();
		CFeatures* current=(CFeatures*)data->get_first_element();
		data_a->append_element(current);
		SG_UNREF(current);
		current=(CFeatures*)data->get_next_element();
		data_b->append_element(current);
		SG_UNREF(current);
		num_elements-=2;
		/* loop counter is safe since num_elements can only be even */
		while (num_elements)
		{
			current=(CFeatures*)data->get_next_element();
			data_a->append_element(current);
			SG_UNREF(current);
			current=(CFeatures*)data->get_next_element();
			data_b->append_element(current);
			SG_UNREF(current);
			num_elements-=2;
		}
		/* safely unref previous list of data, decreases refcounts of features
		 * but doesn't delete them */
		SG_UNREF(data);

		/* now for each of these streamed data instances, iterate through all
		 * kernels and update the Q matrix while also computing the MMD statistic */

		/* preallocate some memory for faster processing */
		SGVector<float64_t> pp(num_this_run);
		SGVector<float64_t> qq(num_this_run);
		SGVector<float64_t> pq(num_this_run);
		SGVector<float64_t> qp(num_this_run);
		SGVector<float64_t> h_i_a(num_this_run);
		SGVector<float64_t> h_i_b(num_this_run);
		SGVector<float64_t> h_j_a(num_this_run);
		SGVector<float64_t> h_j_b(num_this_run);

		/* iterate through Q matrix and update values, compute mmd */
		CKernel* kernel_i=(CKernel*)list_i->get_first_element();
		for (index_t i=0; i<num_kernels; ++i)
		{
			/* compute all necessary 8 h-vectors for this burst.
			 * h_delta-terms for each kernel, expression 7 of NIPS paper */
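			/* with h_delta = h_a - h_b, the covariance entry Q(i, j) is
			 * estimated as the running mean over the burst of
			 * (h_i_a - h_i_b) * (h_j_a - h_j_b),
			 * i.e. the product of the h_delta terms of kernels i and j
			 * computed on two disjoint sets of streamed data (parts a and b) */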

			/* first kernel, a-part */
			compute_squared_mmd(kernel_i, data_a, h_i_a, pp, qq, pq, qp,
					num_this_run);

			/* first kernel, b-part */
			compute_squared_mmd(kernel_i, data_b, h_i_b, pp, qq, pq, qp,
					num_this_run);

			/* iterate through j, but use symmetry in order to save half of the
			 * computations */
			CKernel* kernel_j=(CKernel*)list_j->get_first_element();
			for (index_t j=0; j<=i; ++j)
			{
				/* compute all necessary 8 h-vectors for this burst.
				 * h_delta-terms for each kernel, expression 7 of NIPS paper */

				/* second kernel, a-part */
				compute_squared_mmd(kernel_j, data_a, h_j_a, pp, qq, pq, qp,
						num_this_run);

				/* second kernel, b-part */
				compute_squared_mmd(kernel_j, data_b, h_j_b, pp, qq, pq, qp,
						num_this_run);

				float64_t term;
				for (index_t it=0; it<num_this_run; ++it)
				{
					/* current term of expression 7 of NIPS paper */
					term=(h_i_a[it]-h_i_b[it])*(h_j_a[it]-h_j_b[it]);

					/* update covariance element for the current burst. This is a
					 * running average of the product of the h_delta terms of each
					 * kernel */
					Q(i, j)+=(term-Q(i, j))/term_counters_Q(i, j)++;
				}

				/* use symmetry */
				Q(j, i)=Q(i, j);

				/* next kernel j */
				kernel_j=(CKernel*)list_j->get_next_element();
			}

			/* update the online MMD statistic computation for kernel i, using
			 * the vectors that were computed above */
			SGVector<float64_t> h(num_this_run*2);
			for (index_t it=0; it<num_this_run; ++it)
			{
				/* update statistic for kernel i (outer loop) and update using
				 * all elements of the h_i_a, h_i_b vectors (iterate over it) */
				statistic[i]=statistic[i]+
						(h_i_a[it]-statistic[i])/term_counters_statistic[i]++;

				/* make sure to use all data, i.e. parts a and b */
				statistic[i]=statistic[i]+
						(h_i_b[it]-statistic[i])/(term_counters_statistic[i]++);
			}

			/* next kernel i */
			kernel_i=(CKernel*)list_i->get_next_element();
		}

		/* clean up streamed data */
		SG_UNREF(data_a);
		SG_UNREF(data_b);

		/* add number of processed examples for this run */
		num_examples_processed+=num_this_run;
	}

	/* clean up */
	SG_UNREF(list_i);
	SG_UNREF(list_j);

	SG_DEBUG("Done computing statistic, processed 4*%d examples.\n",
			num_examples_processed);

	SG_DEBUG("leaving!\n")
}