SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
RBM.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014, Shogun Toolbox Foundation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7 
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18 
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  *
31  * Written (W) 2014 Khaled Nasr
32  */
33 
34 #include <shogun/neuralnets/RBM.h>
35 
36 #ifdef HAVE_EIGEN3
37 
38 #include <shogun/base/Parameter.h>
41 
42 using namespace shogun;
43 
// Default constructor: delegates all member initialization to init().
// NOTE(review): the constructor's signature line (listing line 44) was dropped
// by the Doxygen scrape; only the body survives here.
45 {
46  init();
47 }
48 
// Construct an RBM with the given number of hidden units. Visible groups
// are added separately via add_visible_group(). init() runs first so that
// the subsequent assignment overrides the default m_num_hidden of 0.
49 CRBM::CRBM(int32_t num_hidden)
50 {
51  init();
52  m_num_hidden = num_hidden;
53 }
54 
// Construct an RBM with num_hidden hidden units and a single visible group
// of num_visible units of the given unit type.
55 CRBM::CRBM(int32_t num_hidden, int32_t num_visible,
56  ERBMVisibleUnitType visible_unit_type) : CSGObject()
57 {
58  init();
59  m_num_hidden = num_hidden;
60  add_visible_group(num_visible, visible_unit_type);
61 }
62 
// Destructor body fragment.
// NOTE(review): the destructor's signature (listing line 63) and its body
// statements (listing lines 65-67, presumably SG_UNREF calls on the visible
// group arrays) are missing from this listing — confirm against the repo.
64 {
68 }
69 
// Append a new group of visible units of the given type, growing
// m_num_visible by num_units.
// NOTE(review): listing lines 72, 75-78 and 81-84 are missing (the scrape
// dropped them); they presumably increment m_num_visible_groups, record the
// group size/type, and compute the state offset for the new group (0 for the
// first group, previous offset + previous size otherwise) — verify in repo.
70 void CRBM::add_visible_group(int32_t num_units, ERBMVisibleUnitType unit_type)
71 {
73  m_num_visible += num_units;
74 
77 
79 
80  if (n==0)
82  else
85 }
86 
// Randomly initialize all RBM parameters from a zero-mean Gaussian with
// standard deviation sigma.
// NOTE(review): the signature (listing line 87) and lines 89-90 (presumably
// allocation of m_params via m_num_params) are missing from this listing.
88 {
91 
92  for (int32_t i=0; i<m_num_params; i++)
93  m_params[i] = CMath::normal_random(0.0,sigma);
94 }
95 
// Resize the Gibbs-chain state for a new batch size. No-op if the size is
// unchanged; otherwise the chain is reset.
// NOTE(review): listing lines 102-103 are missing — presumably the
// reallocation of hidden_state/visible_state matrices for the new batch size.
96 void CRBM::set_batch_size(int32_t batch_size)
97 {
98  if (m_batch_size == batch_size) return;
99 
100  m_batch_size = batch_size;
101 
104 
105  reset_chain();
106 }
107 
// Train the RBM with (persistent) contrastive divergence using mini-batch
// gradient descent with momentum, learning-rate decay, and optional
// monitoring of reconstruction error / pseudo-log-likelihood.
// NOTE(review): the signature (listing line 108, presumably
// train(CDenseFeatures<float64_t>* features)) and lines 119, 125 (the
// `gradients` vector declaration), 134-137, 167 and 170 (the monitoring-method
// branch conditions) are missing from this listing — confirm against the repo.
109 {
110  REQUIRE(features != NULL, "Invalid (NULL) feature pointer\n");
111  REQUIRE(features->get_num_features()==m_num_visible,
112  "Number of features (%i) must match the RBM's number of visible units "
113  "(%i)\n", features->get_num_features(), m_num_visible);
114 
115  SGMatrix<float64_t> inputs = features->get_feature_matrix();
116 
117  int32_t training_set_size = inputs.num_cols;
118  if (gd_mini_batch_size==0) gd_mini_batch_size = training_set_size;
120 
// Seed the persistent chain's visible state with the first batch of inputs.
121  for (int32_t i=0; i<m_num_visible; i++)
122  for (int32_t j=0; j<m_batch_size; j++)
123  visible_state(i,j) = inputs(i,j);
124 
126 
127  // needed for momentum
128  SGVector<float64_t> param_updates(m_num_params);
129  param_updates.zero();
130 
131  float64_t alpha = gd_learning_rate;
132 
133  SGMatrix<float64_t> buffer;
138 
139  int32_t counter = 0;
140  for (int32_t i=0; i<max_num_epochs; i++)
141  {
142  for (int32_t j=0; j < training_set_size; j += gd_mini_batch_size)
143  {
143  // exponential learning-rate decay, applied once per mini-batch
144  alpha = gd_learning_rate_decay*alpha;
145 
// Clamp the final batch so it never reads past the end of the training set.
146  if (j+gd_mini_batch_size>training_set_size)
147  j = training_set_size-gd_mini_batch_size;
148 
// Zero-copy view (last arg false = no ref-counting) into the input matrix.
149  SGMatrix<float64_t> inputs_batch(inputs.matrix+j*inputs.num_rows,
150  inputs.num_rows, gd_mini_batch_size, false);
151 
// Nesterov-style lookahead: apply the momentum step before computing gradients.
152  for (int32_t k=0; k<m_num_params; k++)
153  m_params[k] += gd_momentum*param_updates[k];
154 
155  contrastive_divergence(inputs_batch, gradients);
156 
157  for (int32_t k=0; k<m_num_params; k++)
158  {
159  param_updates[k] = gd_momentum*param_updates[k]
160  -alpha*gradients[k];
161 
162  m_params[k] -= alpha*gradients[k];
163  }
164 
165  if (counter%monitoring_interval == 0)
166  {
168  SG_INFO("Epoch %i: reconstruction Error = %f\n",i,
169  reconstruction_error(inputs_batch, buffer));
171  SG_INFO("Epoch %i: Pseudo-log-likelihood = %f\n",i,
172  pseudo_likelihood(inputs_batch,buffer));
173  }
174  counter ++;
175  }
176  }
177 }
178 
// Run num_gibbs_steps of alternating Gibbs sampling on the RBM's chain state
// for the given batch size.
// NOTE(review): listing lines 186-188 and 190 (the mean_hidden/sample_hidden/
// mean_visible and sample_visible calls inside the loop) are missing from
// this listing — confirm against the repo.
179 void CRBM::sample(int32_t num_gibbs_steps,
180  int32_t batch_size)
181 {
182  set_batch_size(batch_size);
183 
184  for (int32_t i=0; i<num_gibbs_steps; i++)
185  {
189  if (i<num_gibbs_steps-1)
191  }
192 }
193 
// Gibbs-sample the chain, then copy visible group V's state into a freshly
// allocated feature object (caller takes ownership of the returned features).
// NOTE(review): the first signature line (listing line 194), the REQUIRE
// opening (line 197), and line 202 (the `result` matrix allocation) are
// missing from this listing.
195  int32_t num_gibbs_steps, int32_t batch_size)
196 {
198  "Visible group index (%i) out of bounds (%i)\n", V, m_num_visible);
199 
200  sample(num_gibbs_steps, batch_size);
201 
203 
// Copy out only the rows belonging to group V, using its state offset.
204  int32_t offset = m_visible_state_offsets->element(V);
205  for (int32_t i=0; i<m_visible_group_sizes->element(V); i++)
206  for (int32_t j=0; j<m_batch_size; j++)
207  result(i,j) = visible_state(i+offset,j);
208 
209  return new CDenseFeatures<float64_t>(result);
210 }
211 
// Gibbs sampling with group E clamped to the given evidence: the evidence is
// written into the chain's visible state before sampling and re-written after
// each Gibbs step so it never drifts.
// NOTE(review): the first signature line (listing line 212), the REQUIRE
// opening (line 215), lines 230-232 (the hidden/visible mean and hidden
// sampling calls), and line 237 (the sample_visible(k,...) call under the
// k!=E guard) are missing from this listing.
213  int32_t E, CDenseFeatures< float64_t >* evidence, int32_t num_gibbs_steps)
214 {
216  "Visible group index (%i) out of bounds (%i)\n", E, m_num_visible);
217 
218  set_batch_size(evidence->get_num_vectors());
219 
220  SGMatrix<float64_t> evidence_matrix = evidence->get_feature_matrix();
221 
222  int32_t offset = m_visible_state_offsets->element(E);
223 
// Clamp group E's visible units to the evidence before the chain starts.
224  for (int32_t i=0; i<m_visible_group_sizes->element(E); i++)
225  for (int32_t j=0; j<m_batch_size; j++)
226  visible_state(i+offset,j) = evidence_matrix(i,j);
227 
228  for (int32_t n=0; n<num_gibbs_steps; n++)
229  {
233  if (n<num_gibbs_steps-1)
234  {
// Resample every visible group except the clamped evidence group E.
235  for (int32_t k=0; k<m_num_visible_groups; k++)
236  if (k!=E)
238  }
239 
// Re-clamp the evidence after each step so sampling cannot overwrite it.
240  for (int32_t i=0; i<m_visible_group_sizes->element(E); i++)
241  for (int32_t j=0; j<m_batch_size; j++)
242  visible_state(i+offset,j) = evidence_matrix(i,j);
243  }
244 }
245 
// Sample with group E clamped to evidence, then return visible group V's
// resulting state as a new feature object (caller takes ownership).
// NOTE(review): the first signature line (listing line 246), the two REQUIRE
// openings (lines 249 and 251), and line 256 (the `result` allocation) are
// missing from this listing.
247  int32_t E, CDenseFeatures< float64_t >* evidence, int32_t num_gibbs_steps)
248 {
250  "Visible group index (%i) out of bounds (%i)\n", V, m_num_visible);
252  "Visible group index (%i) out of bounds (%i)\n", E, m_num_visible);
253 
254  sample_with_evidence(E, evidence, num_gibbs_steps);
255 
257 
258  int32_t offset = m_visible_state_offsets->element(V);
259  for (int32_t i=0; i<m_visible_group_sizes->element(V); i++)
260  for (int32_t j=0; j<m_batch_size; j++)
261  result(i,j) = visible_state(i+offset,j);
262 
263  return new CDenseFeatures<float64_t>(result);
264 }
265 
// Reset the Gibbs chain: set every visible unit to a uniform random binary
// state (0 or 1 with probability 0.5 each).
// NOTE(review): the signature line (listing line 266) is missing from this
// listing.
267 {
268  for (int32_t i=0; i<m_num_visible; i++)
269  for (int32_t j=0; j<m_batch_size; j++)
270  visible_state(i,j) = CMath::random(0.0,1.0) > 0.5;
271 }
272 
// Average free energy of a batch of visible-state columns:
//   F(v) = -(b'v + sum_i log(1+exp(c_i + W_i v)))/batch
// plus a 0.5*v^2 term per unit for Gaussian visible groups.
// NOTE(review): the signature line (listing line 273), line 278 (the `buffer`
// allocation when empty), and line 306 (the Gaussian-unit-type condition
// guarding the quadratic term) are missing from this listing.
274 {
275  set_batch_size(visible.num_cols);
276 
277  if (buffer.num_rows==0)
279 
280  typedef Eigen::Map<Eigen::MatrixXd> EMatrix;
281  typedef Eigen::Map<Eigen::VectorXd> EVector;
282 
// Zero-copy Eigen views over the Shogun matrices/vectors.
283  EMatrix V(visible.matrix, visible.num_rows, visible.num_cols);
284  EMatrix W(get_weights().matrix, m_num_hidden, m_num_visible);
285  EVector B(get_visible_bias().vector, m_num_visible);
286  EVector C(get_hidden_bias().vector, m_num_hidden);
287 
// Both views alias the same scratch buffer; bv_buffer is fully consumed
// (summed into bv_term) before wv_buffer overwrites the storage.
288  EVector bv_buffer(buffer.matrix, m_batch_size);
289  EMatrix wv_buffer(buffer.matrix, m_num_hidden, m_batch_size);
290 
291  bv_buffer = B.transpose()*V;
292  float64_t bv_term = bv_buffer.sum();
293 
294  wv_buffer.colwise() = C;
295  wv_buffer += W*V;
296 
// Softplus term: sum over hidden units of log(1+exp(c + Wv)).
297  float64_t wv_term = 0;
298  for (int32_t i=0; i<m_num_hidden; i++)
299  for (int32_t j=0; j<m_batch_size; j++)
300  wv_term += CMath::log(1.0+CMath::exp(wv_buffer(i,j)));
301 
302  float64_t F = -1.0*(bv_term+wv_term)/m_batch_size;
303 
304  for (int32_t k=0; k<m_num_visible_groups; k++)
305  {
307  {
308  int32_t offset = m_visible_state_offsets->element(k);
309 
310  for (int32_t i=0; i<m_visible_group_sizes->element(k); i++)
311  for (int32_t j=0; j<m_batch_size; j++)
312  F += 0.5*CMath::pow(visible(i+offset,j),2)/m_batch_size;
313  }
314  }
315 
316  return F;
317 }
318 
// Gradients of the free energy w.r.t. weights and biases for a batch.
// positive_phase=true overwrites the gradient views with the negated data
// statistics; positive_phase=false accumulates the model (sample) statistics,
// completing the CD gradient.
// NOTE(review): the first signature line (listing line 319, the function name
// and `visible` parameter) is missing from this listing.
320  SGVector< float64_t > gradients,
321  bool positive_phase,
322  SGMatrix< float64_t > hidden_mean_given_visible)
323 {
324  set_batch_size(visible.num_cols);
325 
// Compute P(h|v) lazily if the caller did not supply it.
326  if (hidden_mean_given_visible.num_rows==0)
327  {
328  hidden_mean_given_visible = SGMatrix<float64_t>(m_num_hidden,m_batch_size);
329  mean_hidden(visible, hidden_mean_given_visible);
330  }
331 
332  typedef Eigen::Map<Eigen::MatrixXd> EMatrix;
333  typedef Eigen::Map<Eigen::VectorXd> EVector;
334 
335  EMatrix V(visible.matrix, visible.num_rows, visible.num_cols);
336  EMatrix PH(hidden_mean_given_visible.matrix, m_num_hidden,m_batch_size);
337 
// Views into the relevant slices of the flat gradients vector.
338  EMatrix WG(get_weights(gradients).matrix, m_num_hidden, m_num_visible);
339  EVector BG(get_visible_bias(gradients).vector, m_num_visible);
340  EVector CG(get_hidden_bias(gradients).vector, m_num_hidden);
341 
342  if (positive_phase)
343  {
344  WG = -1*PH*V.transpose()/m_batch_size;
345  BG = -1*V.rowwise().sum()/m_batch_size;
346  CG = -1*PH.rowwise().sum()/m_batch_size;
347  }
348  else
349  {
350  WG += PH*V.transpose()/m_batch_size;
351  BG += V.rowwise().sum()/m_batch_size;
352  CG += PH.rowwise().sum()/m_batch_size;
353  }
354 }
355 
// One (persistent) contrastive-divergence gradient evaluation for a batch:
// positive phase on the data, cd_num_steps of Gibbs sampling, negative phase
// on the resulting samples, plus optional L2/L1 regularization on weights.
// NOTE(review): the first signature line (listing line 356), lines 369-371 and
// 373 (the Gibbs sampling calls inside the loop), and lines 377-378 (the
// negative-phase free_energy_gradients call) are missing from this listing.
357  SGVector< float64_t > gradients)
358 {
359  set_batch_size(visible_batch.num_cols);
360 
361  // positive phase
362  mean_hidden(visible_batch, hidden_state);
363  free_energy_gradients(visible_batch, gradients, true, hidden_state);
364 
365  // sampling
366  for (int32_t i=0; i<cd_num_steps; i++)
367  {
// For PCD, or after the first step, resample from the chain rather than
// starting at the data.
368  if (i>0 || cd_persistent)
372  if (cd_sample_visible)
374  }
375 
376  // negative phase
379 
380  // regularization
// Regularization is applied only to the weight block of the parameter
// vector; biases (first m_num_visible+m_num_hidden entries) are skipped.
381  if (l2_coefficient>0)
382  {
383  int32_t len = m_num_hidden*m_num_visible;
384  for (int32_t i=0; i<len; i++)
385  gradients[i+m_num_visible+m_num_hidden] +=
386  l2_coefficient * m_params[i+m_num_visible+m_num_hidden];
387  }
388 
// NOTE(review): the L1 branch multiplies by the raw weight, not its sign —
// in the repo this line applies CMath::sign(); confirm before relying on it.
389  if (l1_coefficient>0)
390  {
391  int32_t len = m_num_hidden*m_num_visible;
392  for (int32_t i=0; i<len; i++)
393  gradients[i+m_num_visible+m_num_hidden] +=
394  l1_coefficient * m_params[i+m_num_visible+m_num_hidden];
395  }
396 
397 }
398 
// Mean squared reconstruction error over the batch: project visible -> hidden
// means -> visible means and compare with the original visible input.
// NOTE(review): the first signature line (listing line 399), line 405 (the
// `buffer` allocation when empty), and line 408 are missing from this listing.
400  SGMatrix< float64_t > buffer)
401 {
402  set_batch_size(visible.num_cols);
403 
404  if (buffer.num_rows==0)
406 
407  mean_hidden(visible, hidden_state);
409  mean_visible(hidden_state, buffer);
410 
411  float64_t error = 0;
412 
// Flat iteration over the whole batch; per-sample average at the end.
413  int32_t len = m_num_visible*m_batch_size;
414  for (int32_t i=0; i<len; i++)
415  error += CMath::pow(buffer[i]-visible[i],2);
416 
417  return error/m_batch_size;
418 }
419 
420 
// Stochastic pseudo-log-likelihood estimate for binary visible units: flip
// one randomly chosen visible unit per sample, compare free energies, and
// scale by the number of visible units.
// NOTE(review): the first signature line (listing line 421), line 425 (the
// non-binary unit-type check condition), line 431 (the `buffer` allocation),
// and line 433 (the `indices` vector declaration) are missing from this
// listing.
422  SGMatrix< float64_t > buffer)
423 {
424  for (int32_t k=0; k<m_num_visible_groups; k++)
426  SG_ERROR("Pseudo-likelihood is only supported for binary visible units\n");
427 
428  set_batch_size(visible.num_cols);
429 
430  if (buffer.num_rows==0)
432 
// Pick one random visible unit per batch column to flip.
434  for (int32_t i=0; i<m_batch_size; i++)
435  indices[i] = CMath::random(0,m_num_visible-1);
436 
437 
438  float64_t f1 = free_energy(visible, buffer);
439 
// Flip the chosen bits, evaluate the free energy of the corrupted input...
440  for (int32_t j=0; j<m_batch_size; j++)
441  visible(indices[j],j) = 1.0-visible(indices[j],j);
442 
443  float64_t f2 = free_energy(visible, buffer);
444 
// ...then flip them back so the caller's data is left unchanged.
445  for (int32_t j=0; j<m_batch_size; j++)
446  visible(indices[j],j) = 1.0-visible(indices[j],j);
447 
448  return m_num_visible*CMath::log(1.0/(1+CMath::exp(f1-f2)));
449 }
450 
// Compute P(h=1|v) for each hidden unit and batch column: sigmoid(C + W*v),
// written into `result`.
// NOTE(review): the signature line (listing line 451) is missing from this
// listing.
452 {
453  typedef Eigen::Map<Eigen::MatrixXd> EMatrix;
454  typedef Eigen::Map<Eigen::VectorXd> EVector;
455 
456  EMatrix V(visible.matrix, visible.num_rows, visible.num_cols);
457  EMatrix H(result.matrix, result.num_rows, result.num_cols);
458  EMatrix W(get_weights().matrix, m_num_hidden, m_num_visible);
459  EVector C(get_hidden_bias().vector, m_num_hidden);
460 
// Pre-activation: bias broadcast per column plus the weighted input.
461  H.colwise() = C;
462  H += W*V;
463 
// Element-wise logistic sigmoid applied in place.
464  int32_t len = result.num_rows*result.num_cols;
465  for (int32_t i=0; i<len; i++)
466  result[i] = 1.0/(1.0+CMath::exp(-1.0*result[i]));
467 }
468 
// Compute the visible means given hidden activations: pre-activation
// B + W'h, then per-group nonlinearity (sigmoid for binary groups, softmax
// for the group type handled by the second branch).
// NOTE(review): the signature line (listing line 469) and the two branch
// conditions (listing lines 486 and 492, the unit-type checks) are missing
// from this listing.
470 {
471  typedef Eigen::Map<Eigen::MatrixXd> EMatrix;
472  typedef Eigen::Map<Eigen::VectorXd> EVector;
473 
474  EMatrix H(hidden.matrix, hidden.num_rows, hidden.num_cols);
475  EMatrix V(result.matrix, result.num_rows, result.num_cols);
476  EMatrix W(get_weights().matrix, m_num_hidden, m_num_visible);
477  EVector B(get_visible_bias().vector, m_num_visible);
478 
479  V.colwise() = B;
480  V += W.transpose()*H;
481 
482  for (int32_t k=0; k<m_num_visible_groups; k++)
483  {
484  int32_t offset = m_visible_state_offsets->element(k);
485 
487  {
// Logistic sigmoid for this group's units.
488  for (int32_t i=0; i<m_visible_group_sizes->element(k); i++)
489  for (int32_t j=0; j<m_batch_size; j++)
490  result(i+offset,j) = 1.0/(1.0+CMath::exp(-1.0*result(i+offset,j)));
491  }
493  {
494  // to avoid exponentiating large numbers, the maximum activation is
495  // subtracted from all the activations and the computations are done
496  // in the log domain
497 
498  float64_t max = result(offset,0);
499  for (int32_t i=0; i<m_visible_group_sizes->element(k); i++)
500  for (int32_t j=0; j<m_batch_size; j++)
501  if (result(i+offset,j) > max)
502  max = result(i+offset,j);
503 
// Numerically-stable softmax over the group's units, one column at a time.
504  for (int32_t j=0; j<m_batch_size; j++)
505  {
506  float64_t sum = 0;
507  for (int32_t i=0; i<m_visible_group_sizes->element(k); i++)
508  sum += CMath::exp(result(i+offset,j)-max);
509 
510  float64_t normalizer = CMath::log(sum);
511 
512  for (int32_t i=0; i<m_visible_group_sizes->element(k); i++)
513  result(i+offset,j) =
514  CMath::exp(result(i+offset,j)-max-normalizer);
515  }
516  }
517  }
518 }
519 
// Sample binary hidden states: each unit is set to 1 with probability equal
// to its mean activation.
// NOTE(review): the signature line (listing line 520) is missing from this
// listing.
521 {
522  int32_t length = result.num_rows*result.num_cols;
523  for (int32_t i=0; i<length; i++)
524  result[i] = CMath::random(0.0,1.0) < mean[i];
525 }
526 
// Sample all visible groups by delegating to the per-group overload.
// NOTE(review): the signature line (listing line 527) is missing from this
// listing.
528 {
529  for (int32_t k=0; k<m_num_visible_groups; k++)
530  {
531  sample_visible(k, mean, result);
532  }
533 }
534 
// Sample one visible group: Bernoulli per unit in the first branch, one-hot
// multinomial per column in the second (cumulative-probability draw).
// NOTE(review): the second signature line (listing line 536, the mean/result
// parameters) and the two branch conditions (listing lines 540 and 547, the
// unit-type checks) are missing from this listing. Also listing line 555
// declares `r` as int32_t but assigns CMath::random(0.0,1.0) — in context `r`
// is used as a probability in [0,1], so the int type looks like a truncation
// bug worth checking against the repo.
535 void CRBM::sample_visible(int32_t index,
537 {
538  int32_t offset = m_visible_state_offsets->element(index);
539 
541  {
// Independent Bernoulli sample per unit.
542  for (int32_t i=0; i<m_visible_group_sizes->element(index); i++)
543  for (int32_t j=0; j<m_batch_size; j++)
544  result(i+offset,j) = CMath::random(0.0,1.0) < mean(i+offset,j);
545  }
546 
548  {
// One-hot sample: clear the group, then walk the cumulative distribution
// until the random draw is exceeded and set that single unit to 1.
549  for (int32_t i=0; i<m_visible_group_sizes->element(index); i++)
550  for (int32_t j=0; j<m_batch_size; j++)
551  result(i+offset,j) = 0;
552 
553  for (int32_t j=0; j<m_batch_size; j++)
554  {
555  int32_t r = CMath::random(0.0,1.0);
556  float64_t sum = 0;
557  for (int32_t i=0; i<m_visible_group_sizes->element(index); i++)
558  {
559  sum += mean(i+offset,j);
560  if (r<=sum)
561  {
562  result(i+offset,j) = 1;
563  break;
564  }
565  }
566  }
567  }
568 }
569 
570 
// Return a non-owning matrix view of the weight block, either within the
// RBM's own parameters (p empty) or within a caller-supplied vector p of the
// same layout.
// NOTE(review): the signature line (listing line 571) and the two return
// expressions' first lines (listing lines 574 and 577, giving the base
// pointer/offset into the parameter vector) are missing from this listing.
572 {
573  if (p.vlen==0)
575  m_num_hidden, m_num_visible, false);
576  else
578  m_num_hidden, m_num_visible, false);
579 }
580 
// Return a non-owning vector view of the hidden-bias block of either the
// RBM's own parameters (p empty) or a caller-supplied vector p.
// NOTE(review): the signature line (listing line 581) and the two return
// expressions' first lines (listing lines 584 and 587) are missing from this
// listing.
582 {
583  if (p.vlen==0)
585  m_num_hidden, false);
586  else
588  m_num_hidden, false);
589 }
590 
// Return a non-owning vector view of the visible-bias block (the first
// m_num_visible entries) of either the RBM's own parameters (p empty) or a
// caller-supplied vector p.
// NOTE(review): the signature line (listing line 591) and the first return
// expression (listing line 594, the p-empty case over m_params) are missing
// from this listing.
592 {
593  if (p.vlen==0)
595  else
596  return SGVector<float64_t>(p.vector, m_num_visible, false);
597 }
598 
// Set every member to its default value and register all parameters with
// Shogun's serialization framework via SG_ADD.
// NOTE(review): listing lines 606 (presumably monitoring_method default), 612
// (gd_learning_rate_decay default), and 617-620 (allocation of the visible
// group size/type/offset arrays) are missing from this listing.
599 void CRBM::init()
600 {
// Contrastive-divergence hyperparameter defaults.
601  cd_num_steps = 1;
602  cd_persistent = true;
603  cd_sample_visible = false;
604  l2_coefficient = 0.0;
605  l1_coefficient = 0.0;
607  monitoring_interval = 10;
608 
// Gradient-descent hyperparameter defaults.
609  gd_mini_batch_size = 0;
610  max_num_epochs = 1;
611  gd_learning_rate = 0.1;
613  gd_momentum = 0.9;
614 
// Structural state starts empty until hidden/visible units are configured.
615  m_num_hidden = 0;
616  m_num_visible = 0;
621  m_num_params = 0;
622  m_batch_size = 0;
623 
// Register all members for Shogun's model-selection/serialization machinery.
624  SG_ADD(&cd_num_steps, "cd_num_steps", "Number of CD Steps", MS_NOT_AVAILABLE);
625  SG_ADD(&cd_persistent, "cd_persistent", "Whether to use PCD", MS_NOT_AVAILABLE);
626  SG_ADD(&cd_sample_visible, "sample_visible",
627  "Whether to sample the visible units during (P)CD", MS_NOT_AVAILABLE);
628  SG_ADD(&l2_coefficient, "l2_coefficient",
629  "L2 regularization coeff", MS_NOT_AVAILABLE);
630  SG_ADD(&l1_coefficient, "l1_coefficient",
631  "L1 regularization coeff", MS_NOT_AVAILABLE);
632  SG_ADD((machine_int_t*)&monitoring_method, "monitoring_method",
633  "Monitoring Method", MS_NOT_AVAILABLE);
634  SG_ADD(&monitoring_interval, "monitoring_interval",
635  "Monitoring Interval", MS_NOT_AVAILABLE);
636 
637  SG_ADD(&gd_mini_batch_size, "gd_mini_batch_size",
638  "Gradient Descent Mini-batch size", MS_NOT_AVAILABLE);
639  SG_ADD(&max_num_epochs, "max_num_epochs",
640  "Max number of Epochs", MS_NOT_AVAILABLE);
641  SG_ADD(&gd_learning_rate, "gd_learning_rate",
642  "Gradient descent learning rate", MS_NOT_AVAILABLE);
643  SG_ADD(&gd_learning_rate_decay, "gd_learning_rate_decay",
644  "Gradient descent learning rate decay", MS_NOT_AVAILABLE);
645  SG_ADD(&gd_momentum, "gd_momentum",
646  "Gradient Descent Momentum", MS_NOT_AVAILABLE);
647 
648  SG_ADD(&m_num_hidden, "num_hidden",
649  "Number of Hidden Units", MS_NOT_AVAILABLE);
650  SG_ADD(&m_num_visible, "num_visible",
651  "Number of Visible Units", MS_NOT_AVAILABLE);
652 
653  SG_ADD(&m_num_visible_groups, "num_visible_groups",
654  "Number of Visible Unit Groups", MS_NOT_AVAILABLE);
655  SG_ADD((CSGObject**)&m_visible_group_sizes, "visible_group_sizes",
656  "Sizes of Visible Unit Groups", MS_NOT_AVAILABLE);
657  SG_ADD((CSGObject**)&m_visible_group_types, "visible_group_types",
658  "Types of Visible Unit Groups", MS_NOT_AVAILABLE);
659  SG_ADD((CSGObject**)&m_visible_state_offsets, "visible_group_index_offsets",
660  "State Index offsets of Visible Unit Groups", MS_NOT_AVAILABLE);
661 
662  SG_ADD(&m_num_params, "num_params",
663  "Number of Parameters", MS_NOT_AVAILABLE);
664  SG_ADD(&m_params, "params", "Parameters", MS_NOT_AVAILABLE);
665 }
666 
667 #endif

SHOGUN Machine Learning Toolbox - Documentation