MultiLaplaceInferenceMethod.cpp
/*
 * Copyright (c) The Shogun Machine Learning Toolbox
 * Written (w) 2014 Wu Lin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are those
 * of the authors and should not be interpreted as representing official policies,
 * either expressed or implied, of the Shogun Development Team.
 *
 *
 * Code adapted from
 * https://gist.github.com/yorkerlin/14ace49f2278f3859614
 * Gaussian Process Machine Learning Toolbox
 * http://www.gaussianprocess.org/gpml/code/matlab/doc/
 * and
 * GPstuff - Gaussian process models for Bayesian analysis
 * http://becs.aalto.fi/en/research/bayes/gpstuff/
 *
 * The reference pseudo code is Algorithm 3.3 of the GPML textbook
 */

#include <shogun/machine/gp/MultiLaplaceInferenceMethod.h>

#include <shogun/machine/gp/SoftMaxLikelihood.h>
#include <shogun/mathematics/Math.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/lib/external/brent.h>

using namespace shogun;
using namespace Eigen;

namespace shogun
{

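// CMultiPsiLine evaluates the objective Psi(alpha + x*dalpha) as a function
// of the step size x, so that Brent's minimizer (local_min() from
// lib/external/brent.h) can run the line search inside update_alpha(); a
// sketch of the call it serves:
//
//   CMultiPsiLine func;            // K, dalpha, start_alpha, ... filled in
//   float64_t x;
//   float64_t psi=local_min(0, m_opt_max, m_opt_tolerance, func, x);
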
#ifndef DOXYGEN_SHOULD_SKIP_THIS

/** Wrapper class used for the Brent minimizer */
class CMultiPsiLine : public func_base
{
public:
    float64_t log_scale;
    MatrixXd K;
    VectorXd dalpha;
    VectorXd start_alpha;
    Map<VectorXd>* alpha;
    SGVector<float64_t>* dlp;
    SGVector<float64_t>* f;
    SGVector<float64_t>* m;
    CLikelihoodModel* lik;
    CLabels* lab;

    virtual double operator() (double x)
    {
        const index_t C=((CMulticlassLabels*)lab)->get_num_classes();
        const index_t n=((CMulticlassLabels*)lab)->get_num_labels();
        Map<VectorXd> eigen_f(f->vector, f->vlen);
        Map<VectorXd> eigen_m(m->vector, m->vlen);

        // compute alpha=alpha+x*dalpha and f=K*alpha+m
        (*alpha)=start_alpha+x*dalpha;

        float64_t result=0;
        for(index_t bl=0; bl<C; bl++)
        {
            eigen_f.block(bl*n,0,n,1)=K*alpha->block(bl*n,0,n,1)*CMath::exp(log_scale*2.0);
            result+=alpha->block(bl*n,0,n,1).dot(eigen_f.block(bl*n,0,n,1))/2.0;
            eigen_f.block(bl*n,0,n,1)+=eigen_m;
        }

        // get first derivative of log likelihood
        (*dlp)=lik->get_log_probability_derivative_f(lab, (*f), 1);

        result-=SGVector<float64_t>::sum(lik->get_log_probability_f(lab, *f));

        return result;
    }
};

#endif /* DOXYGEN_SHOULD_SKIP_THIS */

CMultiLaplaceInferenceMethod::CMultiLaplaceInferenceMethod() : CLaplaceInference()
{
    init();
}

CMultiLaplaceInferenceMethod::CMultiLaplaceInferenceMethod(CKernel* kern,
        CFeatures* feat, CMeanFunction* m, CLabels* lab, CLikelihoodModel* mod)
        : CLaplaceInference(kern, feat, m, lab, mod)
{
    init();
}
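
// A minimal usage sketch (the companion classes here are assumptions:
// CGaussianKernel, CZeroMean, CSoftMaxLikelihood and
// CGaussianProcessClassification from the surrounding toolbox):
//
//   CZeroMean* mean=new CZeroMean();
//   CSoftMaxLikelihood* lik=new CSoftMaxLikelihood();
//   CMultiLaplaceInferenceMethod* inf=new CMultiLaplaceInferenceMethod(
//       new CGaussianKernel(10, 2.0), feats, mean, labels, lik);
//   CGaussianProcessClassification* gpc=new CGaussianProcessClassification(inf);
//   gpc->train();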

void CMultiLaplaceInferenceMethod::init()
{
    m_iter=20;
    m_tolerance=1e-6;
    m_opt_tolerance=1e-10;
    m_opt_max=10;

    m_nlz=0;
    SG_ADD(&m_nlz, "nlz", "negative log marginal likelihood", MS_NOT_AVAILABLE);
    SG_ADD(&m_U, "U", "the matrix used to compute gradient wrt hyperparameters", MS_NOT_AVAILABLE);

    SG_ADD(&m_tolerance, "tolerance", "amount of tolerance for Newton's iterations", MS_NOT_AVAILABLE);
    SG_ADD(&m_iter, "iter", "max Newton's iterations", MS_NOT_AVAILABLE);
    SG_ADD(&m_opt_tolerance, "opt_tolerance", "amount of tolerance for Brent's minimization method", MS_NOT_AVAILABLE);
    SG_ADD(&m_opt_max, "opt_max", "max iterations for Brent's minimization method", MS_NOT_AVAILABLE);
}

CMultiLaplaceInferenceMethod::~CMultiLaplaceInferenceMethod()
{
}

void CMultiLaplaceInferenceMethod::check_members() const
{
    CInference::check_members();

    REQUIRE(m_labels->get_label_type()==LT_MULTICLASS,
        "Labels must be type of CMulticlassLabels\n");
    REQUIRE(m_model->supports_multiclass(),
        "likelihood model should support multi-classification\n");
}

SGVector<float64_t> CMultiLaplaceInferenceMethod::get_diagonal_vector()
{
    if (parameter_hash_changed())
        update();

    get_dpi_helper();

    return SGVector<float64_t>(m_W);
}

float64_t CMultiLaplaceInferenceMethod::get_negative_log_marginal_likelihood()
{
    if (parameter_hash_changed())
        update();

    return m_nlz;
}

SGVector<float64_t> CMultiLaplaceInferenceMethod::get_derivative_wrt_likelihood_model(
    const TParameter* param)
{
    //SoftMax likelihood does not have this kind of derivative
    SG_ERROR("Not Implemented!\n");
    return SGVector<float64_t> ();
}

CMultiLaplaceInferenceMethod* CMultiLaplaceInferenceMethod::obtain_from_generic(
    CInference* inference)
{
    if (inference==NULL)
        return NULL;

    if (inference->get_inference_type()!=INF_LAPLACE_MULTIPLE)
        SG_SERROR("Provided inference is not of type CMultiLaplaceInferenceMethod!\n")

    SG_REF(inference);
    return (CMultiLaplaceInferenceMethod*)inference;
}
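// Sketch of the algebra behind update_approx_cov(), expanding the comment
// inside it: with the per-class factors E_c and the Cholesky factor M
// (M'*M = sum_c E_c) left behind by update_alpha(), and K_s = exp(2*log_scale)*K,
//
//   Sigma = K_s - K_s*(E - E*R*(M'*M)^{-1}*R'*E)*K_s,
//
// where R stacks C identity blocks. The loop accumulates the block-diagonal
// terms (I - K_s*E_c)*K_s, and the final triangular solve adds the low-rank
// correction V'*V with V = M^{-T}*U', U_c = K_s*E_c.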
void CMultiLaplaceInferenceMethod::update_approx_cov()
{
    //Sigma=K-K*(E-E*R(M*M')^{-1}*R'*E)*K
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<MatrixXd> eigen_M(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);
    Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);

    m_Sigma=SGMatrix<float64_t>(C*n, C*n);
    Map<MatrixXd> eigen_Sigma(m_Sigma.matrix, m_Sigma.num_rows, m_Sigma.num_cols);
    eigen_Sigma.fill(0);

    MatrixXd eigen_U(C*n,n);
    for(index_t bl=0; bl<C; bl++)
    {
        eigen_U.block(bl*n,0,n,n)=eigen_K*CMath::exp(m_log_scale*2.0)*eigen_E.block(0,bl*n,n,n);
        eigen_Sigma.block(bl*n,bl*n,n,n)=(MatrixXd::Identity(n,n)-eigen_U.block(bl*n,0,n,n))*(eigen_K*CMath::exp(m_log_scale*2.0));
    }
    MatrixXd eigen_V=eigen_M.triangularView<Upper>().adjoint().solve(eigen_U.transpose());
    eigen_Sigma+=eigen_V.transpose()*eigen_V;
}

void CMultiLaplaceInferenceMethod::update_chol()
{
}

void CMultiLaplaceInferenceMethod::get_dpi_helper()
{
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<VectorXd> eigen_dpi(m_W.vector, m_W.vlen);
    Map<MatrixXd> eigen_dpi_matrix(eigen_dpi.data(),n,C);

    Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);
    Map<MatrixXd> eigen_mu_matrix(eigen_mu.data(),n,C);
    // with log_sum_exp trick
    VectorXd max_coeff=eigen_mu_matrix.rowwise().maxCoeff();
    eigen_dpi_matrix=eigen_mu_matrix.array().colwise()-max_coeff.array();
    VectorXd log_sum_exp=((eigen_dpi_matrix.array().exp()).rowwise().sum()).array().log();
    eigen_dpi_matrix=(eigen_dpi_matrix.array().colwise()-log_sum_exp.array()).exp();

    // without log_sum_exp trick
    //eigen_dpi_matrix=eigen_mu_matrix.array().exp();
    //VectorXd tmp_for_dpi=eigen_dpi_matrix.rowwise().sum();
    //eigen_dpi_matrix=eigen_dpi_matrix.array().colwise()/tmp_for_dpi.array();
}
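
// Worked example of the log_sum_exp trick above (illustrative numbers, not
// from the code): for one sample with latent row f = (1000, 1001),
// exp(1000) overflows a double, but with max_coeff = 1001,
//
//   pi_c = exp(f_c - 1001 - log(exp(-1) + exp(0)))
//
// yields pi = (0.269, 0.731) with every exponent kept in a safe range.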
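// update_alpha() is the Newton loop of Algorithm 3.3 in the GPML textbook.
// A sketch of the mapping, in that notation: pi is the softmax output,
// W = diag(pi) - Pi*Pi' the likelihood Hessian, and per class block
//
//   B_c = I + sD_c*K_s*sD_c,  sD_c = diag(sqrt(pi_c)),
//   E_c = sD_c*B_c^{-1}*sD_c,  M'*M = sum_c E_c  (M upper triangular),
//
// so the Newton direction follows the matrix inversion lemma:
//
//   b = W*(f - m) + dlp,  c = E*K_s*b,
//   dalpha = b - c + E'*(M'*M)^{-1}*(R'*c) - alpha,
//
// with the step size then chosen by Brent's method on Psi (CMultiPsiLine).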
void CMultiLaplaceInferenceMethod::update_alpha()
{
    float64_t Psi_Old = CMath::INFTY;
    float64_t Psi_New;
    float64_t Psi_Def;
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();

    // get mean vector and create eigen representation of it
    SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
    Map<VectorXd> eigen_mean_bl(mean.vector, mean.vlen);
    VectorXd eigen_mean=eigen_mean_bl.replicate(C,1);

    // create eigen representation of kernel matrix
    Map<MatrixXd> eigen_ktrtr(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);

    // create shogun and eigen representation of function vector
    m_mu=SGVector<float64_t>(mean.vlen*C);
    Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);

    // f = mean as default value
    eigen_mu=eigen_mean;

    Psi_Def=-SGVector<float64_t>::sum(m_model->get_log_probability_f(
        m_labels, m_mu));

    if (m_alpha.vlen!=C*n)
    {
        // set alpha to a zero vector
        m_alpha=SGVector<float64_t>(C*n);
        m_alpha.zero();
        Psi_New=Psi_Def;
        m_E=SGMatrix<float64_t>(n,C*n);
        m_L=SGMatrix<float64_t>(n,n);
        m_W=SGVector<float64_t>(C*n);
    }
    else
    {
        Map<VectorXd> alpha(m_alpha.vector, m_alpha.vlen);
        for(index_t bl=0; bl<C; bl++)
            eigen_mu.block(bl*n,0,n,1)=eigen_ktrtr*CMath::exp(m_log_scale*2.0)*alpha.block(bl*n,0,n,1);

        //alpha'*(f-m)/2.0
        Psi_New=alpha.dot(eigen_mu)/2.0;
        // compute f = K * alpha + m
        eigen_mu+=eigen_mean;

        Psi_New-=SGVector<float64_t>::sum(m_model->get_log_probability_f(m_labels, m_mu));

        // if default is better, then use it
        if (Psi_Def < Psi_New)
        {
            m_alpha.zero();
            eigen_mu=eigen_mean;
            Psi_New=Psi_Def;
        }
    }

    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
    Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);

    // get first derivative of log probability function
    m_dlp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 1);

    index_t iter=0;
    Map<MatrixXd> & eigen_M=eigen_L;

    while (Psi_Old-Psi_New>m_tolerance && iter<m_iter)
    {
        Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);

        get_dpi_helper();
        Map<VectorXd> eigen_dpi(m_W.vector, m_W.vlen);

        Psi_Old = Psi_New;
        iter++;

        m_nlz=0;

        for(index_t bl=0; bl<C; bl++)
        {
            VectorXd eigen_sD=eigen_dpi.block(bl*n,0,n,1).cwiseSqrt();
            LLT<MatrixXd> chol_tmp((eigen_sD*eigen_sD.transpose()).cwiseProduct(eigen_ktrtr*CMath::exp(m_log_scale*2.0))+
                MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols));
            MatrixXd eigen_L_tmp=chol_tmp.matrixU();
            MatrixXd eigen_E_bl=eigen_L_tmp.triangularView<Upper>().adjoint().solve(MatrixXd(eigen_sD.asDiagonal()));
            eigen_E_bl=eigen_E_bl.transpose()*eigen_E_bl;
            eigen_E.block(0,bl*n,n,n)=eigen_E_bl;
            if (bl==0)
                eigen_M=eigen_E_bl;
            else
                eigen_M+=eigen_E_bl;
            m_nlz+=eigen_L_tmp.diagonal().array().log().sum();
        }

        LLT<MatrixXd> chol_tmp(eigen_M);
        eigen_M = chol_tmp.matrixU();
        m_nlz+=eigen_M.diagonal().array().log().sum();

        VectorXd eigen_b=eigen_dlp;
        Map<VectorXd> & tmp1=eigen_dlp;
        tmp1=eigen_dpi.array()*(eigen_mu-eigen_mean).array();
        Map<MatrixXd> m_tmp(tmp1.data(),n,C);
        VectorXd tmp2=m_tmp.array().rowwise().sum();

        for(index_t bl=0; bl<C; bl++)
            eigen_b.block(bl*n,0,n,1)+=eigen_dpi.block(bl*n,0,n,1).cwiseProduct(eigen_mu.block(bl*n,0,n,1)-eigen_mean_bl-tmp2);

        Map<VectorXd> &eigen_c=eigen_W;
        for(index_t bl=0; bl<C; bl++)
            eigen_c.block(bl*n,0,n,1)=eigen_E.block(0,bl*n,n,n)*(eigen_ktrtr*CMath::exp(m_log_scale*2.0)*eigen_b.block(bl*n,0,n,1));

        Map<MatrixXd> c_tmp(eigen_c.data(),n,C);

        VectorXd tmp3=c_tmp.array().rowwise().sum();
        VectorXd tmp4=eigen_M.triangularView<Upper>().adjoint().solve(tmp3);

        VectorXd &eigen_dalpha=eigen_b;
        eigen_dalpha+=eigen_E.transpose()*(eigen_M.triangularView<Upper>().solve(tmp4))-eigen_c-eigen_alpha;

        // perform Brent's optimization
        CMultiPsiLine func;

        func.log_scale=m_log_scale;
        func.K=eigen_ktrtr;
        func.dalpha=eigen_dalpha;
        func.start_alpha=eigen_alpha;
        func.alpha=&eigen_alpha;
        func.dlp=&m_dlp;
        func.f=&m_mu;
        func.m=&mean;
        func.lik=m_model;
        func.lab=m_labels;

        float64_t x;
        Psi_New=local_min(0, m_opt_max, m_opt_tolerance, func, x);
        m_nlz+=Psi_New;
    }

    if (Psi_Old-Psi_New>m_tolerance && iter>=m_iter)
    {
        SG_WARNING("Max iterations (%d) reached, but convergence level (%f) is not yet below tolerance (%f)\n", m_iter, Psi_Old-Psi_New, m_tolerance);
    }
}

void CMultiLaplaceInferenceMethod::update_deriv()
{
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    m_U=SGMatrix<float64_t>(n, n*C);
    Map<MatrixXd> eigen_U(m_U.matrix, m_U.num_rows, m_U.num_cols);
    Map<MatrixXd> eigen_M(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);
    eigen_U=eigen_M.triangularView<Upper>().adjoint().solve(eigen_E);
}

float64_t CMultiLaplaceInferenceMethod::get_derivative_helper(SGMatrix<float64_t> dK)
{
    Map<MatrixXd> eigen_dK(dK.matrix, dK.num_rows, dK.num_cols);
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<MatrixXd> eigen_U(m_U.matrix, m_U.num_rows, m_U.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    float64_t result=0;
    //currently only the explicit term is computed
    for(index_t bl=0; bl<C; bl++)
    {
        result+=((eigen_E.block(0,bl*n,n,n)-eigen_U.block(0,bl*n,n,n).transpose()*eigen_U.block(0,bl*n,n,n)).array()
            *eigen_dK.array()).sum();
        result-=(eigen_dK*eigen_alpha.block(bl*n,0,n,1)).dot(eigen_alpha.block(bl*n,0,n,1));
    }

    return result/2.0;
}
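
// Sketch of the identity used above (the explicit term of the marginal
// likelihood gradient, cf. GPML eq. 5.9): for a kernel direction dK,
//
//   dnlZ = sum_c [ sum((E_c - U_c'*U_c) .* dK) - alpha_c'*dK*alpha_c ] / 2,
//
// where U = M^{-T}*E comes from update_deriv(), so E_c - U_c'*U_c plays the
// role that W^{1/2}*B^{-1}*W^{1/2} plays in the single-class Laplace case.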

SGVector<float64_t> CMultiLaplaceInferenceMethod::get_derivative_wrt_inference_method(
    const TParameter* param)
{
    REQUIRE(!strcmp(param->m_name, "log_scale"), "Can't compute derivative of "
        "the negative log marginal likelihood wrt %s.%s parameter\n",
        get_name(), param->m_name)

    SGVector<float64_t> result(1);

    // compute derivative K wrt scale

    result[0]=get_derivative_helper(m_ktrtr);
    result[0]*=CMath::exp(m_log_scale*2.0)*2.0;

    return result;
}
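
// The factor exp(m_log_scale*2.0)*2.0 applied above is the chain rule for
// the scale parameter: the kernel always enters as K_s = exp(2*log_scale)*K,
// hence dK_s/d(log_scale) = 2*exp(2*log_scale)*K, while
// get_derivative_helper() is called with the unscaled m_ktrtr.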

SGVector<float64_t> CMultiLaplaceInferenceMethod::get_derivative_wrt_kernel(
    const TParameter* param)
{
    // compute derivative of K wrt each element of the kernel parameter
    REQUIRE(param, "Param not set\n");
    SGVector<float64_t> result;
    int64_t len=const_cast<TParameter *>(param)->m_datatype.get_num_elements();
    result=SGVector<float64_t>(len);

    for (index_t i=0; i<result.vlen; i++)
    {
        SGMatrix<float64_t> dK;

        if (result.vlen==1)
            dK=m_kernel->get_parameter_gradient(param);
        else
            dK=m_kernel->get_parameter_gradient(param, i);

        result[i]=get_derivative_helper(dK);
        result[i]*=CMath::exp(m_log_scale*2.0);
    }

    return result;
}

SGVector<float64_t> CMultiLaplaceInferenceMethod::get_derivative_wrt_mean(
    const TParameter* param)
{
    // create eigen representation of alpha
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();

    REQUIRE(param, "Param not set\n");
    SGVector<float64_t> result;
    int64_t len=const_cast<TParameter *>(param)->m_datatype.get_num_elements();
    result=SGVector<float64_t>(len);

    for (index_t i=0; i<result.vlen; i++)
    {
        SGVector<float64_t> dmu;

        if (result.vlen==1)
            dmu=m_mean->get_parameter_derivative(m_features, param);
        else
            dmu=m_mean->get_parameter_derivative(m_features, param, i);

        Map<VectorXd> eigen_dmu(dmu.vector, dmu.vlen);

        result[i]=0;
        //currently only compute the explicit term
        for(index_t bl=0; bl<C; bl++)
            result[i]-=eigen_alpha.block(bl*n,0,n,1).dot(eigen_dmu);
    }

    return result;
}
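
// The loop above is the explicit term of the mean gradient: at the mode,
// dnlZ/dm = -K_s^{-1}*(f_hat - m) = -alpha, so for a mean shared across the
// C latent functions dnlZ/dtheta = -sum_c alpha_c'*(dm/dtheta).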

SGVector<float64_t> CMultiLaplaceInferenceMethod::get_posterior_mean()
{
    compute_gradient();

    SGVector<float64_t> res(m_mu.vlen);
    Map<VectorXd> eigen_res(res.vector, res.vlen);
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();

    SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
    Map<VectorXd> eigen_mean_bl(mean.vector, mean.vlen);
    VectorXd eigen_mean=eigen_mean_bl.replicate(C,1);

    Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);
    eigen_res=eigen_mu-eigen_mean;

    return res;
}
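
// Note: at convergence f_hat = K_s*alpha + m, so the centered vector
// returned here equals K_s*alpha; together with the covariance from
// update_approx_cov() it defines the approximate Gaussian posterior used
// for prediction.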

}