SHOGUN 4.1.0
MultiLaplacianInferenceMethod.cpp
Go to the documentation of this file.
/*
 * Copyright (c) The Shogun Machine Learning Toolbox
 * Written (w) 2014 Wu Lin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are those
 * of the authors and should not be interpreted as representing official policies,
 * either expressed or implied, of the Shogun Development Team.
 *
 *
 * Code adapted from
 * https://gist.github.com/yorkerlin/14ace49f2278f3859614
 * Gaussian Process Machine Learning Toolbox
 * http://www.gaussianprocess.org/gpml/code/matlab/doc/
 * and
 * GPstuff - Gaussian process models for Bayesian analysis
 * http://becs.aalto.fi/en/research/bayes/gpstuff/
 *
 * The reference pseudo code is the algorithm 3.3 of the GPML textbook
 */

#include <shogun/machine/gp/MultiLaplacianInferenceMethod.h>

#ifdef HAVE_EIGEN3

#include <shogun/mathematics/Math.h>
#include <shogun/mathematics/eigen3.h>
#include <shogun/lib/external/brent.h>

using namespace shogun;
using namespace Eigen;

namespace shogun
{

#ifndef DOXYGEN_SHOULD_SKIP_THIS

/** Wrapper class used for the Brent minimizer */
class CMultiPsiLine : public func_base
{
public:
    float64_t log_scale;
    MatrixXd K;
    VectorXd dalpha;
    VectorXd start_alpha;
    Map<VectorXd>* alpha;
    SGVector<float64_t>* dlp;
    SGVector<float64_t>* f;
    SGVector<float64_t>* m;
    CLikelihoodModel* lik;
    CLabels* lab;

    virtual double operator() (double x)
    {
        const index_t C=((CMulticlassLabels*)lab)->get_num_classes();
        const index_t n=((CMulticlassLabels*)lab)->get_num_labels();
        Map<VectorXd> eigen_f(f->vector, f->vlen);
        Map<VectorXd> eigen_m(m->vector, m->vlen);

        // compute alpha=alpha+x*dalpha and f=K*alpha+m
        (*alpha)=start_alpha+x*dalpha;

        float64_t result=0;
        for(index_t bl=0; bl<C; bl++)
        {
            eigen_f.block(bl*n,0,n,1)=K*alpha->block(bl*n,0,n,1)*CMath::exp(log_scale*2.0);
            result+=alpha->block(bl*n,0,n,1).dot(eigen_f.block(bl*n,0,n,1))/2.0;
            eigen_f.block(bl*n,0,n,1)+=eigen_m;
        }

        // get first derivative of log likelihood
        (*dlp)=lik->get_log_probability_derivative_f(lab, (*f), 1);

        result-=SGVector<float64_t>::sum(lik->get_log_probability_f(lab, *f));

        return result;
    }
};
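
// CMultiPsiLine evaluates, for a step size x along the Newton direction, the
// objective Psi(x) = alpha_x'*(f - m)/2 - log p(y|f), where
// alpha_x = start_alpha + x*dalpha and f = K*alpha_x*exp(2*log_scale) + m is
// assembled block-wise over the C classes. Brent's minimizer (local_min from
// brent.h) searches over x in update_alpha() below.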

#endif /* DOXYGEN_SHOULD_SKIP_THIS */

CMultiLaplacianInferenceMethod::CMultiLaplacianInferenceMethod() : CLaplacianInferenceBase()
{
    init();
}

CMultiLaplacianInferenceMethod::CMultiLaplacianInferenceMethod(CKernel* kern,
    CFeatures* feat, CMeanFunction* m, CLabels* lab, CLikelihoodModel* mod)
    : CLaplacianInferenceBase(kern, feat, m, lab, mod)
{
    init();
}

void CMultiLaplacianInferenceMethod::init()
{
    m_nlz=0;
    SG_ADD(&m_nlz, "nlz", "negative log marginal likelihood", MS_NOT_AVAILABLE);
    SG_ADD(&m_U, "U", "the matrix used to compute gradient wrt hyperparameters", MS_NOT_AVAILABLE);
}

CMultiLaplacianInferenceMethod::~CMultiLaplacianInferenceMethod()
{
}

void CMultiLaplacianInferenceMethod::check_members() const
{
    CLaplacianInferenceBase::check_members();

    REQUIRE(m_labels->get_label_type()==LT_MULTICLASS,
        "Labels must be of type CMulticlassLabels\n");
    REQUIRE(m_model->supports_multiclass(),
        "Likelihood model should support multi-class classification\n");
}

SGVector<float64_t> CMultiLaplacianInferenceMethod::get_diagonal_vector()
{
    if (parameter_hash_changed())
        update();

    get_dpi_helper();

    return SGVector<float64_t>(m_W);
}

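// Note: for the multiclass Laplace approximation, the returned diagonal vector
// is the stacked softmax probability vector pi, which get_dpi_helper()
// recomputes into m_W from the current posterior mean m_mu.
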
float64_t CMultiLaplacianInferenceMethod::get_negative_log_marginal_likelihood()
{
    if (parameter_hash_changed())
        update();

    return m_nlz;
}

SGVector<float64_t> CMultiLaplacianInferenceMethod::get_derivative_wrt_likelihood_model(
    const TParameter* param)
{
    // the SoftMax likelihood does not have this kind of derivative
    SG_ERROR("Not Implemented!\n");
    return SGVector<float64_t>();
}

CMultiLaplacianInferenceMethod* CMultiLaplacianInferenceMethod::obtain_from_generic(
    CInferenceMethod* inference)
{
    if (inference==NULL)
        return NULL;

    if (inference->get_inference_type()!=INF_LAPLACIAN_MULTIPLE)
        SG_SERROR("Provided inference is not of type CMultiLaplacianInferenceMethod!\n")

    SG_REF(inference);
    return (CMultiLaplacianInferenceMethod*)inference;
}

void CMultiLaplacianInferenceMethod::update_approx_cov()
{
    //Sigma=K-K*(E-E*R(M*M')^{-1}*R'*E)*K
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
    Map<MatrixXd> eigen_M(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);

    m_Sigma=SGMatrix<float64_t>(C*n, C*n);
    Map<MatrixXd> eigen_Sigma(m_Sigma.matrix, m_Sigma.num_rows, m_Sigma.num_cols);
    eigen_Sigma.fill(0);

    MatrixXd eigen_U(C*n,n);
    for(index_t bl=0; bl<C; bl++)
    {
        eigen_U.block(bl*n,0,n,n)=eigen_K*CMath::exp(m_log_scale*2.0)*eigen_E.block(0,bl*n,n,n);
        eigen_Sigma.block(bl*n,bl*n,n,n)=(MatrixXd::Identity(n,n)-eigen_U.block(bl*n,0,n,n))*(eigen_K*CMath::exp(m_log_scale*2.0));
    }
    MatrixXd eigen_V=eigen_M.triangularView<Upper>().adjoint().solve(eigen_U.transpose());
    eigen_Sigma+=eigen_V.transpose()*eigen_V;
}
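
// In update_approx_cov() above, the loop fills the diagonal blocks
// (I - K*E_c)*K of Sigma = K - K*(E - E*R*(R'*E*R)^{-1}*R'*E)*K, where
// R = [I; ...; I] stacks the C identity blocks. Since M is the upper Cholesky
// factor of R'*E*R = sum_c E_c, the solve V = (M')^{-1}*U' yields the
// remaining correction V'*V = U*(sum_c E_c)^{-1}*U' for all blocks at once.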

void CMultiLaplacianInferenceMethod::update_chol()
{
}

void CMultiLaplacianInferenceMethod::get_dpi_helper()
{
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<VectorXd> eigen_dpi(m_W.vector, m_W.vlen);
    Map<MatrixXd> eigen_dpi_matrix(eigen_dpi.data(),n,C);

    Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);
    Map<MatrixXd> eigen_mu_matrix(eigen_mu.data(),n,C);
    // with log_sum_exp trick
    VectorXd max_coeff=eigen_mu_matrix.rowwise().maxCoeff();
    eigen_dpi_matrix=eigen_mu_matrix.array().colwise()-max_coeff.array();
    VectorXd log_sum_exp=((eigen_dpi_matrix.array().exp()).rowwise().sum()).array().log();
    eigen_dpi_matrix=(eigen_dpi_matrix.array().colwise()-log_sum_exp.array()).exp();

    // without log_sum_exp trick
    //eigen_dpi_matrix=eigen_mu_matrix.array().exp();
    //VectorXd tmp_for_dpi=eigen_dpi_matrix.rowwise().sum();
    //eigen_dpi_matrix=eigen_dpi_matrix.array().colwise()/tmp_for_dpi.array();
}

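// get_dpi_helper() computes pi_ic = exp(mu_ic)/sum_k exp(mu_ik) row-wise in a
// numerically stable way: the row maximum is subtracted before exponentiating,
// so pi_ic = exp(mu_ic - max_i - log(sum_k exp(mu_ik - max_i))) and no
// exponential can overflow. The commented-out variant inside the function is
// the direct, overflow-prone form.
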
void CMultiLaplacianInferenceMethod::update_alpha()
{
    float64_t Psi_Old = CMath::INFTY;
    float64_t Psi_New;
    float64_t Psi_Def;
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();

    // get mean vector and create eigen representation of it
    SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
    Map<VectorXd> eigen_mean_bl(mean.vector, mean.vlen);
    VectorXd eigen_mean=eigen_mean_bl.replicate(C,1);

    // create eigen representation of kernel matrix
    Map<MatrixXd> eigen_ktrtr(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);

    // create shogun and eigen representation of function vector
    m_mu=SGVector<float64_t>(mean.vlen*C);
    Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);

    // f = mean as default value
    eigen_mu=eigen_mean;

    Psi_Def=-SGVector<float64_t>::sum(m_model->get_log_probability_f(
        m_labels, m_mu));

    if (m_alpha.vlen!=C*n)
    {
        // set alpha to a zero vector
        m_alpha=SGVector<float64_t>(C*n);
        m_alpha.zero();
        Psi_New=Psi_Def;
        m_E=SGMatrix<float64_t>(n,C*n);
        m_L=SGMatrix<float64_t>(n,n);
        m_W=SGVector<float64_t>(C*n);
    }
    else
    {
        Map<VectorXd> alpha(m_alpha.vector, m_alpha.vlen);
        for(index_t bl=0; bl<C; bl++)
            eigen_mu.block(bl*n,0,n,1)=eigen_ktrtr*CMath::exp(m_log_scale*2.0)*alpha.block(bl*n,0,n,1);

        //alpha'*(f-m)/2.0
        Psi_New=alpha.dot(eigen_mu)/2.0;
        // compute f = K * alpha + m
        eigen_mu+=eigen_mean;

        Psi_New-=SGVector<float64_t>::sum(m_model->get_log_probability_f(m_labels, m_mu));

        // if default is better, then use it
        if (Psi_Def < Psi_New)
        {
            m_alpha.zero();
            eigen_mu=eigen_mean;
            Psi_New=Psi_Def;
        }
    }

    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);
    Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);

    // get first derivative of log probability function
    m_dlp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 1);

    index_t iter=0;
    Map<MatrixXd>& eigen_M=eigen_L;

    while (Psi_Old-Psi_New>m_tolerance && iter<m_iter)
    {
        Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);

        get_dpi_helper();
        Map<VectorXd> eigen_dpi(m_W.vector, m_W.vlen);

        Psi_Old = Psi_New;
        iter++;

        m_nlz=0;

        for(index_t bl=0; bl<C; bl++)
        {
            VectorXd eigen_sD=eigen_dpi.block(bl*n,0,n,1).cwiseSqrt();
            LLT<MatrixXd> chol_tmp((eigen_sD*eigen_sD.transpose()).cwiseProduct(eigen_ktrtr*CMath::exp(m_log_scale*2.0))+
                MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols));
            MatrixXd eigen_L_tmp=chol_tmp.matrixU();
            MatrixXd eigen_E_bl=eigen_L_tmp.triangularView<Upper>().adjoint().solve(MatrixXd(eigen_sD.asDiagonal()));
            eigen_E_bl=eigen_E_bl.transpose()*eigen_E_bl;
            eigen_E.block(0,bl*n,n,n)=eigen_E_bl;
            if (bl==0)
                eigen_M=eigen_E_bl;
            else
                eigen_M+=eigen_E_bl;
            m_nlz+=eigen_L_tmp.diagonal().array().log().sum();
        }

        LLT<MatrixXd> chol_tmp(eigen_M);
        eigen_M=chol_tmp.matrixU();
        m_nlz+=eigen_M.diagonal().array().log().sum();

        VectorXd eigen_b=eigen_dlp;
        Map<VectorXd>& tmp1=eigen_dlp;
        tmp1=eigen_dpi.array()*(eigen_mu-eigen_mean).array();
        Map<MatrixXd> m_tmp(tmp1.data(),n,C);
        VectorXd tmp2=m_tmp.array().rowwise().sum();

        for(index_t bl=0; bl<C; bl++)
            eigen_b.block(bl*n,0,n,1)+=eigen_dpi.block(bl*n,0,n,1).cwiseProduct(eigen_mu.block(bl*n,0,n,1)-eigen_mean_bl-tmp2);

        Map<VectorXd>& eigen_c=eigen_W;
        for(index_t bl=0; bl<C; bl++)
            eigen_c.block(bl*n,0,n,1)=eigen_E.block(0,bl*n,n,n)*(eigen_ktrtr*CMath::exp(m_log_scale*2.0)*eigen_b.block(bl*n,0,n,1));

        Map<MatrixXd> c_tmp(eigen_c.data(),n,C);

        VectorXd tmp3=c_tmp.array().rowwise().sum();
        VectorXd tmp4=eigen_M.triangularView<Upper>().adjoint().solve(tmp3);

        VectorXd& eigen_dalpha=eigen_b;
        eigen_dalpha+=eigen_E.transpose()*(eigen_M.triangularView<Upper>().solve(tmp4))-eigen_c-eigen_alpha;

        // perform Brent's optimization
        CMultiPsiLine func;

        func.log_scale=m_log_scale;
        func.K=eigen_ktrtr;
        func.dalpha=eigen_dalpha;
        func.start_alpha=eigen_alpha;
        func.alpha=&eigen_alpha;
        func.dlp=&m_dlp;
        func.f=&m_mu;
        func.m=&mean;
        func.lik=m_model;
        func.lab=m_labels;

        float64_t x;
        Psi_New=local_min(0, m_opt_max, m_opt_tolerance, func, x);
        m_nlz+=Psi_New;
    }

    if (Psi_Old-Psi_New>m_tolerance && iter>=m_iter)
    {
        SG_WARNING("Max iterations (%d) reached, but convergence level (%f) is not yet below tolerance (%f)\n", m_iter, Psi_Old-Psi_New, m_tolerance);
    }
}
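
// Each pass of the while loop above is one damped Newton step, following
// algorithm 3.3 of the GPML textbook cited in the header: per class,
// E_c = sqrt(pi_c)*(I + sqrt(pi_c)*K*sqrt(pi_c))^{-1}*sqrt(pi_c) is formed
// from a Cholesky factorization, M holds the upper Cholesky factor of
// sum_c E_c, and the Newton direction is
//   dalpha = b - c + E*R*(R'*E*R)^{-1}*R'*c - alpha,
// with b = W*(f - m) + dlp (W = diag(pi) - Pi*Pi') and c = E*K*b. Instead of
// a unit step, the step size is chosen by Brent's line search on Psi, and
// m_nlz accumulates the Cholesky log-determinants plus the line-search
// minimum Psi_New.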

void CMultiLaplacianInferenceMethod::update_deriv()
{
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    m_U=SGMatrix<float64_t>(n, n*C);
    Map<MatrixXd> eigen_U(m_U.matrix, m_U.num_rows, m_U.num_cols);
    Map<MatrixXd> eigen_M(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);
    eigen_U=eigen_M.triangularView<Upper>().adjoint().solve(eigen_E);
}

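// update_deriv() caches U = (M')^{-1}*E, so that the per-class blocks satisfy
// E_c - U_c'*U_c = E_c - E_c*(sum_k E_k)^{-1}*E_c, the diagonal blocks of
// E - E*R*(R'*E*R)^{-1}*R'*E needed by get_derivative_helper() below.
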
float64_t CMultiLaplacianInferenceMethod::get_derivative_helper(SGMatrix<float64_t> dK)
{
    Map<MatrixXd> eigen_dK(dK.matrix, dK.num_rows, dK.num_cols);
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<MatrixXd> eigen_U(m_U.matrix, m_U.num_rows, m_U.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    float64_t result=0;
    //currently only the explicit term is computed
    for(index_t bl=0; bl<C; bl++)
    {
        result+=((eigen_E.block(0,bl*n,n,n)-eigen_U.block(0,bl*n,n,n).transpose()*eigen_U.block(0,bl*n,n,n)).array()
            *eigen_dK.array()).sum();
        result-=(eigen_dK*eigen_alpha.block(bl*n,0,n,1)).dot(eigen_alpha.block(bl*n,0,n,1));
    }

    return result/2.0;
}

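// get_derivative_helper() above returns the explicit part of the gradient of
// the negative log marginal likelihood wrt one kernel hyperparameter:
//   sum_c ( tr((E_c - U_c'*U_c)*dK) - alpha_c'*dK*alpha_c ) / 2.
// The implicit part, through the dependence of the mode f_hat on the
// hyperparameter, is not computed here, as the comment in the function notes.
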
SGVector<float64_t> CMultiLaplacianInferenceMethod::get_derivative_wrt_inference_method(
    const TParameter* param)
{
    REQUIRE(!strcmp(param->m_name, "log_scale"), "Can't compute derivative of "
        "the negative log marginal likelihood wrt %s.%s parameter\n",
        get_name(), param->m_name)

    SGVector<float64_t> result(1);

    // compute derivative of K wrt scale
    result[0]=get_derivative_helper(m_ktrtr);
    result[0]*=CMath::exp(m_log_scale*2.0)*2.0;

    return result;
}

SGVector<float64_t> CMultiLaplacianInferenceMethod::get_derivative_wrt_kernel(
    const TParameter* param)
{
    REQUIRE(param, "Param not set\n");
    SGVector<float64_t> result;
    int64_t len=const_cast<TParameter *>(param)->m_datatype.get_num_elements();
    result=SGVector<float64_t>(len);

    for (index_t i=0; i<result.vlen; i++)
    {
        SGMatrix<float64_t> dK;

        if (result.vlen==1)
            dK=m_kernel->get_parameter_gradient(param);
        else
            dK=m_kernel->get_parameter_gradient(param, i);

        result[i]=get_derivative_helper(dK);
        result[i]*=CMath::exp(m_log_scale*2.0);
    }

    return result;
}

SGVector<float64_t> CMultiLaplacianInferenceMethod::get_derivative_wrt_mean(
    const TParameter* param)
{
    // create eigen representation of alpha
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();

    REQUIRE(param, "Param not set\n");
    SGVector<float64_t> result;
    int64_t len=const_cast<TParameter *>(param)->m_datatype.get_num_elements();
    result=SGVector<float64_t>(len);

    for (index_t i=0; i<result.vlen; i++)
    {
        SGVector<float64_t> dmu;

        if (result.vlen==1)
            dmu=m_mean->get_parameter_derivative(m_features, param);
        else
            dmu=m_mean->get_parameter_derivative(m_features, param, i);

        Map<VectorXd> eigen_dmu(dmu.vector, dmu.vlen);

        result[i]=0;
        //currently only the explicit term is computed
        for(index_t bl=0; bl<C; bl++)
            result[i]-=eigen_alpha.block(bl*n,0,n,1).dot(eigen_dmu);
    }

    return result;
}

SGVector<float64_t> CMultiLaplacianInferenceMethod::get_posterior_mean()
{
    if (parameter_hash_changed())
        update();

    SGVector<float64_t> res(m_mu.vlen);
    Map<VectorXd> eigen_res(res.vector, res.vlen);
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();

    SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
    Map<VectorXd> eigen_mean_bl(mean.vector, mean.vlen);
    VectorXd eigen_mean=eigen_mean_bl.replicate(C,1);

    Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);
    eigen_res=eigen_mu-eigen_mean;

    return res;
}

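// At the converged mode, update_alpha() establishes
// m_mu = K*alpha*exp(2*log_scale) + mean, so the vector returned above is the
// zero-mean part of the posterior mean, K*alpha*exp(2*log_scale).
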
}

#endif /* HAVE_EIGEN3 */