SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
KLInferenceMethod.cpp
Go to the documentation of this file.
1  /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2014 Wu Lin
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  *
30  * This code specifically adapted from function in approxKL.m and infKL.m
31  */
32 
34 
35 #ifdef HAVE_EIGEN3
38 
39 using namespace Eigen;
40 
41 namespace shogun
42 {
43 
44 CKLInferenceMethod::CKLInferenceMethod() : CInferenceMethod()
45 {
46  init();
47 }
48 
50  CFeatures* feat, CMeanFunction* m, CLabels* lab, CLikelihoodModel* mod)
51  : CInferenceMethod(kern, feat, m, lab, mod)
52 {
53  init();
55 }
56 
58 {
59  REQUIRE(mod, "the likelihood model must not be NULL\n")
61  REQUIRE(lik,
62  "The provided likelihood model (%s) must support variational Gaussian inference. ",
63  "Please use a Variational Gaussian Likelihood model\n",
64  mod->get_name());
65 }
66 
68 {
71 }
72 
73 void CKLInferenceMethod::init()
74 {
75  m_noise_factor=1e-10;
76  m_max_attempt=0;
77  m_exp_factor=2;
78  m_min_coeff_kernel=1e-5;
79  SG_ADD(&m_noise_factor, "noise_factor",
80  "The noise factor used for correcting Kernel matrix",
82  SG_ADD(&m_exp_factor, "exp_factor",
83  "The exponential factor used for increasing noise_factor",
85  SG_ADD(&m_max_attempt, "max_attempt",
86  "The max number of attempt to correct Kernel matrix",
88  SG_ADD(&m_min_coeff_kernel, "min_coeff_kernel",
89  "The minimum coeefficient of kernel matrix in LDLT factorization used to check whether the kernel matrix is positive definite or not",
91 
93  SG_ADD(&m_m, "m",
94  "The number of corrections to approximate the inverse Hessian matrix",
96  SG_ADD(&m_max_linesearch, "max_linesearch",
97  "The maximum number of trials to do line search for each L-BFGS update",
99  SG_ADD(&m_linesearch, "linesearch",
100  "The line search algorithm",
102  SG_ADD(&m_max_iterations, "max_iterations",
103  "The maximum number of iterations for L-BFGS update",
105  SG_ADD(&m_delta, "delta",
106  "Delta for convergence test based on the change of function value",
108  SG_ADD(&m_past, "past",
109  "Distance for delta-based convergence test",
111  SG_ADD(&m_epsilon, "epsilon",
112  "Epsilon for convergence test based on the change of gradient",
114  SG_ADD(&m_min_step, "min_step",
115  "The minimum step of the line search",
117  SG_ADD(&m_max_step, "max_step",
118  "The maximum step of the line search",
120  SG_ADD(&m_ftol, "ftol",
121  "A parameter used in Armijo condition",
123  SG_ADD(&m_wolfe, "wolfe",
124  "A parameter used in curvature condition",
126  SG_ADD(&m_gtol, "gtol",
127  "A parameter used in Morethuente linesearch to control the accuracy",
129  SG_ADD(&m_xtol, "xtol",
130  "The machine precision for floating-point values",
132  SG_ADD(&m_orthantwise_c, "orthantwise_c",
133  "Coeefficient for the L1 norm of variables",
135  SG_ADD(&m_orthantwise_start, "orthantwise_start",
136  "Start index for computing L1 norm of the variables",
138  SG_ADD(&m_orthantwise_end, "orthantwise_end",
139  "End index for computing L1 norm of the variables",
141  SG_ADD(&m_s2, "s2",
142  "Variational parameter sigma2",
144  SG_ADD(&m_mu, "mu",
145  "Variational parameter mu and posterior mean",
147  SG_ADD(&m_Sigma, "Sigma",
148  "Posterior covariance matrix Sigma",
150 }
151 
153 {
154 }
155 
157 {
159 
160  if (!m_gradient_update)
161  {
163  update_deriv();
164  m_gradient_update=true;
166  }
167 }
168 
170 {
171  SG_DEBUG("entering\n");
172 
174  update_init();
175  update_alpha();
176  update_chol();
177  m_gradient_update=false;
179 
180  SG_DEBUG("leaving\n");
181 }
182 
184 {
185  REQUIRE(noise_factor>=0, "The noise_factor %.20f should be non-negative\n", noise_factor);
186  m_noise_factor=noise_factor;
187 }
188 
190 {
191  REQUIRE(min_coeff_kernel>=0, "The min_coeff_kernel %.20f should be non-negative\n", min_coeff_kernel);
192  m_min_coeff_kernel=min_coeff_kernel;
193 }
194 
196 {
197  REQUIRE(max_attempt>=0, "The max_attempt %d should be non-negative. 0 means inifity attempts\n", max_attempt);
198  m_max_attempt=max_attempt;
199 }
200 
202 {
203  REQUIRE(exp_factor>1.0, "The exp_factor %f should be greater than 1.0.\n", exp_factor);
204  m_exp_factor=exp_factor;
205 }
206 
208 {
210 }
211 
213 {
215 
216  eigen_K=eigen_K+m_noise_factor*MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols);
217 
219  ldlt.compute(eigen_K*CMath::exp(m_log_scale*2.0));
220 
221  float64_t attempt_count=0;
222  MatrixXd Kernel_D=ldlt.vectorD();
223  float64_t noise_factor=m_noise_factor;
224 
225  while (Kernel_D.minCoeff()<=m_min_coeff_kernel)
226  {
227  if (m_max_attempt>0 && attempt_count>m_max_attempt)
228  SG_ERROR("The Kernel matrix is highly non-positive definite since the min_coeff_kernel is less than %.20f",
229  " even when adding %.20f noise to the diagonal elements at max %d attempts\n",
230  m_min_coeff_kernel, noise_factor, m_max_attempt);
231  attempt_count++;
232  float64_t pre_noise_factor=noise_factor;
233  noise_factor*=m_exp_factor;
234  //updat the noise eigen_K=eigen_K+noise_factor*(m_exp_factor^attempt_count)*Identity()
235  eigen_K=eigen_K+(noise_factor-pre_noise_factor)*MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols);
236  ldlt.compute(eigen_K*CMath::exp(m_log_scale*2.0));
237  Kernel_D=ldlt.vectorD();
238  }
239 
240  return ldlt;
241 }
242 
243 
245 {
247 
248  return SGVector<float64_t>(m_mu);
249 }
250 
252 {
254 
256 }
257 
258 float64_t CKLInferenceMethod::evaluate(void *obj, const float64_t *parameters,
259  float64_t *gradient, const int dim, const float64_t step)
260 {
261  /* Note that parameters = parameters_pre_iter - step * gradient_pre_iter */
262  CKLInferenceMethod * obj_prt
263  = static_cast<CKLInferenceMethod *>(obj);
264 
265  REQUIRE(obj_prt, "The instance object passed to L-BFGS optimizer should not be NULL\n");
266 
267  bool status = obj_prt->lbfgs_precompute();
268  if (!status)
269  return CMath::NOT_A_NUMBER;
270 
271  float64_t nlml=obj_prt->get_nlml_wrt_parameters();
272 
273  SGVector<float64_t> sg_gradient(gradient, dim, false);
274  obj_prt->get_gradient_of_nlml_wrt_parameters(sg_gradient);
275 
276  return nlml;
277 }
278 
280 {
283  return lik;
284 }
285 
287 {
291 }
292 
294  int m,
295  int max_linesearch,
296  int linesearch,
297  int max_iterations,
299  int past,
301  float64_t min_step,
302  float64_t max_step,
303  float64_t ftol,
304  float64_t wolfe,
305  float64_t gtol,
306  float64_t xtol,
307  float64_t orthantwise_c,
308  int orthantwise_start,
309  int orthantwise_end)
310 {
311  m_m = m;
312  m_max_linesearch = max_linesearch;
313  m_linesearch = linesearch;
314  m_max_iterations = max_iterations;
315  m_delta = delta;
316  m_past = past;
317  m_epsilon = epsilon;
318  m_min_step = min_step;
319  m_max_step = max_step;
320  m_ftol = ftol;
321  m_wolfe = wolfe;
322  m_gtol = gtol;
323  m_xtol = xtol;
324  m_orthantwise_c = orthantwise_c;
325  m_orthantwise_start = orthantwise_start;
326  m_orthantwise_end = orthantwise_end;
327 }
328 
330 {
332  update();
333 
335 }
336 
338 {
341  return SGVector<float64_t> ();
342 
343  //%lp_dhyp = likKL(v,lik,hyp.lik,y,K*post.alpha+m,[],[],j);
345  Map<VectorXd> eigen_lp_dhyp(lp_dhyp.vector, lp_dhyp.vlen);
346  SGVector<float64_t> result(1);
347  //%dnlZ.lik(j) = -sum(lp_dhyp);
348  result[0]=-eigen_lp_dhyp.sum();
349 
350  return result;
351 }
352 
354 {
355  // create eigen representation of K, Z, dfhat and alpha
356  Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen/2);
357 
358  REQUIRE(param, "Param not set\n");
359  SGVector<float64_t> result;
360  int64_t len=const_cast<TParameter *>(param)->m_datatype.get_num_elements();
361  result=SGVector<float64_t>(len);
362 
363  for (index_t i=0; i<result.vlen; i++)
364  {
366 
367  //%dm_t = feval(mean{:}, hyp.mean, x, j);
368  if (result.vlen==1)
370  else
372 
373  Map<VectorXd> eigen_dmu(dmu.vector, dmu.vlen);
374 
375  //%dnlZ.mean(j) = -alpha'*dm_t;
376  result[i]=-eigen_alpha.dot(eigen_dmu);
377  }
378 
379  return result;
380 }
381 
383 {
384  lbfgs_parameter_t lbfgs_param;
385  lbfgs_param.m = m_m;
386  lbfgs_param.max_linesearch = m_max_linesearch;
387  lbfgs_param.linesearch = m_linesearch;
388  lbfgs_param.max_iterations = m_max_iterations;
389  lbfgs_param.delta = m_delta;
390  lbfgs_param.past = m_past;
391  lbfgs_param.epsilon = m_epsilon;
392  lbfgs_param.min_step = m_min_step;
393  lbfgs_param.max_step = m_max_step;
394  lbfgs_param.ftol = m_ftol;
395  lbfgs_param.wolfe = m_wolfe;
396  lbfgs_param.gtol = m_gtol;
397  lbfgs_param.xtol = m_xtol;
398  lbfgs_param.orthantwise_c = m_orthantwise_c;
400  lbfgs_param.orthantwise_end = m_orthantwise_end;
401 
402  float64_t nlml_opt=0;
403  void * obj_prt = static_cast<void *>(this);
404  lbfgs(m_alpha.vlen, m_alpha.vector, &nlml_opt,
405  CKLInferenceMethod::evaluate,
406  NULL, obj_prt, &lbfgs_param);
407 
408  return nlml_opt;
409 }
410 
412 {
413  REQUIRE(!strcmp(param->m_name, "log_scale"), "Can't compute derivative of "
414  "the nagative log marginal likelihood wrt %s.%s parameter\n",
415  get_name(), param->m_name)
416 
418 
419  SGVector<float64_t> result(1);
420 
422  result[0]*=CMath::exp(m_log_scale*2.0)*2.0;
423 
424  return result;
425 }
426 
428 {
429  REQUIRE(param, "Param not set\n");
430  SGVector<float64_t> result;
431  int64_t len=const_cast<TParameter *>(param)->m_datatype.get_num_elements();
432  result=SGVector<float64_t>(len);
433 
434  for (index_t i=0; i<result.vlen; i++)
435  {
437 
438  //dK = feval(covfunc{:},hyper,x,j);
439  if (result.vlen==1)
440  dK=m_kernel->get_parameter_gradient(param);
441  else
442  dK=m_kernel->get_parameter_gradient(param, i);
443 
444  result[i]=get_derivative_related_cov(dK);
445  result[i]*=CMath::exp(m_log_scale*2.0);
446  }
447 
448  return result;
449 }
450 
452 {
454  update();
455 
456  return SGMatrix<float64_t>(m_L);
457 }
458 
459 } /* namespace shogun */
460 
461 #endif /* HAVE_EIGEN3 */
virtual void set_lbfgs_parameters(int m=100, int max_linesearch=1000, int linesearch=LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE, int max_iterations=1000, float64_t delta=0.0, int past=0, float64_t epsilon=1e-5, float64_t min_step=1e-20, float64_t max_step=1e+20, float64_t ftol=1e-4, float64_t wolfe=0.9, float64_t gtol=0.9, float64_t xtol=1e-16, float64_t orthantwise_c=0.0, int orthantwise_start=0, int orthantwise_end=1)
virtual const char * get_name() const =0
virtual void set_model(CLikelihoodModel *mod)
virtual bool set_variational_distribution(SGVector< float64_t > mu, SGVector< float64_t > s2, const CLabels *lab)
virtual void update_parameter_hash()
Definition: SGObject.cpp:250
virtual SGVector< float64_t > get_derivative_wrt_likelihood_model(const TParameter *param)
virtual void update_alpha()=0
SGVector< float64_t > m_alpha
virtual SGMatrix< float64_t > get_cholesky()
virtual SGVector< float64_t > get_first_derivative_wrt_hyperparameter(const TParameter *param) const =0
int32_t lbfgs(int32_t n, float64_t *x, float64_t *ptr_fx, lbfgs_evaluate_t proc_evaluate, lbfgs_progress_t proc_progress, void *instance, lbfgs_parameter_t *_param, lbfgs_adjust_step_t proc_adjust_step)
Definition: lbfgs.cpp:208
The Inference Method base class.
virtual void set_exp_factor(float64_t exp_factor)
int32_t index_t
Definition: common.h:62
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
The variational Gaussian Likelihood base class. The variational distribution is Gaussian.
float64_t orthantwise_c
Definition: lbfgs.h:311
Definition: SGMatrix.h:20
parameter struct
virtual void update_approx_cov()=0
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
index_t num_cols
Definition: SGMatrix.h:378
virtual SGVector< float64_t > get_derivative_wrt_mean(const TParameter *param)
virtual bool lbfgs_precompute()=0
An abstract class of the mean function.
Definition: MeanFunction.h:49
virtual const char * get_name() const
index_t num_rows
Definition: SGMatrix.h:376
static const float64_t epsilon
Definition: libbmrm.cpp:25
SGMatrix< float64_t > m_Sigma
index_t vlen
Definition: SGVector.h:494
SGMatrix< float64_t > m_L
virtual float64_t get_negative_log_marginal_likelihood()
virtual void set_min_coeff_kernel(float64_t min_coeff_kernel)
virtual void check_variational_likelihood(CLikelihoodModel *mod) const
virtual float64_t lbfgs_optimization()
double float64_t
Definition: common.h:50
virtual SGVector< float64_t > get_derivative_wrt_inference_method(const TParameter *param)
virtual void set_max_attempt(index_t max_attempt)
virtual SGVector< float64_t > get_posterior_mean()
virtual float64_t get_derivative_related_cov(SGMatrix< float64_t > dK)=0
virtual SGMatrix< float64_t > get_posterior_covariance()
virtual Eigen::LDLT< Eigen::MatrixXd, 0x1 > update_init_helper()
The KL approximation inference method class.
virtual float64_t get_nlml_wrt_parameters()
virtual SGVector< float64_t > get_parameter_derivative(const CFeatures *features, const TParameter *param, index_t index=-1)
Definition: MeanFunction.h:73
#define SG_DEBUG(...)
Definition: SGIO.h:107
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual void set_noise_factor(float64_t noise_factor)
virtual SGVector< float64_t > get_derivative_wrt_kernel(const TParameter *param)
The class Features is the base class of all feature objects.
Definition: Features.h:68
virtual void update_chol()=0
static float64_t exp(float64_t x)
Definition: Math.h:621
virtual SGMatrix< float64_t > get_parameter_gradient(const TParameter *param, index_t index=-1)
Definition: Kernel.h:850
The Variational Likelihood base class.
SGVector< float64_t > m_mu
SGVector< float64_t > m_s2
The Kernel base class.
Definition: Kernel.h:158
virtual void get_gradient_of_nlml_wrt_parameters(SGVector< float64_t > gradient)=0
virtual CVariationalGaussianLikelihood * get_variational_likelihood() const
#define SG_ADD(...)
Definition: SGObject.h:81
virtual bool supports_derivative_wrt_hyperparameter() const =0
virtual void update_deriv()=0
virtual void set_model(CLikelihoodModel *mod)
#define delta
Definition: sfa.cpp:23
virtual bool parameter_hash_changed()
Definition: SGObject.cpp:264
The Likelihood model base class.
SGMatrix< float64_t > m_ktrtr
virtual float64_t get_negative_log_marginal_likelihood_helper()=0
static const float64_t NOT_A_NUMBER
not a number
Definition: Math.h:2046
CLikelihoodModel * m_model

SHOGUN Machine Learning Toolbox - Documentation