SHOGUN 4.1.0
SingleFITCLaplacianInferenceMethod.cpp
/*
 * Copyright (c) The Shogun Machine Learning Toolbox
 * Written (W) 2015 Wu Lin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are those
 * of the authors and should not be interpreted as representing official policies,
 * either expressed or implied, of the Shogun Development Team.
 *
 */
#include <shogun/machine/gp/SingleFITCLaplacianInferenceMethod.h>

#ifdef HAVE_EIGEN3

#include <shogun/machine/gp/StudentsTLikelihood.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/external/brent.h>
#include <shogun/mathematics/eigen3.h>

using namespace shogun;
using namespace Eigen;

namespace shogun
{

#ifndef DOXYGEN_SHOULD_SKIP_THIS

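/** Wrapper class used for the Brent minimizer: evaluates the line-search
 * objective psi(x)=alpha'*(f-m)/2-sum(log p(y|f)) at step size x along the
 * Newton direction dalpha, recomputing f=K*(alpha+x*dalpha)+m for each trial step.
 */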
class CFITCPsiLine : public func_base
{
public:
	float64_t log_scale;
	VectorXd dalpha;
	VectorXd start_alpha;
	SGVector<float64_t>* alpha;
	SGVector<float64_t>* dlp;
	SGVector<float64_t>* W;
	SGVector<float64_t>* f;
	SGVector<float64_t>* m;
	CLikelihoodModel* lik;
	CLabels* lab;
	CSingleFITCLaplacianInferenceMethod* inf;

	virtual double operator() (double x)
	{
		//time complexity O(m*n)
		Map<VectorXd> eigen_f(f->vector, f->vlen);
		Map<VectorXd> eigen_m(m->vector, m->vlen);
		Map<VectorXd> eigen_alpha(alpha->vector, alpha->vlen);

		//alpha = alpha + s*dalpha;
		eigen_alpha=start_alpha+x*dalpha;
		SGVector<float64_t> tmp=inf->compute_mvmK(*alpha);
		Map<VectorXd> eigen_tmp(tmp.vector, tmp.vlen);
		//f = mvmK(alpha,V,d0)+m;
		eigen_f=eigen_tmp+eigen_m;

		// get first and second derivatives of log likelihood
		(*dlp)=lik->get_log_probability_derivative_f(lab, (*f), 1);

		(*W)=lik->get_log_probability_derivative_f(lab, (*f), 2);
		W->scale(-1.0);

		// compute psi=alpha'*(f-m)/2-lp
		float64_t result=eigen_alpha.dot(eigen_f-eigen_m)/2.0-
			SGVector<float64_t>::sum(lik->get_log_probability_f(lab, *f));

		return result;
	}
};

#endif /* DOXYGEN_SHOULD_SKIP_THIS */

CSingleFITCLaplacianInferenceMethod::CSingleFITCLaplacianInferenceMethod()
	: CSingleFITCLaplacianBase()
{
	init();
}

CSingleFITCLaplacianInferenceMethod::CSingleFITCLaplacianInferenceMethod(CKernel* kern, CFeatures* feat,
	CMeanFunction* m, CLabels* lab, CLikelihoodModel* mod, CFeatures* lat)
	: CSingleFITCLaplacianBase(kern, feat, m, lab, mod, lat)
{
	init();
}

void CSingleFITCLaplacianInferenceMethod::init()
{
	m_iter=20;
	m_tolerance=1e-6;
	m_opt_tolerance=1e-10;
	m_opt_max=10;
	m_Psi=0;
	m_Wneg=false;

	SG_ADD(&m_dlp, "dlp", "derivative of log likelihood with respect to function location", MS_NOT_AVAILABLE);
	SG_ADD(&m_W, "W", "the noise matrix", MS_NOT_AVAILABLE);
	SG_ADD(&m_tolerance, "tolerance", "amount of tolerance for Newton's iterations", MS_NOT_AVAILABLE);
	SG_ADD(&m_iter, "iter", "max Newton's iterations", MS_NOT_AVAILABLE);
	SG_ADD(&m_opt_tolerance, "opt_tolerance", "amount of tolerance for Brent's minimization method", MS_NOT_AVAILABLE);
	SG_ADD(&m_opt_max, "opt_max", "max iterations for Brent's minimization method", MS_NOT_AVAILABLE);
	SG_ADD(&m_sW, "sW", "square root of W", MS_NOT_AVAILABLE);
	SG_ADD(&m_d2lp, "d2lp", "second derivative of log likelihood with respect to function location", MS_NOT_AVAILABLE);
	SG_ADD(&m_d3lp, "d3lp", "third derivative of log likelihood with respect to function location", MS_NOT_AVAILABLE);
	SG_ADD(&m_chol_R0, "chol_R0", "Cholesky of inverse covariance of inducing features", MS_NOT_AVAILABLE);
	SG_ADD(&m_dfhat, "dfhat", "derivative of negative log (approximated) marginal likelihood wrt f", MS_NOT_AVAILABLE);
	SG_ADD(&m_g, "g", "variable g defined in infFITC_Laplace.m", MS_NOT_AVAILABLE);
	SG_ADD(&m_dg, "dg", "variable d0 defined in infFITC_Laplace.m", MS_NOT_AVAILABLE);
	SG_ADD(&m_Psi, "Psi", "the negative log likelihood without constant terms used in Newton's method", MS_NOT_AVAILABLE);
	SG_ADD(&m_Wneg, "Wneg", "whether W contains negative elements", MS_NOT_AVAILABLE);
}

void CSingleFITCLaplacianInferenceMethod::compute_gradient()
{
	CInferenceMethod::compute_gradient();

	if (!m_gradient_update)
	{
		update_approx_cov();
		update_deriv();
		m_gradient_update=true;
		update_parameter_hash();
	}
}

void CSingleFITCLaplacianInferenceMethod::update()
{
	SG_DEBUG("entering\n");

	CInferenceMethod::update();
	update_init();
	update_alpha();
	update_chol();
	m_gradient_update=false;
	update_parameter_hash();

	SG_DEBUG("leaving\n");
}

SGVector<float64_t> CSingleFITCLaplacianInferenceMethod::get_diagonal_vector()
{
	if (parameter_hash_changed())
		update();

	return SGVector<float64_t>(m_sW);
}

CSingleFITCLaplacianInferenceMethod::~CSingleFITCLaplacianInferenceMethod()
{
}

SGVector<float64_t> CSingleFITCLaplacianInferenceMethod::compute_mvmZ(SGVector<float64_t> x)
{
	//time complexity O(m*n)
	Map<MatrixXd> eigen_Rvdd(m_Rvdd.matrix, m_Rvdd.num_rows, m_Rvdd.num_cols);
	Map<VectorXd> eigen_t(m_t.vector, m_t.vlen);
	Map<VectorXd> eigen_x(x.vector, x.vlen);

	SGVector<float64_t> res(x.vlen);
	Map<VectorXd> eigen_res(res.vector, res.vlen);

	//Zx = t.*x - RVdd'*(RVdd*x);
	eigen_res=eigen_x.cwiseProduct(eigen_t)-eigen_Rvdd.transpose()*(eigen_Rvdd*eigen_x);
	return res;
}

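/* compute_mvmK below multiplies the FITC prior covariance against a vector,
 * K*al = V'*(V*al) + d0.*al, in O(m*n) time by exploiting the
 * low-rank-plus-diagonal structure instead of forming the n-by-n kernel matrix.
 */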
SGVector<float64_t> CSingleFITCLaplacianInferenceMethod::compute_mvmK(SGVector<float64_t> al)
{
	//time complexity O(m*n)
	Map<MatrixXd> eigen_V(m_V.matrix, m_V.num_rows, m_V.num_cols);
	Map<VectorXd> eigen_dg(m_dg.vector, m_dg.vlen);
	Map<VectorXd> eigen_al(al.vector, al.vlen);

	SGVector<float64_t> res(al.vlen);
	Map<VectorXd> eigen_res(res.vector, res.vlen);

	//Kal = V'*(V*al) + d0.*al;
	eigen_res=eigen_V.transpose()*(eigen_V*eigen_al)+eigen_dg.cwiseProduct(eigen_al);
	return res;
}

CSingleFITCLaplacianInferenceMethod* CSingleFITCLaplacianInferenceMethod::obtain_from_generic(
	CInferenceMethod* inference)
{
	REQUIRE(inference!=NULL, "Inference should not be NULL");

	if (inference->get_inference_type()!=INF_FITC_LAPLACIAN_SINGLE)
		SG_SERROR("Provided inference is not of type CSingleFITCLaplacianInferenceMethod!\n")

	SG_REF(inference);
	return (CSingleFITCLaplacianInferenceMethod*)inference;
}

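/* get_chol_inv mirrors chol_inv in infFITC_Laplace.m: for a symmetric
 * positive-definite A it returns an upper-triangular R with R'*R = inv(A),
 * obtained by Cholesky-factorizing the 180-degree rotation of A and
 * back-solving against the identity, so inv(A) is never formed explicitly.
 */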
SGMatrix<float64_t> CSingleFITCLaplacianInferenceMethod::get_chol_inv(SGMatrix<float64_t> mtx)
{
	//time complexity O(m^3), where mtx is an m-by-m matrix
	REQUIRE(mtx.num_rows==mtx.num_cols, "Matrix must be square\n");

	Map<MatrixXd> eigen_mtx(mtx.matrix, mtx.num_rows, mtx.num_cols);
	LLT<MatrixXd> chol(eigen_mtx.colwise().reverse().rowwise().reverse().matrix());
	//tmp=chol(rot180(A))'
	MatrixXd tmp=chol.matrixL();
	SGMatrix<float64_t> res(mtx.num_rows, mtx.num_cols);
	Map<MatrixXd> eigen_res(res.matrix, res.num_rows, res.num_cols);
	//chol_inv = @(A) rot180(chol(rot180(A))')\eye(nu); % chol(inv(A))
	eigen_res=tmp.colwise().reverse().rowwise().reverse().matrix().triangularView<Upper>(
		).solve(MatrixXd::Identity(mtx.num_rows, mtx.num_cols));
	return res;
}

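/* The quantity assembled below is the FITC-Laplace approximation to the
 * negative log marginal likelihood,
 * nlZ = alpha'*(f-m)/2 - sum(log p(y|f)) + sum(log(diag(chol(A)))) + sum(log(1+d0.*W))/2,
 * with the log-determinant split between the m-by-m factor A and the
 * diagonal part of the FITC prior.
 */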
float64_t CSingleFITCLaplacianInferenceMethod::get_negative_log_marginal_likelihood()
{
	if (parameter_hash_changed())
		update();

	if (m_Wneg)
	{
		SG_WARNING("nlZ cannot be computed since W is too negative");
		//nlZ = NaN;
		return CMath::INFTY;
	}
	//time complexity O(m^2*n)
	Map<VectorXd> eigen_alpha(m_al.vector, m_al.vlen);
	Map<VectorXd> eigen_mu(m_mu.vector, m_mu.vlen);
	SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
	Map<VectorXd> eigen_mean(mean.vector, mean.vlen);
	// get log likelihood
	float64_t lp=SGVector<float64_t>::sum(m_model->get_log_probability_f(m_labels,
		m_mu));

	Map<MatrixXd> eigen_V(m_V.matrix, m_V.num_rows, m_V.num_cols);
	Map<VectorXd> eigen_t(m_t.vector, m_t.vlen);
	MatrixXd A=eigen_V*eigen_t.asDiagonal()*eigen_V.transpose()+MatrixXd::Identity(m_V.num_rows,m_V.num_rows);
	LLT<MatrixXd> chol(A);
	A=chol.matrixU();

	Map<VectorXd> eigen_dg(m_dg.vector, m_dg.vlen);
	Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);

	//nlZ = alpha'*(f-m)/2 - sum(lp) - sum(log(dd))/2 + sum(log(diag(chol(A))));
	float64_t result=eigen_alpha.dot(eigen_mu-eigen_mean)/2.0-lp+
		A.diagonal().array().log().sum()+(eigen_W.cwiseProduct(eigen_dg)+MatrixXd::Ones(eigen_dg.rows(),1)).array().log().sum()/2.0;

	return result;
}

void CSingleFITCLaplacianInferenceMethod::update_approx_cov()
{
}

void CSingleFITCLaplacianInferenceMethod::update_init()
{
	//time complexity O(m^2*n)
	//m-by-m matrix
	Map<MatrixXd> eigen_kuu(m_kuu.matrix, m_kuu.num_rows, m_kuu.num_cols);
	//m-by-n matrix
	Map<MatrixXd> eigen_ktru(m_ktru.matrix, m_ktru.num_rows, m_ktru.num_cols);

	Map<VectorXd> eigen_ktrtr_diag(m_ktrtr_diag.vector, m_ktrtr_diag.vlen);

	SGMatrix<float64_t> cor_kuu(m_kuu.num_rows, m_kuu.num_cols);
	Map<MatrixXd> eigen_cor_kuu(cor_kuu.matrix, cor_kuu.num_rows, cor_kuu.num_cols);
	eigen_cor_kuu=eigen_kuu*CMath::exp(m_log_scale*2.0)+CMath::exp(m_log_ind_noise)*MatrixXd::Identity(
		m_kuu.num_rows, m_kuu.num_cols);
	//R0 = chol_inv(Kuu+snu2*eye(nu)); m-by-m matrix
	m_chol_R0=get_chol_inv(cor_kuu);
	Map<MatrixXd> eigen_R0(m_chol_R0.matrix, m_chol_R0.num_rows, m_chol_R0.num_cols);

	//V = R0*Ku; m-by-n matrix
	m_V=SGMatrix<float64_t>(m_chol_R0.num_cols, m_ktru.num_cols);
	Map<MatrixXd> eigen_V(m_V.matrix, m_V.num_rows, m_V.num_cols);

	eigen_V=eigen_R0*(eigen_ktru*CMath::exp(m_log_scale*2.0));
	m_dg=SGVector<float64_t>(m_ktrtr_diag.vlen);
	Map<VectorXd> eigen_dg(m_dg.vector, m_dg.vlen);
	//d0 = diagK-sum(V.*V,1)';
	eigen_dg=eigen_ktrtr_diag*CMath::exp(m_log_scale*2.0)-(eigen_V.cwiseProduct(eigen_V)).colwise().sum().adjoint();

	// get mean vector and create eigen representation of it
	SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
	Map<VectorXd> eigen_mean(mean.vector, mean.vlen);

	// create shogun and eigen representation of function vector
	m_mu=SGVector<float64_t>(mean.vlen);
	Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);

	float64_t Psi_New;
	float64_t Psi_Def;
	if (m_al.vlen!=m_labels->get_num_labels())
	{
		// set alpha to a zero vector
		m_al=SGVector<float64_t>(m_labels->get_num_labels());
		m_al.zero();

		// f = mean, if lengths of alpha and y don't match
		eigen_mu=eigen_mean;

		Psi_New=-SGVector<float64_t>::sum(m_model->get_log_probability_f(
			m_labels, m_mu));
	}
	else
	{
		Map<VectorXd> eigen_alpha(m_al.vector, m_al.vlen);

		// compute f = K * alpha + m
		SGVector<float64_t> tmp=compute_mvmK(m_al);
		Map<VectorXd> eigen_tmp(tmp.vector, tmp.vlen);
		eigen_mu=eigen_tmp+eigen_mean;

		Psi_New=eigen_alpha.dot(eigen_tmp)/2.0-
			SGVector<float64_t>::sum(m_model->get_log_probability_f(m_labels, m_mu));

		Psi_Def=-SGVector<float64_t>::sum(m_model->get_log_probability_f(m_labels, mean));

		// if default is better, then use it
		if (Psi_Def < Psi_New)
		{
			m_al.zero();
			eigen_mu=eigen_mean;
			Psi_New=Psi_Def;
		}
	}
	m_Psi=Psi_New;
}

void CSingleFITCLaplacianInferenceMethod::update_alpha()
{
	//time complexity O(m^2*n)
	Map<MatrixXd> eigen_R0(m_chol_R0.matrix, m_chol_R0.num_rows, m_chol_R0.num_cols);
	Map<MatrixXd> eigen_V(m_V.matrix, m_V.num_rows, m_V.num_cols);
	Map<VectorXd> eigen_dg(m_dg.vector, m_dg.vlen);
	Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);

	SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
	Map<VectorXd> eigen_mean(mean.vector, mean.vlen);

	float64_t Psi_Old=CMath::INFTY;
	float64_t Psi_New=m_Psi;

	// compute W = -d2lp
	m_W=m_model->get_log_probability_derivative_f(m_labels, m_mu, 2);
	m_W.scale(-1.0);

	//n-by-1 vector
	Map<VectorXd> eigen_al(m_al.vector, m_al.vlen);

	// get first derivative of log probability function
	m_dlp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 1);

	index_t iter=0;

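	/* Newton iteration: with W=-d2lp and b=W.*(f-m)+dlp, the Newton direction
	 * dalpha is obtained through the matrix inversion lemma using only the
	 * m-by-m factor chol_inv(eye(nu)+V*diag(W.*dd)*V'), and Brent's method
	 * then searches for the step size along dalpha that minimizes psi.
	 */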
	m_Wneg=false;
	while (Psi_Old-Psi_New>m_tolerance && iter<m_iter)
	{
		//time complexity O(m^2*n)
		Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
		Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);

		Psi_Old = Psi_New;
		iter++;

		if (eigen_W.minCoeff() < 0)
		{
			// Suggested by Vanhatalo et al.,
			// Gaussian Process Regression with Student's t likelihood, NIPS 2009
			// Quoted from infFITC_Laplace.m
			float64_t df;

			if (m_model->get_model_type()==LT_STUDENTST)
			{
				CStudentsTLikelihood* lik=CStudentsTLikelihood::obtain_from_generic(m_model);
				df=lik->get_degrees_freedom();
				SG_UNREF(lik);
			}
			else
				df=1;
			eigen_W+=(2.0/(df+1))*eigen_dlp.cwiseProduct(eigen_dlp);
		}

		//b = W.*(f-m) + dlp;
		VectorXd b=eigen_W.cwiseProduct(eigen_mu-eigen_mean)+eigen_dlp;

		//dd = 1./(1+W.*d0);
		VectorXd dd=MatrixXd::Ones(b.rows(),1).cwiseQuotient(eigen_W.cwiseProduct(eigen_dg)+MatrixXd::Ones(b.rows(),1));

		VectorXd eigen_t=eigen_W.cwiseProduct(dd);
		//m-by-m matrix
		SGMatrix<float64_t> tmp(m_V.num_rows, m_V.num_rows);
		Map<MatrixXd> eigen_tmp(tmp.matrix, tmp.num_rows, tmp.num_cols);
		//eye(nu)+(V.*repmat((W.*dd)',nu,1))*V'
		eigen_tmp=eigen_V*eigen_t.asDiagonal()*eigen_V.transpose()+MatrixXd::Identity(tmp.num_rows,tmp.num_rows);
		tmp=get_chol_inv(tmp);
		//chol_inv(eye(nu)+(V.*repmat((W.*dd)',nu,1))*V')
		Map<MatrixXd> eigen_tmp2(tmp.matrix, tmp.num_rows, tmp.num_cols);
		//RV = chol_inv(eye(nu)+(V.*repmat((W.*dd)',nu,1))*V')*V;
		// m-by-n matrix
		MatrixXd eigen_RV=eigen_tmp2*eigen_V;
		//dalpha = dd.*b - (W.*dd).*(RV'*(RV*(dd.*b))) - alpha; % Newt dir + line search
		VectorXd dalpha=dd.cwiseProduct(b)-eigen_t.cwiseProduct(eigen_RV.transpose()*(eigen_RV*(dd.cwiseProduct(b))))-eigen_al;

		//perform Brent's optimization
		CFITCPsiLine func;

		func.log_scale=m_log_scale;
		func.dalpha=dalpha;
		func.start_alpha=eigen_al;
		func.alpha=&m_al;
		func.dlp=&m_dlp;
		func.f=&m_mu;
		func.m=&mean;
		func.W=&m_W;
		func.lik=m_model;
		func.lab=m_labels;
		func.inf=this;

		float64_t x;
		Psi_New=local_min(0, m_opt_max, m_opt_tolerance, func, x);
	}

	if (Psi_Old-Psi_New>m_tolerance && iter>=m_iter)
	{
		SG_WARNING("Max iterations (%d) reached, but convergence level (%f) is not yet below tolerance (%f)\n", m_iter, Psi_Old-Psi_New, m_tolerance);
	}

	// compute f = K * alpha + m
	SGVector<float64_t> tmp=compute_mvmK(m_al);
	Map<VectorXd> eigen_tmp(tmp.vector, tmp.vlen);
	eigen_mu=eigen_tmp+eigen_mean;

	m_alpha=SGVector<float64_t>(m_chol_R0.num_cols);
	Map<VectorXd> eigen_post_alpha(m_alpha.vector, m_alpha.vlen);
	//post.alpha = R0'*(V*alpha);
	//m-by-1 vector
	eigen_post_alpha=eigen_R0.transpose()*(eigen_V*eigen_al);
}

void CSingleFITCLaplacianInferenceMethod::update_chol()
{
	//time complexity O(m^2*n)
	Map<VectorXd> eigen_dg(m_dg.vector, m_dg.vlen);
	Map<MatrixXd> eigen_R0(m_chol_R0.matrix, m_chol_R0.num_rows, m_chol_R0.num_cols);
	Map<MatrixXd> eigen_V(m_V.matrix, m_V.num_rows, m_V.num_cols);

	// get log probability derivatives
	m_dlp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 1);
	m_d2lp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 2);
	m_d3lp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 3);

	// W = -d2lp
	m_W=m_d2lp.clone();
	m_W.scale(-1.0);

	Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
	m_sW=SGVector<float64_t>(m_W.vlen);
	Map<VectorXd> eigen_sW(m_sW.vector, m_sW.vlen);

	VectorXd Wd0_1=eigen_W.cwiseProduct(eigen_dg)+MatrixXd::Ones(eigen_W.rows(),1);

	// compute sW
	// post.sW = sqrt(abs(W)).*sign(W); % preserve sign in case of negative
	if (eigen_W.minCoeff()>0)
	{
		eigen_sW=eigen_W.cwiseSqrt();
	}
	else
	{
		eigen_sW=((eigen_W.array().abs()+eigen_W.array())/2).sqrt()-((eigen_W.array().abs()-eigen_W.array())/2).sqrt();
		//any(1+d0.*W<0)
		if (!(Wd0_1.array().abs().matrix()==Wd0_1))
			m_Wneg=true;
	}

	m_t=SGVector<float64_t>(m_W.vlen);
	Map<VectorXd> eigen_t(m_t.vector, m_t.vlen);

	//dd = 1./(1+d0.*W);
	VectorXd dd=MatrixXd::Ones(Wd0_1.rows(),1).cwiseQuotient(Wd0_1);
	eigen_t=eigen_W.cwiseProduct(dd);

	//m-by-m matrix
	SGMatrix<float64_t> A(m_V.num_rows, m_V.num_rows);
	Map<MatrixXd> eigen_A(A.matrix, A.num_rows, A.num_cols);
	//A = eye(nu)+(V.*repmat((W.*dd)',nu,1))*V';
	eigen_A=eigen_V*eigen_t.asDiagonal()*eigen_V.transpose()+MatrixXd::Identity(A.num_rows,A.num_rows);

	//R0tV = R0'*V; m-by-n
	MatrixXd R0tV=eigen_R0.transpose()*eigen_V;

	//B = R0tV.*repmat((W.*dd)',nu,1); m-by-n matrix
	MatrixXd B=R0tV*eigen_t.asDiagonal();

	//m-by-m matrix
	m_L=SGMatrix<float64_t>(m_V.num_rows, m_V.num_rows);
	Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);

	//post.L = -B*R0tV';
	eigen_L=-B*R0tV.transpose();

	SGMatrix<float64_t> tmp=get_chol_inv(A);
	Map<MatrixXd> eigen_tmp(tmp.matrix, tmp.num_rows, tmp.num_cols);
	//RV = chol_inv(A)*V; m-by-n matrix
	MatrixXd eigen_RV=eigen_tmp*eigen_V;
	//RVdd m-by-n matrix
	m_Rvdd=SGMatrix<float64_t>(m_V.num_rows, m_V.num_cols);
	Map<MatrixXd> eigen_Rvdd(m_Rvdd.matrix, m_Rvdd.num_rows, m_Rvdd.num_cols);
	//RVdd = RV.*repmat((W.*dd)',nu,1);
	eigen_Rvdd=eigen_RV*eigen_t.asDiagonal();

	if (!m_Wneg)
	{
		//B = B*RV';
		B=B*eigen_RV.transpose();
		//post.L = post.L + B*B';
		eigen_L+=B*B.transpose();
	}
	else
	{
		//B = B*V';
		B=B*eigen_V.transpose();
		//post.L = post.L + (B*inv(A))*B';
		FullPivLU<MatrixXd> lu(eigen_A);
		eigen_L+=B*lu.inverse()*B.transpose();
	}

	Map<MatrixXd> eigen_ktru(m_ktru.matrix, m_ktru.num_rows, m_ktru.num_cols);
	m_g=SGVector<float64_t>(m_dg.vlen);
	Map<VectorXd> eigen_g(m_g.vector, m_g.vlen);
	//g = d/2 + sum(((R*R0)*P).^2,1)'/2
	eigen_g=((eigen_dg.cwiseProduct(dd)).array()+
		((eigen_tmp*eigen_R0)*(eigen_ktru*CMath::exp(m_log_scale*2.0))*dd.asDiagonal()
		).array().pow(2).colwise().sum().transpose())/2;
}

void CSingleFITCLaplacianInferenceMethod::update_deriv()
{
	//time complexity O(m^2*n)
	Map<MatrixXd> eigen_R0(m_chol_R0.matrix, m_chol_R0.num_rows, m_chol_R0.num_cols);
	Map<MatrixXd> eigen_V(m_V.matrix, m_V.num_rows, m_V.num_cols);
	Map<VectorXd> eigen_al(m_al.vector, m_al.vlen);

	// create shogun and eigen representation of B
	// m-by-n matrix
	m_B=SGMatrix<float64_t>(m_V.num_rows, m_V.num_cols);
	Map<MatrixXd> eigen_B(m_B.matrix, m_B.num_rows, m_B.num_cols);

	//B = (R0'*R0)*Ku
	eigen_B=eigen_R0.transpose()*eigen_V;

	// create shogun and eigen representation of w
	m_w=SGVector<float64_t>(m_B.num_rows);
	//w = B*al;
	Map<VectorXd> eigen_w(m_w.vector, m_w.vlen);
	eigen_w=eigen_B*eigen_al;

	// create shogun and eigen representation of the vector dfhat
	Map<VectorXd> eigen_d3lp(m_d3lp.vector, m_d3lp.vlen);
	Map<VectorXd> eigen_g(m_g.vector, m_g.vlen);
	m_dfhat=SGVector<float64_t>(m_g.vlen);
	Map<VectorXd> eigen_dfhat(m_dfhat.vector, m_dfhat.vlen);

	// compute derivative of nlZ wrt fhat
	// dfhat = g.*d3lp;
	eigen_dfhat=eigen_g.cwiseProduct(eigen_d3lp);
}

float64_t CSingleFITCLaplacianInferenceMethod::get_derivative_related_cov(SGVector<float64_t> ddiagKi,
	SGMatrix<float64_t> dKuui, SGMatrix<float64_t> dKui)
{
	//time complexity O(m^2*n)
	Map<MatrixXd> eigen_R0tV(m_B.matrix, m_B.num_rows, m_B.num_cols);
	Map<VectorXd> eigen_ddiagKi(ddiagKi.vector, ddiagKi.vlen);
	//m-by-m matrix
	Map<MatrixXd> eigen_dKuui(dKuui.matrix, dKuui.num_rows, dKuui.num_cols);
	//m-by-n matrix
	Map<MatrixXd> eigen_dKui(dKui.matrix, dKui.num_rows, dKui.num_cols);

	// compute R=2*dKui-dKuui*B
	SGMatrix<float64_t> dA(dKui.num_rows, dKui.num_cols);
	Map<MatrixXd> eigen_dA(dA.matrix, dA.num_rows, dA.num_cols);
	//dA = 2*dKu'-R0tV'*dKuu;
	//dA' = 2*dKu-dKuu'*R0tV;
	eigen_dA=2*eigen_dKui-eigen_dKuui*eigen_R0tV;

	SGVector<float64_t> v(ddiagKi.vlen);
	Map<VectorXd> eigen_v(v.vector, v.vlen);
	//w = sum(dA.*R0tV',2);
	//w' = sum(dA'.*R0tV,1);
	//v = ddiagK-w;
	eigen_v=eigen_ddiagKi-eigen_dA.cwiseProduct(eigen_R0tV).colwise().sum().transpose();

	//explicit term
	float64_t result=CSingleFITCLaplacianBase::get_derivative_related_cov(ddiagKi, dKuui, dKui, v, dA);

	//implicit term
	Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);
	Map<VectorXd> eigen_dfhat(m_dfhat.vector, m_dfhat.vlen);

	SGVector<float64_t> b(ddiagKi.vlen);
	Map<VectorXd> eigen_b(b.vector, b.vlen);
	//b = dA*(R0tV*dlp) + v.*dlp;
	eigen_b=eigen_dA.transpose()*(eigen_R0tV*eigen_dlp)+eigen_v.cwiseProduct(eigen_dlp);
	//KZb = mvmK(mvmZ(b,RVdd,t),V,d0);
	SGVector<float64_t> KZb=compute_mvmK(compute_mvmZ(b));
	Map<VectorXd> eigen_KZb(KZb.vector, KZb.vlen);
	//dnlZ.cov(i) = dnlZ.cov(i) - dfhat'*( b-KZb );
	result-=eigen_dfhat.dot(eigen_b-eigen_KZb);
	return result;
}

SGVector<float64_t> CSingleFITCLaplacianInferenceMethod::get_derivative_wrt_inference_method(
	const TParameter* param)
{
	REQUIRE(param, "Param not set\n");
	//time complexity O(m^2*n)
	REQUIRE(!(strcmp(param->m_name, "log_scale")
		&& strcmp(param->m_name, "log_inducing_noise")
		&& strcmp(param->m_name, "inducing_features")),
		"Can't compute derivative of"
		" the negative log marginal likelihood wrt %s.%s parameter\n",
		get_name(), param->m_name)

	SGVector<float64_t> result;
	int32_t len;
	if (!strcmp(param->m_name, "inducing_features"))
	{
		if (m_Wneg)
		{
			int32_t dim=m_inducing_features.num_rows;
			int32_t num_samples=m_inducing_features.num_cols;
			len=dim*num_samples;
		}
		else if (!m_fully_sparse)
			return CSingleFITCLaplacianBase::get_derivative_wrt_inducing_features(param);
		else
			return get_derivative_wrt_inducing_features(param);
	}
	else
		len=1;

	if (m_Wneg)
	{
		result=SGVector<float64_t>(len);
		return derivative_helper_when_Wneg(result, param);
	}

	if (!strcmp(param->m_name, "log_inducing_noise"))
		// wrt inducing_noise
		// compute derivative wrt inducing noise
		return get_derivative_wrt_inducing_noise(param);

	result=SGVector<float64_t>(len);
	// wrt scale
	// clone kernel matrices
	SGVector<float64_t> deriv_trtr=m_ktrtr_diag.clone();
	SGMatrix<float64_t> deriv_uu=m_kuu.clone();
	SGMatrix<float64_t> deriv_tru=m_ktru.clone();

	// create eigen representation of kernel matrices
	Map<VectorXd> ddiagKi(deriv_trtr.vector, deriv_trtr.vlen);
	Map<MatrixXd> dKuui(deriv_uu.matrix, deriv_uu.num_rows, deriv_uu.num_cols);
	Map<MatrixXd> dKui(deriv_tru.matrix, deriv_tru.num_rows, deriv_tru.num_cols);

	// compute derivatives wrt scale for each kernel matrix
	result[0]=get_derivative_related_cov(deriv_trtr, deriv_uu, deriv_tru);
	result[0]*=CMath::exp(m_log_scale*2.0)*2.0;
	return result;
}

SGVector<float64_t> CSingleFITCLaplacianInferenceMethod::get_derivative_wrt_likelihood_model(
	const TParameter* param)
{
	SGVector<float64_t> result(1);
	if (m_Wneg)
		return derivative_helper_when_Wneg(result, param);

	// get derivatives wrt likelihood model parameters
	SGVector<float64_t> lp_dhyp=m_model->get_first_derivative(m_labels,
		m_mu, param);
	SGVector<float64_t> dlp_dhyp=m_model->get_second_derivative(m_labels,
		m_mu, param);
	SGVector<float64_t> d2lp_dhyp=m_model->get_third_derivative(m_labels,
		m_mu, param);

	// create eigen representation of the derivatives
	Map<VectorXd> eigen_lp_dhyp(lp_dhyp.vector, lp_dhyp.vlen);
	Map<VectorXd> eigen_dlp_dhyp(dlp_dhyp.vector, dlp_dhyp.vlen);
	Map<VectorXd> eigen_d2lp_dhyp(d2lp_dhyp.vector, d2lp_dhyp.vlen);
	Map<VectorXd> eigen_g(m_g.vector, m_g.vlen);
	Map<VectorXd> eigen_dfhat(m_dfhat.vector, m_dfhat.vlen);

	//explicit term
	//dnlZ.lik(i) = -g'*d2lp_dhyp - sum(lp_dhyp);
	result[0]=-eigen_g.dot(eigen_d2lp_dhyp)-eigen_lp_dhyp.sum();

	//implicit term
	//b = mvmK(dlp_dhyp,V,d0);
	SGVector<float64_t> b=compute_mvmK(dlp_dhyp);
	//dnlZ.lik(i) = dnlZ.lik(i) - dfhat'*(b-mvmK(mvmZ(b,RVdd,t),V,d0));
	result[0]-=get_derivative_implicit_term_helper(b);

	return result;
}

SGVector<float64_t> CSingleFITCLaplacianInferenceMethod::get_derivative_wrt_kernel(
	const TParameter* param)
{
	REQUIRE(param, "Param not set\n");
	SGVector<float64_t> result;
	int64_t len=const_cast<TParameter *>(param)->m_datatype.get_num_elements();
	result=SGVector<float64_t>(len);

	if (m_Wneg)
		return derivative_helper_when_Wneg(result, param);

	m_lock->lock();
	CFeatures *inducing_features=get_inducing_features();
	for (index_t i=0; i<result.vlen; i++)
	{
		//time complexity O(m^2*n)
		SGVector<float64_t> deriv_trtr;
		SGMatrix<float64_t> deriv_uu;
		SGMatrix<float64_t> deriv_tru;

		m_kernel->init(m_features, m_features);
		deriv_trtr=m_kernel->get_parameter_gradient_diagonal(param, i);

		m_kernel->init(inducing_features, inducing_features);
		deriv_uu=m_kernel->get_parameter_gradient(param, i);

		m_kernel->init(inducing_features, m_features);
		deriv_tru=m_kernel->get_parameter_gradient(param, i);

		// create eigen representation of derivatives
		Map<VectorXd> ddiagKi(deriv_trtr.vector, deriv_trtr.vlen);
		Map<MatrixXd> dKuui(deriv_uu.matrix, deriv_uu.num_rows,
			deriv_uu.num_cols);
		Map<MatrixXd> dKui(deriv_tru.matrix, deriv_tru.num_rows,
			deriv_tru.num_cols);

		result[i]=get_derivative_related_cov(deriv_trtr, deriv_uu, deriv_tru);
		result[i]*=CMath::exp(m_log_scale*2.0);
	}
	SG_UNREF(inducing_features);
	m_lock->unlock();

	return result;
}

float64_t CSingleFITCLaplacianInferenceMethod::get_derivative_related_mean(SGVector<float64_t> dmu)
{
	//time complexity O(m*n)
	//explicit term
	float64_t result=CSingleFITCLaplacianBase::get_derivative_related_mean(dmu);

	//implicit term
	//Zdm = mvmZ(dm,RVdd,t);
	//tmp = mvmK(Zdm,V,d0)
	//dnlZ.mean(i) = dnlZ.mean(i) - dfhat'*(dm-mvmK(Zdm,V,d0));
	result-=get_derivative_implicit_term_helper(dmu);

	return result;
}

float64_t CSingleFITCLaplacianInferenceMethod::get_derivative_implicit_term_helper(SGVector<float64_t> d)
{
	//time complexity O(m*n)
	Map<VectorXd> eigen_d(d.vector, d.vlen);
	SGVector<float64_t> tmp=compute_mvmK(compute_mvmZ(d));
	Map<VectorXd> eigen_tmp(tmp.vector, tmp.vlen);
	Map<VectorXd> eigen_dfhat(m_dfhat.vector, m_dfhat.vlen);
	return eigen_dfhat.dot(eigen_d-eigen_tmp);
}

SGVector<float64_t> CSingleFITCLaplacianInferenceMethod::get_derivative_wrt_mean(
	const TParameter* param)
{
	//time complexity O(m*n)
	REQUIRE(param, "Param not set\n");
	SGVector<float64_t> result;
	int64_t len=const_cast<TParameter *>(param)->m_datatype.get_num_elements();
	result=SGVector<float64_t>(len);

	if (m_Wneg)
		return derivative_helper_when_Wneg(result, param);

	for (index_t i=0; i<result.vlen; i++)
	{
		SGVector<float64_t> dmu=m_mean->get_parameter_derivative(m_features,
			param, i);
		result[i]=get_derivative_related_mean(dmu);
	}

	return result;
}

SGVector<float64_t> CSingleFITCLaplacianInferenceMethod::derivative_helper_when_Wneg(
	SGVector<float64_t> res, const TParameter *param)
{
	REQUIRE(param, "Param not set\n");
	SG_WARNING("Derivative wrt %s cannot be computed since W (the Hessian (diagonal) matrix) is too negative\n", param->m_name);
	//dnlZ = struct('cov',0*hyp.cov, 'mean',0*hyp.mean, 'lik',0*hyp.lik);
	res.zero();
	return res;
}

SGVector<float64_t> CSingleFITCLaplacianInferenceMethod::get_derivative_wrt_inducing_features(
	const TParameter* param)
{
	//time complexity depends on the implementation of the provided kernel
	//time complexity is at least O(max((p*n*m),(m^2*n))), where p is the dimension (#) of features
	//For an ARD kernel with KL_FULL, the time complexity is O(max((p*n*m*d),(m^2*n)))
	//where the parameter \f$\Lambda\f$ of the ARD kernel is a \f$d\f$-by-\f$p\f$ matrix,
	//For an ARD kernel with KL_SCALE and KL_DIAG, the time complexity is O(max((p*n*m),(m^2*n)))
	//efficiently compute the implicit term and explicit term at one shot
	Map<VectorXd> eigen_al(m_al.vector, m_al.vlen);
	Map<MatrixXd> eigen_B(m_B.matrix, m_B.num_rows, m_B.num_cols);
	//w=B*al
	Map<VectorXd> eigen_w(m_w.vector, m_w.vlen);
	Map<MatrixXd> eigen_Rvdd(m_Rvdd.matrix, m_Rvdd.num_rows, m_Rvdd.num_cols);
	Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);
	Map<VectorXd> eigen_dfhat(m_dfhat.vector, m_dfhat.vlen);

	//q = dfhat - mvmZ(mvmK(dfhat,V,d0),RVdd,t);
	SGVector<float64_t> q=compute_mvmZ(compute_mvmK(m_dfhat));
	Map<VectorXd> eigen_q(q.vector, q.vlen);
	eigen_q=eigen_dfhat-eigen_q;

	//explicit term
	//diag_dK = alpha.*alpha + sum(RVdd.*RVdd,1)'-t, where t can be cancelled out
	//-v_1=get_derivative_related_cov_diagonal= -(alpha.*alpha + sum(RVdd.*RVdd,1)')
	//implicit term
	//-v_2=-2*dlp.*q
	//neg_v = -(diag_dK+ 2*dlp.*q);
	SGVector<float64_t> neg_v=get_derivative_related_cov_diagonal();
	Map<VectorXd> eigen_neg_v(neg_v.vector, neg_v.vlen);
	eigen_neg_v-=2*eigen_dlp.cwiseProduct(eigen_q);

	SGMatrix<float64_t> BdK(m_B.num_rows, m_B.num_cols);
	Map<MatrixXd> eigen_BdK(BdK.matrix, BdK.num_rows, BdK.num_cols);
	//explicit
	//BdK = (B*alpha)*alpha' + (B*RVdd')*RVdd - B.*repmat(v_1',nu,1),
	//implicit
	//BdK = BdK + (B*dlp)*q' + (B*q)*dlp' - B.*repmat(v_2',nu,1)
	//where v_1 is the explicit part of v and v_2 is the implicit part of v
	//v=v_1+v_2
	eigen_BdK=eigen_B*eigen_neg_v.asDiagonal()+eigen_w*(eigen_al.transpose())+
		(eigen_B*eigen_Rvdd.transpose())*eigen_Rvdd+
		(eigen_B*eigen_dlp)*eigen_q.transpose()+(eigen_B*eigen_q)*eigen_dlp.transpose();

	return get_derivative_related_inducing_features(BdK, param);
}

SGVector<float64_t> CSingleFITCLaplacianInferenceMethod::get_derivative_wrt_inducing_noise(
	const TParameter* param)
{
	//time complexity O(m^2*n)
	//explicit term
	SGVector<float64_t> result=CSingleFITCLaplacianBase::get_derivative_wrt_inducing_noise(param);

	//implicit term
	Map<MatrixXd> eigen_B(m_B.matrix, m_B.num_rows, m_B.num_cols);
	Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);

	//snu = sqrt(snu2);
	//T = chol_inv(Kuu + snu2*eye(nu)); T = T'*(T*(snu*Ku));
	//t1 = sum(T.*T,1)';
	VectorXd eigen_t1=eigen_B.cwiseProduct(eigen_B).colwise().sum().adjoint();

	//b = (t1.*dlp-T'*(T*dlp))*2;
	SGVector<float64_t> b(eigen_t1.rows());
	Map<VectorXd> eigen_b(b.vector, b.vlen);
	float64_t factor=2.0*CMath::exp(m_log_ind_noise);
	eigen_b=(eigen_t1.cwiseProduct(eigen_dlp)-eigen_B.transpose()*(eigen_B*eigen_dlp))*factor;

	//KZb = mvmK(mvmZ(b,RVdd,t),V,d0);
	//z = z - dfhat'*( b-KZb );
	result[0]-=get_derivative_implicit_term_helper(b);

	return result;
}

SGVector<float64_t> CSingleFITCLaplacianInferenceMethod::get_posterior_mean()
{
	compute_gradient();

	SGVector<float64_t> res(m_mu.vlen);
	Map<VectorXd> eigen_res(res.vector, res.vlen);

	/*
	//true posterior mean with equivalent FITC prior approximated by Newton method
	//time complexity O(n)
	Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);
	SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
	Map<VectorXd> eigen_mean(mean.vector, mean.vlen);
	eigen_res=eigen_mu-eigen_mean;
	*/

	//FITC (further) approximated posterior mean with Newton method
	//time complexity of the following operation is O(m*n)
	Map<VectorXd> eigen_post_alpha(m_alpha.vector, m_alpha.vlen);
	Map<MatrixXd> eigen_Ktru(m_ktru.matrix, m_ktru.num_rows, m_ktru.num_cols);
	eigen_res=CMath::exp(m_log_scale*2.0)*eigen_Ktru.adjoint()*eigen_post_alpha;

	return res;
}

SGMatrix<float64_t> CSingleFITCLaplacianInferenceMethod::get_posterior_covariance()
{
	compute_gradient();
	//time complexity of the following operations is O(m*n^2)
	//Warning: the time complexity increases from O(m^2*n) to O(n^2*m) if this method is called
	m_Sigma=SGMatrix<float64_t>(m_ktrtr_diag.vlen, m_ktrtr_diag.vlen);
	Map<MatrixXd> eigen_Sigma(m_Sigma.matrix, m_Sigma.num_rows,
		m_Sigma.num_cols);

	//FITC (further) approximated posterior covariance with Newton method
	Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);
	Map<MatrixXd> eigen_Ktru(m_ktru.matrix, m_ktru.num_rows, m_ktru.num_cols);
	Map<VectorXd> eigen_dg(m_dg.vector, m_dg.vlen);
	Map<MatrixXd> eigen_V(m_V.matrix, m_V.num_rows, m_V.num_cols);

	MatrixXd diagonal_part=eigen_dg.asDiagonal();
	//FITC equivalent prior
	MatrixXd prior=eigen_V.transpose()*eigen_V+diagonal_part;

	MatrixXd tmp=CMath::exp(m_log_scale*2.0)*eigen_Ktru;
	eigen_Sigma=prior-tmp.adjoint()*eigen_L*tmp;

	/*
	//true posterior covariance with equivalent FITC prior approximated by Newton method
	Map<VectorXd> eigen_t(m_t.vector, m_t.vlen);
	Map<MatrixXd> eigen_Rvdd(m_Rvdd.matrix, m_Rvdd.num_rows, m_Rvdd.num_cols);
	Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
	MatrixXd tmp1=eigen_Rvdd*prior;
	eigen_Sigma=prior+tmp.transpose()*tmp;

	MatrixXd tmp2=((eigen_dg.cwiseProduct(eigen_t)).asDiagonal()*eigen_V.transpose())*eigen_V;
	eigen_Sigma-=(tmp2+tmp2.transpose());
	eigen_Sigma-=eigen_V.transpose()*(eigen_V*eigen_t.asDiagonal()*eigen_V.transpose())*eigen_V;
	MatrixXd tmp3=((eigen_dg.cwiseProduct(eigen_dg)).cwiseProduct(eigen_t)).asDiagonal();
	eigen_Sigma-=tmp3;
	*/

	return SGMatrix<float64_t>(m_Sigma);
}

}
#endif /* HAVE_EIGEN3 */
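
A minimal usage sketch (not part of the original file), showing how this inference method might be wired together; the kernel width, the Student's t likelihood, and the variables feat, lat, and lab are illustrative assumptions rather than anything taken from the listing above:

#include <shogun/features/DenseFeatures.h>
#include <shogun/labels/RegressionLabels.h>
#include <shogun/kernel/GaussianKernel.h>
#include <shogun/machine/gp/ZeroMean.h>
#include <shogun/machine/gp/StudentsTLikelihood.h>
#include <shogun/machine/gp/SingleFITCLaplacianInferenceMethod.h>

using namespace shogun;

// feat: n training points, lat: m (m<<n) inducing points, lab: training targets
void run_fitc_laplace(CDenseFeatures<float64_t>* feat,
	CDenseFeatures<float64_t>* lat, CRegressionLabels* lab)
{
	CGaussianKernel* kernel=new CGaussianKernel(10, 2.0); // cache size, width (assumed values)
	CZeroMean* mean=new CZeroMean();
	CStudentsTLikelihood* lik=new CStudentsTLikelihood(); // exercises the W<0 handling above

	CSingleFITCLaplacianInferenceMethod* inf=
		new CSingleFITCLaplacianInferenceMethod(kernel, feat, mean, lab, lik, lat);

	// negative log marginal likelihood under the FITC-Laplace approximation
	float64_t nlZ=inf->get_negative_log_marginal_likelihood();
	SG_SPRINT("nlZ=%f\n", nlZ);

	// approximate posterior over the latent function at the training inputs
	SGVector<float64_t> mu=inf->get_posterior_mean();
	SGMatrix<float64_t> Sigma=inf->get_posterior_covariance();

	SG_UNREF(inf);
}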