SHOGUN  6.1.3
KLCovarianceInferenceMethod.cpp
Go to the documentation of this file.
1  /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2014 Wu Lin
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  *
30  * Code adapted from
31  * http://hannes.nickisch.org/code/approxXX.tar.gz
32  * and Gaussian Process Machine Learning Toolbox
33  * http://www.gaussianprocess.org/gpml/code/matlab/doc/
34  * and the reference paper is
35  * Nickisch, Hannes, and Carl Edward Rasmussen.
36  * "Approximations for Binary Gaussian Process Classification."
37  * Journal of Machine Learning Research 9.10 (2008).
38  *
39  * This code specifically adapted from function in approxKL.m and infKL.m
40  */
41 
43 
48 
49 using namespace Eigen;
50 
51 namespace shogun
52 {
53 
54 CKLCovarianceInferenceMethod::CKLCovarianceInferenceMethod() : CKLInference()
55 {
56  init();
57 }
58 
60  CFeatures* feat, CMeanFunction* m, CLabels* lab, CLikelihoodModel* mod)
61  : CKLInference(kern, feat, m, lab, mod)
62 {
63  init();
64 }
65 
66 void CKLCovarianceInferenceMethod::init()
67 {
68  SG_ADD(&m_V, "V",
69  "V is L'*V=diag(sW)*K",
71  SG_ADD(&m_A, "A",
72  "A is A=I-K*diag(sW)*inv(L)'*inv(L)*diag(sW)",
74  SG_ADD(&m_W, "W",
75  "noise matrix W",
77  SG_ADD(&m_sW, "sW",
78  "Square root of noise matrix W",
80  SG_ADD(&m_dv, "dv",
81  "the gradient of the variational expection wrt sigma2",
83  SG_ADD(&m_df, "df",
84  "the gradient of the variational expection wrt mu",
86 }
87 
88 
90 {
99  update();
100 
101  index_t len=m_alpha.vlen/2;
102  SGVector<float64_t> result(len);
103 
104  Map<VectorXd> eigen_result(result.vector, len);
105  Map<VectorXd> eigen_alpha(m_alpha.vector, len);
106 
107  eigen_result=eigen_alpha;
108 
109  return result;
110 }
111 
113 {
114 }
115 
117  CInference* inference)
118 {
119  if (inference==NULL)
120  return NULL;
121 
122  if (inference->get_inference_type()!=INF_KL_COVARIANCE)
123  SG_SERROR("Provided inference is not of type CKLCovarianceInferenceMethod!\n")
124 
125  SG_REF(inference);
126  return (CKLCovarianceInferenceMethod*)inference;
127 }
128 
130 {
132  Map<VectorXd> eigen_mean(mean.vector, mean.vlen);
133 
135 
136  index_t len=m_alpha.vlen/2;
137  //construct mu
138  Map<VectorXd> eigen_alpha(m_alpha.vector, len);
139 
140  Map<VectorXd> eigen_mu(m_mu.vector, m_mu.vlen);
141  //mu=K*alpha+m
142  eigen_mu=eigen_K*CMath::exp(m_log_scale*2.0)*eigen_alpha+eigen_mean;
143 
144  //construct s2
145  Map<VectorXd> eigen_log_neg_lambda(m_alpha.vector+len, len);
146 
147  Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
148  eigen_W=(2.0*eigen_log_neg_lambda.array().exp()).matrix();
149 
150  Map<VectorXd> eigen_sW(m_sW.vector, m_sW.vlen);
151  eigen_sW=eigen_W.array().sqrt().matrix();
152 
155 
156  //solve L'*V=diag(sW)*K
157  Map<MatrixXd> eigen_V(m_V.matrix, m_V.num_rows, m_V.num_cols);
158  eigen_V=eigen_L.triangularView<Upper>().adjoint().solve(eigen_sW.asDiagonal()*eigen_K*CMath::exp(m_log_scale*2.0));
159  Map<VectorXd> eigen_s2(m_s2.vector, m_s2.vlen);
160  //Sigma=inv(inv(K)-2*diag(lambda))=K-K*diag(sW)*inv(L)'*inv(L)*diag(sW)*K
161  //v=abs(diag(Sigma))
162  eigen_s2=(eigen_K.diagonal().array()*CMath::exp(m_log_scale*2.0)-(eigen_V.array().pow(2).colwise().sum().transpose())).abs().matrix();
163 
165  bool status = lik->set_variational_distribution(m_mu, m_s2, m_labels);
166  return status;
167 }
168 
170 {
171  REQUIRE(gradient.vlen==m_alpha.vlen,
172  "The length of gradients (%d) should the same as the length of parameters (%d)\n",
173  gradient.vlen, m_alpha.vlen);
174 
176  Map<VectorXd> eigen_sW(m_sW.vector, m_sW.vlen);
177  Map<MatrixXd> eigen_V(m_V.matrix, m_V.num_rows, m_V.num_cols);
179  Map<VectorXd> eigen_s2(m_s2.vector, m_s2.vlen);
180 
181  index_t len=m_alpha.vlen/2;
182  Map<VectorXd> eigen_alpha(m_alpha.vector, len);
183  Map<VectorXd> eigen_log_neg_lambda(m_alpha.vector+len, len);
184 
187 
188  //[a,df,dV] = a_related2(mu,s2,y,lik);
189  TParameter* s2_param=lik->m_parameters->get_parameter("sigma2");
190  m_dv=lik->get_variational_first_derivative(s2_param);
191  Map<VectorXd> eigen_dv(m_dv.vector, m_dv.vlen);
192 
193  TParameter* mu_param=lik->m_parameters->get_parameter("mu");
194  m_df=lik->get_variational_first_derivative(mu_param);
195  Map<VectorXd> eigen_df(m_df.vector, m_df.vlen);
196  //U=inv(L')*diag(sW)
197  MatrixXd eigen_U=eigen_L.triangularView<Upper>().adjoint().solve(MatrixXd(eigen_sW.asDiagonal()));
198  Map<MatrixXd> eigen_A(m_A.matrix, m_A.num_rows, m_A.num_cols);
199  // A=I-K*diag(sW)*inv(L)*inv(L')*diag(sW)
200  eigen_A=MatrixXd::Identity(len, len)-eigen_V.transpose()*eigen_U;
201 
204 
205  Map<VectorXd> eigen_dnlz_alpha(gradient.vector, len);
206  Map<VectorXd> eigen_dnlz_log_neg_lambda(gradient.vector+len, len);
207 
208  //dlZ_alpha = K*(df-alpha);
209  eigen_dnlz_alpha=eigen_K*CMath::exp(m_log_scale*2.0)*(-eigen_df+eigen_alpha);
210 
211  //dlZ_lambda = 2*(Sigma.*Sigma)*dV +v -sum(Sigma.*A,2); % => fast diag(V*VinvK')
212  //dlZ_log_neg_lambda = dlZ_lambda .* lambda;
213  //dnlZ = -[dlZ_alpha; dlZ_log_neg_lambda];
214  eigen_dnlz_log_neg_lambda=(eigen_Sigma.array().pow(2)*2.0).matrix()*eigen_dv+eigen_s2;
215  eigen_dnlz_log_neg_lambda=eigen_dnlz_log_neg_lambda-(eigen_Sigma.array()*eigen_A.array()).rowwise().sum().matrix();
216  eigen_dnlz_log_neg_lambda=(eigen_log_neg_lambda.array().exp()*eigen_dnlz_log_neg_lambda.array()).matrix();
217 }
218 
219 
221 {
223  Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen/2);
224  Map<VectorXd> eigen_mu(m_mu.vector, m_mu.vlen);
226  //get mean vector and create eigen representation of it
228  Map<VectorXd> eigen_mean(mean.vector, mean.vlen);
229 
232 
233  float64_t trace=0;
234  //L_inv=L\eye(n);
235  //trace(L_inv'*L_inv) %V*inv(K)
236  MatrixXd eigen_t=eigen_L.triangularView<Upper>().adjoint().solve(MatrixXd::Identity(eigen_L.rows(),eigen_L.cols()));
237 
238  for(index_t idx=0; idx<eigen_t.rows(); idx++)
239  trace +=(eigen_t.col(idx).array().pow(2)).sum();
240 
241  //nlZ = -a -logdet(V*inv(K))/2 -n/2 +(alpha'*K*alpha)/2 +trace(V*inv(K))/2;
242  float64_t result=-a+eigen_L.diagonal().array().log().sum();
243  result+=0.5*(-eigen_K.rows()+eigen_alpha.dot(eigen_mu-eigen_mean)+trace);
244  return result;
245 }
246 
248 {
249  Map<MatrixXd> eigen_dK(dK.matrix, dK.num_rows, dK.num_cols);
251  Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
253  Map<VectorXd> eigen_sW(m_sW.vector, m_sW.vlen);
255  Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen/2);
256  Map<MatrixXd> eigen_A(m_A.matrix, m_A.num_rows, m_A.num_cols);
257 
258  Map<VectorXd> eigen_dv(m_dv.vector, m_dv.vlen);
259  Map<VectorXd> eigen_df(m_df.vector, m_df.vlen);
260 
261  //AdK = A*dK;
262  MatrixXd AdK=eigen_A*eigen_dK;
263 
264  //z = diag(AdK) + sum(A.*AdK,2) - sum(A'.*AdK,1)';
265  VectorXd z=AdK.diagonal()+(eigen_A.array()*AdK.array()).rowwise().sum().matrix()
266  -(eigen_A.transpose().array()*AdK.array()).colwise().sum().transpose().matrix();
267 
268  //dnlZ(j) = alpha'*dK*(alpha/2-df) - z'*dv;
269  return eigen_alpha.dot(eigen_dK*(eigen_alpha/2.0-eigen_df))-z.dot(eigen_dv);
270 }
271 
273 {
274  float64_t nlml_new=0;
275  float64_t nlml_def=0;
276 
278 
279  if (m_alpha.vlen == m_labels->get_num_labels()*2)
280  {
282 
283  float64_t trace=0;
284  LLT<MatrixXd> llt((eigen_K*CMath::exp(m_log_scale*2.0))+
285  MatrixXd::Identity(eigen_K.rows(), eigen_K.cols()));
286  MatrixXd LL=llt.matrixU();
287  MatrixXd tt=LL.triangularView<Upper>().adjoint().solve(MatrixXd::Identity(LL.rows(),LL.cols()));
288 
289  for(index_t idx=0; idx<tt.rows(); idx++)
290  trace+=(tt.col(idx).array().pow(2)).sum();
291 
292  MatrixXd eigen_V=LL.triangularView<Upper>().adjoint().solve(eigen_K*CMath::exp(m_log_scale*2.0));
293  SGVector<float64_t> s2_tmp(m_s2.vlen);
294  Map<VectorXd> eigen_s2(s2_tmp.vector, s2_tmp.vlen);
295  eigen_s2=(eigen_K.diagonal().array()*CMath::exp(m_log_scale*2.0)-(eigen_V.array().pow(2).colwise().sum().transpose())).abs().matrix();
297 
299  lik->set_variational_distribution(mean, s2_tmp, m_labels);
301 
302  nlml_def=-a+LL.diagonal().array().log().sum();
303  nlml_def+=0.5*(-eigen_K.rows()+trace);
304 
305  if (nlml_new<=nlml_def)
307  }
308 
309  if (m_alpha.vlen != m_labels->get_num_labels()*2 || nlml_def<nlml_new)
310  {
311  if(m_alpha.vlen != m_labels->get_num_labels()*2)
313 
314  //init
315  for (index_t i=0; i<m_alpha.vlen; i++)
316  {
317  if (i<m_alpha.vlen/2)
318  m_alpha[i]=0;
319  else
320  m_alpha[i]=CMath::log(0.5);
321  }
322 
323  index_t len=m_alpha.vlen/2;
324  m_W=SGVector<float64_t>(len);
325  m_sW=SGVector<float64_t>(len);
328  m_V=SGMatrix<float64_t>(len, len);
329  m_Sigma=SGMatrix<float64_t>(len, len);
330  m_A=SGMatrix<float64_t>(len, len);
331  }
332 
333  nlml_new=optimization();
334 }
335 
337 {
339  update();
340 
341  return SGVector<float64_t>(m_sW);
342 }
343 
345 {
349 }
350 
352 {
356 }
357 
359 {
365 }
366 
367 } /* namespace shogun */
368 
static CKLCovarianceInferenceMethod * obtain_from_generic(CInference *inference)
float64_t m_log_scale
Definition: Inference.h:485
virtual bool set_variational_distribution(SGVector< float64_t > mu, SGVector< float64_t > s2, const CLabels *lab)
virtual float64_t optimization()
virtual void update()
virtual SGVector< float64_t > get_variational_first_derivative(const TParameter *param) const =0
static SGMatrix< float64_t > get_choleksy(SGVector< float64_t > W, SGVector< float64_t > sW, SGMatrix< float64_t > kernel, float64_t scale)
int32_t index_t
Definition: common.h:72
virtual void get_gradient_of_nlml_wrt_parameters(SGVector< float64_t > gradient)
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
virtual EInferenceType get_inference_type() const
Definition: Inference.h:104
virtual CVariationalGaussianLikelihood * get_variational_likelihood() const
virtual int32_t get_num_labels() const =0
virtual float64_t get_derivative_related_cov(SGMatrix< float64_t > dK)
static T sum(T *vec, int32_t len)
Return sum(vec)
Definition: SGVector.h:418
The variational Gaussian Likelihood base class. The variational distribution is Gaussian.
static SGMatrix< float64_t > get_inverse(SGMatrix< float64_t > L, SGMatrix< float64_t > kernel, SGVector< float64_t > sW, SGMatrix< float64_t > V, float64_t scale)
TParameter * get_parameter(int32_t idx)
Definition: SGMatrix.h:25
parameter struct
#define REQUIRE(x,...)
Definition: SGIO.h:181
Parameter * m_parameters
Definition: SGObject.h:609
virtual SGVector< float64_t > get_mean_vector(const CFeatures *features) const =0
An abstract class of the mean function.
Definition: MeanFunction.h:49
std::enable_if<!std::is_same< T, complex128_t >::value, float64_t >::type mean(const Container< T > &a)
#define SG_REF(x)
Definition: SGObject.h:52
T trace(const SGMatrix< T > &A)
CFeatures * m_features
Definition: Inference.h:473
SGMatrix< float64_t > m_ktrtr
Definition: Inference.h:488
CMeanFunction * m_mean
Definition: Inference.h:467
The KL approximation inference method class.
Definition: KLInference.h:75
CLabels * m_labels
Definition: Inference.h:476
double float64_t
Definition: common.h:60
index_t num_rows
Definition: SGMatrix.h:495
SGVector< float64_t > m_mu
Definition: KLInference.h:367
SGMatrix< float64_t > m_L
Definition: Inference.h:482
virtual SGVector< float64_t > get_variational_expection()=0
Matrix< float64_t,-1,-1, 0,-1,-1 > MatrixXd
Definition: KLInference.h:52
index_t num_cols
Definition: SGMatrix.h:497
SGMatrix< float64_t > m_Sigma
Definition: KLInference.h:370
The KL approximation inference method class.
SGVector< float64_t > m_s2
Definition: KLInference.h:375
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
T sum(const Container< T > &a, bool no_diag=false)
The Inference Method base class.
Definition: Inference.h:81
The class Features is the base class of all feature objects.
Definition: Features.h:69
#define SG_SERROR(...)
Definition: SGIO.h:164
static float64_t exp(float64_t x)
Definition: Math.h:551
static float64_t log(float64_t v)
Definition: Math.h:714
virtual SGVector< float64_t > get_diagonal_vector()
The Kernel base class.
#define SG_ADD(...)
Definition: SGObject.h:93
virtual bool parameter_hash_changed()
Definition: SGObject.cpp:296
The Likelihood model base class.
index_t vlen
Definition: SGVector.h:571
SGVector< float64_t > m_alpha
Definition: Inference.h:479

SHOGUN Machine Learning Toolbox - Documentation