SHOGUN 4.2.0
LeastAngleRegression.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2012 Chiyuan Zhang
 * Copyright (C) 2012 Chiyuan Zhang
 */

#include <shogun/lib/config.h>

#include <vector>
#include <limits>
#include <algorithm>

#include <shogun/features/DenseFeatures.h>
#include <shogun/mathematics/Math.h>
#include <shogun/mathematics/eigen3.h>
#include <shogun/labels/RegressionLabels.h>
#include <shogun/regression/LeastAngleRegression.h>

using namespace Eigen;
using namespace shogun;
using namespace std;

static vector<float64_t> make_vector(int32_t size, float64_t val)
{
	vector<float64_t> result(size);
	fill(result.begin(), result.end(), val);
	return result;
}

static void plane_rot(float64_t x0, float64_t x1,
	float64_t &y0, float64_t &y1, SGMatrix<float64_t> &G)
{
	G.zero();
	if (x1 == 0)
	{
		G(0, 0) = G(1, 1) = 1;
		y0 = x0;
		y1 = x1;
	}
	else
	{
		float64_t r = CMath::sqrt(x0*x0 + x1*x1);
		float64_t sx0 = x0 / r;
		float64_t sx1 = x1 / r;

		G(0,0) = sx0;
		G(1,0) = -sx1;
		G(0,1) = sx1;
		G(1,1) = sx0;

		y0 = r;
		y1 = 0;
	}
}
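
// plane_rot (above) builds the 2x2 Givens rotation G = [c s; -s c], with
// c = x0/r, s = x1/r and r = sqrt(x0^2 + x1^2), so that G*(x0, x1)' = (r, 0)'.
// cholesky_delete below uses it to re-zero the subdiagonal entries that
// appear in the triangular factor R when an active column is removed.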

static void find_max_abs(const vector<float64_t> &vec, const vector<bool> &ignore_mask,
	int32_t &imax, float64_t &vmax)
{
	imax = -1;
	vmax = -1;
	for (uint32_t i=0; i < vec.size(); ++i)
	{
		if (ignore_mask[i])
			continue;

		if (CMath::abs(vec[i]) > vmax)
		{
			vmax = CMath::abs(vec[i]);
			imax = i;
		}
	}
}
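
// find_max_abs (above) scans |vec[i]| over the entries whose ignore_mask is
// false; the main loop uses it to locate the inactive variable with the
// largest absolute correlation to the current residual.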

CLeastAngleRegression::CLeastAngleRegression(bool lasso) :
	CLinearMachine(), m_lasso(lasso),
	m_max_nonz(0), m_max_l1_norm(0)
{
	m_epsilon = CMath::MACHINE_EPSILON;
	SG_ADD(&m_epsilon, "epsilon", "Epsilon for early stopping", MS_AVAILABLE);
	SG_ADD(&m_max_nonz, "max_nonz", "Max number of non-zero variables", MS_AVAILABLE);
	SG_ADD(&m_max_l1_norm, "max_l1_norm", "Max l1-norm of estimator", MS_AVAILABLE);
}

CLeastAngleRegression::~CLeastAngleRegression()
{
}

bool CLeastAngleRegression::train_machine(CFeatures* data)
{
	if (!m_labels)
		SG_ERROR("No labels set\n")
	if (m_labels->get_label_type() != LT_REGRESSION)
		SG_ERROR("Expected RegressionLabels\n")

	if (!data)
		data = features;

	if (!data)
		SG_ERROR("No features set\n")

	if (m_labels->get_num_labels() != data->get_num_vectors())
		SG_ERROR("Number of training vectors does not match number of labels\n")

	if (data->get_feature_class() != C_DENSE)
		SG_ERROR("Expected Simple Features\n")

	if (data->get_feature_type() != F_DREAL)
		SG_ERROR("Expected Real Features\n")

	CDenseFeatures<float64_t>* feats = (CDenseFeatures<float64_t>*) data;
	int32_t n_fea = feats->get_num_features();
	int32_t n_vec = feats->get_num_vectors();

	bool lasso_cond = false;
	bool stop_cond = false;

	// init facilities
	m_beta_idx.clear();
	m_beta_path.clear();
	m_num_active = 0;
	m_active_set.clear();
	m_is_active.resize(n_fea);
	fill(m_is_active.begin(), m_is_active.end(), false);

	SGVector<float64_t> y = ((CRegressionLabels*) m_labels)->get_labels();
	Map<VectorXd> map_y(y.vector, y.size());
	SGMatrix<float64_t> Xorig = feats->get_feature_matrix();

	// transpose(X) is more convenient to work with since we care
	// about features here. After the transpose, each row is a data
	// point and each column corresponds to a feature
	SGMatrix<float64_t> X(n_vec, n_fea);
	Map<MatrixXd> map_Xr(Xorig.matrix, n_fea, n_vec);
	Map<MatrixXd> map_X(X.matrix, n_vec, n_fea);
	map_X = map_Xr.transpose();

	SGMatrix<float64_t> X_active(n_vec, n_fea);

	// beta is the estimator
	vector<float64_t> beta = make_vector(n_fea, 0);

	vector<float64_t> Xy = make_vector(n_fea, 0);
	Map<VectorXd> map_Xy(&Xy[0], n_fea);
	// Xy = X' * y
	map_Xy = map_Xr*map_y;

	// mu is the prediction
	vector<float64_t> mu = make_vector(n_vec, 0);

	// correlation
	vector<float64_t> corr = make_vector(n_fea, 0);
	// sign of correlation
	vector<float64_t> corr_sign(n_fea);

	// Cholesky factorization R'R = X'X, R is upper triangular
	SGMatrix<float64_t> R;

	float64_t max_corr = 1;
	int32_t i_max_corr = 1;

	// first entry: all coefficients are zero
	m_beta_path.push_back(beta);
	m_beta_idx.push_back(0);

	// maximum number of active variables allowed at a time
	int32_t max_active_allowed = CMath::min(n_vec-1, n_fea);

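	// Each pass through the loop below is one LARS step (Efron et al., 2004):
	// pick the inactive variable most correlated with the current residual,
	// add it to the active set, update the Cholesky factor of X_A'X_A, then
	// move all active coefficients along the equiangular direction until an
	// inactive variable becomes equally correlated (or, in LASSO mode, an
	// active coefficient hits zero and is dropped).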
	//========================================
	// main loop
	//========================================
	int32_t nloop = 0;
	while (m_num_active < max_active_allowed && max_corr/n_vec > m_epsilon && !stop_cond)
	{
		// corr = X' * (y-mu) = Xy - X'*mu
		Map<VectorXd> map_corr(&corr[0], n_fea);
		Map<VectorXd> map_mu(&mu[0], n_vec);

		map_corr = map_Xy - (map_Xr*map_mu);

		// corr_sign = sign(corr)
		for (uint32_t i=0; i < corr.size(); ++i)
			corr_sign[i] = CMath::sign(corr[i]);

		// find max absolute correlation in the inactive set
		find_max_abs(corr, m_is_active, i_max_corr, max_corr);

		if (!lasso_cond)
		{
			// update the Cholesky factorization matrix
			if (m_num_active == 0)
			{
				// R isn't allocated yet
				R = SGMatrix<float64_t>(1,1);
				float64_t diag_k = map_X.col(i_max_corr).dot(map_X.col(i_max_corr));
				R(0,0) = CMath::sqrt(diag_k);
			}
			else
				R = cholesky_insert(X, X_active, R, i_max_corr, m_num_active);
			activate_variable(i_max_corr);
		}

		// active variables
		Map<MatrixXd> map_Xa(X_active.matrix, n_vec, m_num_active);
		if (!lasso_cond)
			map_Xa.col(m_num_active-1) = map_X.col(i_max_corr);

		SGVector<float64_t> corr_sign_a(m_num_active);
		for (int32_t i=0; i < m_num_active; ++i)
			corr_sign_a[i] = corr_sign[m_active_set[i]];

		Map<VectorXd> map_corr_sign_a(corr_sign_a.vector, corr_sign_a.size());
		Map<MatrixXd> map_R(R.matrix, R.num_rows, R.num_cols);
		VectorXd solve = map_R.transpose().triangularView<Lower>().solve<OnTheLeft>(map_corr_sign_a);
		VectorXd GA1 = map_R.triangularView<Upper>().solve<OnTheLeft>(solve);

		// AA = 1/sqrt(GA1' * corr_sign_a)
		float64_t AA = GA1.dot(map_corr_sign_a);
		AA = 1/CMath::sqrt(AA);

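		// With G_A = X_A'X_A and the sign vector s = corr_sign_a, the two
		// triangular solves above give GA1 = G_A^{-1}*s, so
		// AA = (s'*G_A^{-1}*s)^{-1/2} and wA = AA*GA1. The direction
		// u = X_A*wA computed next is a unit vector that makes an equal
		// correlation AA with every active feature (the equiangular direction).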
		VectorXd wA = AA*GA1;

		// equiangular direction (unit vector)
		vector<float64_t> u = make_vector(n_vec, 0);
		Map<VectorXd> map_u(&u[0], n_vec);

		map_u = map_Xa*wA;

		float64_t gamma = max_corr / AA;
		if (m_num_active < n_fea)
		{
			#pragma omp parallel for shared(gamma)
			for (int32_t i=0; i < n_fea; ++i)
			{
				if (m_is_active[i])
					continue;

				// correlation between X[:,i] and u
				float64_t dir_corr = map_u.dot(map_X.col(i));

				float64_t tmp1 = (max_corr-corr[i])/(AA-dir_corr);
				float64_t tmp2 = (max_corr+corr[i])/(AA+dir_corr);
				#pragma omp critical
				{
					if (tmp1 > CMath::MACHINE_EPSILON && tmp1 < gamma)
						gamma = tmp1;
					if (tmp2 > CMath::MACHINE_EPSILON && tmp2 < gamma)
						gamma = tmp2;
				}
			}
		}
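
		// The loop above computes the standard LARS step length (Efron et
		// al., eq. 2.13): the smallest positive value among
		// (C - c_i)/(AA - a_i) and (C + c_i)/(AA + a_i) over inactive i, with
		// C = max_corr, c_i = corr[i] and a_i = x_i'u. It is the step at
		// which some inactive variable ties the active set in absolute
		// correlation with the residual.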
		int32_t i_kick = -1;
		int32_t i_change = i_max_corr;
		if (m_lasso)
		{
			// lasso modification to further refine gamma
			lasso_cond = false;
			float64_t lasso_bound = numeric_limits<float64_t>::infinity();

			for (int32_t i=0; i < m_num_active; ++i)
			{
				float64_t tmp = -beta[m_active_set[i]] / wA(i);
				if (tmp > CMath::MACHINE_EPSILON && tmp < lasso_bound)
				{
					lasso_bound = tmp;
					i_kick = i;
				}
			}

			if (lasso_bound < gamma)
			{
				gamma = lasso_bound;
				lasso_cond = true;
				i_change = m_active_set[i_kick];
			}
		}
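
		// LASSO modification (Efron et al., sec. 3.1): -beta_j/wA_j is the
		// step at which active coefficient j would cross zero. If the
		// smallest such step comes before gamma, the step is truncated there
		// and that variable is scheduled to be dropped from the active set.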

		// update prediction: mu = mu + gamma * u
		map_mu += gamma*map_u;

		// update estimator
		for (int32_t i=0; i < m_num_active; ++i)
			beta[m_active_set[i]] += gamma * wA(i);

		// early stopping on max l1-norm
		if (m_max_l1_norm > 0)
		{
			float64_t l1 = SGVector<float64_t>::onenorm(&beta[0], n_fea);
			if (l1 > m_max_l1_norm)
			{
				// stop with an interpolated beta
				stop_cond = true;
				lasso_cond = false;
				float64_t l1_prev = SGVector<float64_t>::onenorm(&m_beta_path[nloop][0], n_fea);
				float64_t s = (m_max_l1_norm-l1_prev)/(l1-l1_prev);

				Map<VectorXd> map_beta(&beta[0], n_fea);
				Map<VectorXd> map_beta_prev(&m_beta_path[nloop][0], n_fea);
				map_beta = (1-s)*map_beta_prev + s*map_beta;
			}
		}
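
		// Within a step the coefficients move linearly in gamma, so the
		// l1-norm does too; the interpolation factor s above places the
		// final beta exactly on the boundary ||beta||_1 = m_max_l1_norm.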

		// if the lasso condition fired, drop the variable from the active set
		if (lasso_cond)
		{
			beta[i_change] = 0;
			R = cholesky_delete(R, i_kick);
			deactivate_variable(i_kick);

			// remove the kicked column from the active set
			int32_t numRows = map_Xa.rows();
			int32_t numCols = map_Xa.cols()-1;
			if (i_kick < numCols)
				map_Xa.block(0, i_kick, numRows, numCols-i_kick) =
					map_Xa.block(0, i_kick+1, numRows, numCols-i_kick).eval();
		}

		nloop++;
		m_beta_path.push_back(beta);
		if (size_t(m_num_active) >= m_beta_idx.size())
			m_beta_idx.push_back(nloop);
		else
			m_beta_idx[m_num_active] = nloop;

		// early stopping with max number of non-zero variables
		if (m_max_nonz > 0 && m_num_active >= m_max_nonz)
			stop_cond = true;

		SG_DEBUG("Added: %d, dropped: %d, active set size: %d, max_corr: %.17f\n",
			i_max_corr, i_kick, m_num_active, max_corr);
	}

	// assign default estimator
	w.vlen = n_fea;
	switch_w(m_beta_idx.size()-1);

	return true;
}

SGMatrix<float64_t> CLeastAngleRegression::cholesky_insert(const SGMatrix<float64_t>& X,
	const SGMatrix<float64_t>& X_active, SGMatrix<float64_t>& R, int32_t i_max_corr, int32_t num_active)
{
	Map<MatrixXd> map_X(X.matrix, X.num_rows, X.num_cols);
	Map<MatrixXd> map_X_active(X_active.matrix, X.num_rows, num_active);
	float64_t diag_k = map_X.col(i_max_corr).dot(map_X.col(i_max_corr));

	// col_k is the k-th column of (X'X)
	Map<VectorXd> map_i_max(X.get_column_vector(i_max_corr), X.num_rows);
	VectorXd R_k = map_X_active.transpose()*map_i_max;
	Map<MatrixXd> map_R(R.matrix, R.num_rows, R.num_cols);

	// R' * R_k = (X' * X)_k = col_k, solve to get R_k
	map_R.transpose().triangularView<Lower>().solveInPlace<OnTheLeft>(R_k);
	float64_t R_kk = CMath::sqrt(diag_k - R_k.dot(R_k));

	SGMatrix<float64_t> R_new(num_active+1, num_active+1);
	Map<MatrixXd> map_R_new(R_new.matrix, R_new.num_rows, R_new.num_cols);

	map_R_new.block(0, 0, num_active, num_active) = map_R;
	memcpy(R_new.matrix+num_active*(num_active+1), R_k.data(), sizeof(float64_t)*(num_active));
	map_R_new.row(num_active).setZero();
	map_R_new(num_active, num_active) = R_kk;
	return R_new;
}
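
// Rank-one Cholesky extension: if R'R = X_A'X_A and column x_k joins the
// active set, the new factor is [R r; 0 R_kk], where R'r = X_A'x_k and
// R_kk = sqrt(x_k'x_k - r'r). That is exactly the triangular solve and the
// square root computed above.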

SGMatrix<float64_t> CLeastAngleRegression::cholesky_delete(SGMatrix<float64_t>& R, int32_t i_kick)
{
	if (i_kick != m_num_active-1)
	{
		// remove the i_kick-th column
		for (int32_t j=i_kick; j < m_num_active-1; ++j)
			for (int32_t i=0; i < m_num_active; ++i)
				R(i,j) = R(i,j+1);

		SGMatrix<float64_t> G(2,2);
		for (int32_t i=i_kick; i < m_num_active-1; ++i)
		{
			plane_rot(R(i,i), R(i+1,i), R(i,i), R(i+1,i), G);
			if (i < m_num_active-2)
			{
				for (int32_t k=i+1; k < m_num_active-1; ++k)
				{
					// R[i:i+1, k] = G*R[i:i+1, k]
					float64_t Rik = R(i,k), Ri1k = R(i+1,k);
					R(i,k) = G(0,0)*Rik + G(0,1)*Ri1k;
					R(i+1,k) = G(1,0)*Rik + G(1,1)*Ri1k;
				}
			}
		}
	}

	SGMatrix<float64_t> nR(m_num_active-1, m_num_active-1);
	for (int32_t i=0; i < m_num_active-1; ++i)
		for (int32_t j=0; j < m_num_active-1; ++j)
			nR(i,j) = R(i,j);

	return nR;
}
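
The class is trained through the standard CLinearMachine interface. A minimal, illustrative sketch of driving it from user code follows; it assumes the set_max_l1_norm setter declared in LeastAngleRegression.h, uses made-up toy data, and omits the centering and normalization that LARS-style solvers usually expect.

#include <shogun/base/init.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/labels/RegressionLabels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/regression/LeastAngleRegression.h>

using namespace shogun;

int main()
{
	init_shogun_with_defaults();

	// toy data: 3 features, 5 vectors (column-major, one example per column)
	SGMatrix<float64_t> X(3, 5);
	SGVector<float64_t> y(5);
	for (int32_t i=0; i < 3*5; ++i)
		X.matrix[i] = CMath::random(-1.0, 1.0);
	for (int32_t i=0; i < 5; ++i)
		y[i] = CMath::random(-1.0, 1.0);

	CDenseFeatures<float64_t>* feats = new CDenseFeatures<float64_t>(X);
	CRegressionLabels* labels = new CRegressionLabels(y);

	// true selects the LASSO variant; false gives plain LARS
	CLeastAngleRegression* lars = new CLeastAngleRegression(true);
	lars->set_labels(labels);
	lars->set_max_l1_norm(1.0); // optional early-stopping budget (assumed setter)
	lars->train(feats);

	CRegressionLabels* out = lars->apply_regression(feats);
	out->get_labels().display_vector("predictions");

	SG_UNREF(out);
	SG_UNREF(lars);
	exit_shogun();
	return 0;
}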
