SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
PCA.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 1999-2008,2011 Soeren Sonnenburg
9  * Written (W) 2014 Parijat Mazumdar
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  * Copyright (C) 2011 Berlin Institute of Technology
12  */
13 #include <shogun/lib/config.h>
14 
19 #include <shogun/io/SGIO.h>
21 
22 using namespace shogun;
23 using namespace Eigen;
24 
25 CPCA::CPCA(bool do_whitening, EPCAMode mode, float64_t thresh, EPCAMethod method, EPCAMemoryMode mem_mode)
27 {
28  init();
29  m_whitening = do_whitening;
30  m_mode = mode;
31  m_thresh = thresh;
32  m_mem_mode = mem_mode;
33  m_method = method;
34 }
35 
36 CPCA::CPCA(EPCAMethod method, bool do_whitening, EPCAMemoryMode mem_mode)
38 {
39  init();
40  m_whitening = do_whitening;
41  m_mem_mode = mem_mode;
42  m_method = method;
43 }
44 
45 void CPCA::init()
46 {
50  num_dim = 0;
51  m_initialized = false;
52  m_whitening = false;
54  m_thresh = 1e-6;
56  m_method = AUTO;
58 
59  SG_ADD(&m_transformation_matrix, "transformation_matrix",
60  "Transformation matrix (Eigenvectors of covariance matrix).",
62  SG_ADD(&m_mean_vector, "mean_vector", "Mean Vector.", MS_NOT_AVAILABLE);
63  SG_ADD(&m_eigenvalues_vector, "eigenvalues_vector",
64  "Vector with Eigenvalues.", MS_NOT_AVAILABLE);
65  SG_ADD(&m_initialized, "initalized", "True when initialized.",
67  SG_ADD(&m_whitening, "whitening", "Whether data shall be whitened.",
68  MS_AVAILABLE);
69  SG_ADD((machine_int_t*) &m_mode, "mode", "PCA Mode.", MS_AVAILABLE);
70  SG_ADD(&m_thresh, "m_thresh", "Cutoff threshold.", MS_AVAILABLE);
71  SG_ADD((machine_int_t*) &m_mem_mode, "m_mem_mode",
72  "Memory mode (in-place or reallocation).", MS_NOT_AVAILABLE);
73  SG_ADD((machine_int_t*) &m_method, "m_method",
74  "Method used for PCA calculation", MS_NOT_AVAILABLE);
75  SG_ADD(&m_eigenvalue_zero_tolerance, "eigenvalue_zero_tolerance", "zero tolerance"
76  " for determining zero eigenvalues during whitening to avoid numerical issues", MS_NOT_AVAILABLE);
77 }
78 
80 {
81 }
82 
83 bool CPCA::init(CFeatures* features)
84 {
85  if (!m_initialized)
86  {
87  REQUIRE(features->get_feature_class()==C_DENSE, "PCA only works with dense features")
88  REQUIRE(features->get_feature_type()==F_DREAL, "PCA only works with real features")
89 
90  SGMatrix<float64_t> feature_matrix = ((CDenseFeatures<float64_t>*)features)
91  ->get_feature_matrix();
92  int32_t num_vectors = feature_matrix.num_cols;
93  int32_t num_features = feature_matrix.num_rows;
94  SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features)
95 
96  // max target dim allowed
97  int32_t max_dim_allowed = CMath::min(num_vectors, num_features);
98  num_dim=0;
99 
100  REQUIRE(m_target_dim<=max_dim_allowed,
101  "target dimension should be less or equal to than minimum of N and D")
102 
103  // center data
104  Map<MatrixXd> fmatrix(feature_matrix.matrix, num_features, num_vectors);
105  m_mean_vector = SGVector<float64_t>(num_features);
106  Map<VectorXd> data_mean(m_mean_vector.vector, num_features);
107  data_mean = fmatrix.rowwise().sum()/(float64_t) num_vectors;
108  fmatrix = fmatrix.colwise()-data_mean;
109 
110  m_eigenvalues_vector = SGVector<float64_t>(max_dim_allowed);
111  Map<VectorXd> eigenValues(m_eigenvalues_vector.vector, max_dim_allowed);
112 
113  if (m_method == AUTO)
114  m_method = (num_vectors>num_features) ? EVD : SVD;
115 
116  if (m_method == EVD)
117  {
118  // covariance matrix
119  MatrixXd cov_mat(num_features, num_features);
120  cov_mat = fmatrix*fmatrix.transpose();
121  cov_mat /= (num_vectors-1);
122 
123  SG_INFO("Computing Eigenvalues ... ")
124  // eigen value computed
125  SelfAdjointEigenSolver<MatrixXd> eigenSolve =
126  SelfAdjointEigenSolver<MatrixXd>(cov_mat);
127  eigenValues = eigenSolve.eigenvalues().tail(max_dim_allowed);
128 
129  // target dimension
130  switch (m_mode)
131  {
132  case FIXED_NUMBER :
134  break;
135 
136  case VARIANCE_EXPLAINED :
137  {
138  float64_t eig_sum = eigenValues.sum();
139  float64_t com_sum = 0;
140  for (int32_t i=num_features-1; i<-1; i++)
141  {
142  num_dim++;
143  com_sum += m_eigenvalues_vector.vector[i];
144  if (com_sum/eig_sum>=m_thresh)
145  break;
146  }
147  }
148  break;
149 
150  case THRESHOLD :
151  for (int32_t i=num_features-1; i<-1; i++)
152  {
154  num_dim++;
155  else
156  break;
157  }
158  break;
159  };
160  SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim)
161 
164  num_features, num_dim);
165  num_old_dim = num_features;
166 
167  // eigenvector matrix
168  transformMatrix = eigenSolve.eigenvectors().block(0,
169  num_features-num_dim, num_features,num_dim);
170  if (m_whitening)
171  {
172  for (int32_t i=0; i<num_dim; i++)
173  {
174  if (CMath::fequals_abs<float64_t>(0.0, eigenValues[i+max_dim_allowed-num_dim],
176  {
177  SG_WARNING("Covariance matrix has almost zero Eigenvalue (ie "
178  "Eigenvalue within a tolerance of %E around 0) at "
179  "dimension %d. Consider reducing its dimension.",
180  m_eigenvalue_zero_tolerance, i+max_dim_allowed-num_dim+1)
181 
182  transformMatrix.col(i) = MatrixXd::Zero(num_features,1);
183  continue;
184  }
185 
186  transformMatrix.col(i) /=
187  CMath::sqrt(eigenValues[i+max_dim_allowed-num_dim]*(num_vectors-1));
188  }
189  }
190  }
191 
192  else
193  {
194  // compute SVD of data matrix
195  JacobiSVD<MatrixXd> svd(fmatrix.transpose(), ComputeThinU | ComputeThinV);
196 
197  // compute non-negative eigen values from singular values
198  eigenValues = svd.singularValues();
199  eigenValues = eigenValues.cwiseProduct(eigenValues)/(num_vectors-1);
200 
201  // target dimension
202  switch (m_mode)
203  {
204  case FIXED_NUMBER :
206  break;
207 
208  case VARIANCE_EXPLAINED :
209  {
210  float64_t eig_sum = eigenValues.sum();
211  float64_t com_sum = 0;
212  for (int32_t i=0; i<num_features; i++)
213  {
214  num_dim++;
215  com_sum += m_eigenvalues_vector.vector[i];
216  if (com_sum/eig_sum>=m_thresh)
217  break;
218  }
219  }
220  break;
221 
222  case THRESHOLD :
223  for (int32_t i=0; i<num_features; i++)
224  {
226  num_dim++;
227  else
228  break;
229  }
230  break;
231  };
232  SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim)
233 
234  // right singular vectors form eigenvectors
236  Map<MatrixXd> transformMatrix(m_transformation_matrix.matrix, num_features, num_dim);
237  num_old_dim = num_features;
238  transformMatrix = svd.matrixV().block(0, 0, num_features, num_dim);
239  if (m_whitening)
240  {
241  for (int32_t i=0; i<num_dim; i++)
242  {
243  if (CMath::fequals_abs<float64_t>(0.0, eigenValues[i],
245  {
246  SG_WARNING("Covariance matrix has almost zero Eigenvalue (ie "
247  "Eigenvalue within a tolerance of %E around 0) at "
248  "dimension %d. Consider reducing its dimension.",
250 
251  transformMatrix.col(i) = MatrixXd::Zero(num_features,1);
252  continue;
253  }
254 
255  transformMatrix.col(i) /= CMath::sqrt(eigenValues[i]*(num_vectors-1));
256  }
257  }
258  }
259 
260  // restore feature matrix
261  fmatrix = fmatrix.colwise()+data_mean;
262  m_initialized = true;
263  return true;
264  }
265 
266  return false;
267 }
268 
270 {
274  m_initialized = false;
275 }
276 
278 {
280  ASSERT(features != NULL)
281  SGMatrix<float64_t> m = ((CDenseFeatures<float64_t>*) features)->get_feature_matrix();
282  int32_t num_vectors = m.num_cols;
283  int32_t num_features = m.num_rows;
284 
285  SG_INFO("Transforming feature matrix\n")
288 
289  if (m_mem_mode == MEM_IN_PLACE)
290  {
291  if (m.matrix)
292  {
293  SG_INFO("Preprocessing feature matrix\n")
294  Map<MatrixXd> feature_matrix(m.matrix, num_features, num_vectors);
295  VectorXd data_mean = feature_matrix.rowwise().sum()/(float64_t) num_vectors;
296  feature_matrix = feature_matrix.colwise()-data_mean;
297 
298  feature_matrix.block(0,0,num_dim,num_vectors) =
299  transform_matrix.transpose()*feature_matrix;
300 
301  SG_INFO("Form matrix of target dimension")
302  for (int32_t col=0; col<num_vectors; col++)
303  {
304  for (int32_t row=0; row<num_dim; row++)
305  m.matrix[col*num_dim+row] = feature_matrix(row,col);
306  }
307  m.num_rows = num_dim;
308  m.num_cols = num_vectors;
309  }
310 
311  ((CDenseFeatures<float64_t>*) features)->set_feature_matrix(m);
312  return m;
313  }
314  else
315  {
316  SGMatrix<float64_t> ret(num_dim, num_vectors);
317  Map<MatrixXd> ret_matrix(ret.matrix, num_dim, num_vectors);
318  if (m.matrix)
319  {
320  SG_INFO("Preprocessing feature matrix\n")
321  Map<MatrixXd> feature_matrix(m.matrix, num_features, num_vectors);
322  VectorXd data_mean = feature_matrix.rowwise().sum()/(float64_t) num_vectors;
323  feature_matrix = feature_matrix.colwise()-data_mean;
324 
325  ret_matrix = transform_matrix.transpose()*feature_matrix;
326  }
327  ((CDenseFeatures<float64_t>*) features)->set_feature_matrix(ret);
328  return ret;
329  }
330 }
331 
333 {
335  Map<VectorXd> resultVec(result.vector, num_dim);
336  Map<VectorXd> inputVec(vector.vector, vector.vlen);
337 
341 
342  inputVec = inputVec-mean;
343  resultVec = transformMat.transpose()*inputVec;
344  inputVec = inputVec+mean;
345 
346  return result;
347 }
348 
350 {
352 }
353 
355 {
356  return m_eigenvalues_vector;
357 }
358 
360 {
361  return m_mean_vector;
362 }
363 
365 {
366  return m_mem_mode;
367 }
368 
370 {
371  m_mem_mode = e;
372 }
373 
374 void CPCA::set_eigenvalue_zero_tolerance(float64_t eigenvalue_zero_tolerance)
375 {
376  m_eigenvalue_zero_tolerance = eigenvalue_zero_tolerance;
377 }
378 
380 {
382 }
383 
void set_memory_mode(EPCAMemoryMode e)
Definition: PCA.cpp:369
the class DimensionReductionPreprocessor, a base class for preprocessors used to lower the dimensiona...
EPCAMemoryMode m_mem_mode
Definition: PCA.h:220
#define SG_INFO(...)
Definition: SGIO.h:118
SGVector< float64_t > m_mean_vector
Definition: PCA.h:208
int32_t num_old_dim
Definition: PCA.h:206
EPCAMode m_mode
Definition: PCA.h:216
void set_eigenvalue_zero_tolerance(float64_t eigenvalue_zero_tolerance=1e-15)
Definition: PCA.cpp:374
EPCAMemoryMode
Definition: PCA.h:52
virtual void cleanup()
Definition: PCA.cpp:269
Definition: SGMatrix.h:20
#define REQUIRE(x,...)
Definition: SGIO.h:206
SGVector< float64_t > m_eigenvalues_vector
Definition: PCA.h:210
index_t num_cols
Definition: SGMatrix.h:376
float64_t m_thresh
Definition: PCA.h:218
float64_t get_eigenvalue_zero_tolerance() const
Definition: PCA.cpp:379
index_t num_rows
Definition: SGMatrix.h:374
virtual SGVector< float64_t > apply_to_feature_vector(SGVector< float64_t > vector)
Definition: PCA.cpp:332
CPCA(bool do_whitening=false, EPCAMode mode=FIXED_NUMBER, float64_t thresh=1e-6, EPCAMethod method=AUTO, EPCAMemoryMode mem_mode=MEM_REALLOCATE)
Definition: PCA.cpp:25
void init()
Definition: PCA.cpp:45
SGMatrix< float64_t > m_transformation_matrix
Definition: PCA.h:202
index_t vlen
Definition: SGVector.h:494
#define ASSERT(x)
Definition: SGIO.h:201
SGVector< float64_t > get_mean()
Definition: PCA.cpp:359
int32_t num_dim
Definition: PCA.h:204
bool m_initialized
Definition: PCA.h:212
double float64_t
Definition: common.h:50
bool m_whitening
Definition: PCA.h:214
EPCAMethod
Definition: PCA.h:26
EPCAMethod m_method
Definition: PCA.h:222
SGVector< float64_t > get_eigenvalues()
Definition: PCA.cpp:354
virtual EFeatureClass get_feature_class() const =0
virtual ~CPCA()
Definition: PCA.cpp:79
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
int machine_int_t
Definition: common.h:59
The class Features is the base class of all feature objects.
Definition: Features.h:68
static T min(T a, T b)
Definition: Math.h:157
EPCAMemoryMode get_memory_mode() const
Definition: PCA.cpp:364
EPCAMode
Definition: PCA.h:41
#define SG_WARNING(...)
Definition: SGIO.h:128
#define SG_ADD(...)
Definition: SGObject.h:84
static float32_t sqrt(float32_t x)
Definition: Math.h:459
virtual SGMatrix< float64_t > apply_to_feature_matrix(CFeatures *features)
Definition: PCA.cpp:277
float64_t m_eigenvalue_zero_tolerance
Definition: PCA.h:227
virtual EFeatureType get_feature_type() const =0
SGMatrix< float64_t > get_transformation_matrix()
Definition: PCA.cpp:349

SHOGUN Machine Learning Toolbox - Documentation