SHOGUN v2.0.0
PCA.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 1999-2008 Gunnar Raetsch
 * Written (W) 1999-2008,2011 Soeren Sonnenburg
 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
 * Copyright (C) 2011 Berlin Institute of Technology
 */

#ifdef HAVE_LAPACK

#include <shogun/lib/config.h>
#include <shogun/preprocessor/PCA.h>
#include <string.h>
#include <stdlib.h>
#include <shogun/lib/common.h>
#include <shogun/mathematics/Math.h>
#include <shogun/mathematics/lapack.h>
#include <shogun/io/SGIO.h>

using namespace shogun;

CPCA::CPCA(bool do_whitening_, EPCAMode mode_, float64_t thresh_)
: CDimensionReductionPreprocessor(), num_dim(0), m_initialized(false),
  m_whitening(do_whitening_), m_mode(mode_), thresh(thresh_)
{
    init();
}

void CPCA::init()
{
    m_transformation_matrix = SGMatrix<float64_t>();
    m_mean_vector = SGVector<float64_t>();
    m_eigenvalues_vector = SGVector<float64_t>();

    // register members with the parameter framework
    // (used for serialization and model selection)
    SG_ADD(&m_transformation_matrix, "transformation matrix",
        "Transformation matrix (Eigenvectors of covariance matrix).",
        MS_NOT_AVAILABLE);
    SG_ADD(&m_mean_vector, "mean vector", "Mean Vector.", MS_NOT_AVAILABLE);
    SG_ADD(&m_eigenvalues_vector, "eigenvalues vector",
        "Vector with Eigenvalues.", MS_NOT_AVAILABLE);
    SG_ADD(&m_initialized, "initalized", "True when initialized.",
        MS_NOT_AVAILABLE);
    SG_ADD(&m_whitening, "whitening", "Whether data shall be whitened.",
        MS_AVAILABLE);
    SG_ADD((machine_int_t*) &m_mode, "mode", "PCA Mode.", MS_AVAILABLE);
    SG_ADD(&thresh, "thresh", "Cutoff threshold.", MS_AVAILABLE);
}

CPCA::~CPCA()
{
}

bool CPCA::init(CFeatures* features)
{
    if (!m_initialized)
    {
        // loop variables
        int32_t i,j,k;

        ASSERT(features->get_feature_class()==C_DENSE);
        ASSERT(features->get_feature_type()==F_DREAL);

        int32_t num_vectors=((CDenseFeatures<float64_t>*)features)->get_num_vectors();
        int32_t num_features=((CDenseFeatures<float64_t>*)features)->get_num_features();
        SG_INFO("num_examples: %d num_features: %d\n", num_vectors, num_features);

        m_mean_vector.vlen = num_features;
        m_mean_vector.vector = SG_CALLOC(float64_t, num_features);

        // sum all feature vectors ...
        SGMatrix<float64_t> feature_matrix = ((CDenseFeatures<float64_t>*)features)->get_feature_matrix();
        for (i=0; i<num_vectors; i++)
        {
            for (j=0; j<num_features; j++)
                m_mean_vector.vector[j] += feature_matrix.matrix[i*num_features+j];
        }

        // ... and divide by the number of vectors to obtain the mean
        for (i=0; i<num_features; i++)
            m_mean_vector.vector[i] /= num_vectors;

        float64_t* cov = SG_CALLOC(float64_t, num_features*num_features);

        float64_t* sub_mean = SG_MALLOC(float64_t, num_features);

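        // accumulate the (unnormalized) covariance: for each centered
        // vector x, cblas_dger performs the rank-1 update cov += x * x^T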
        for (i=0; i<num_vectors; i++)
        {
            for (k=0; k<num_features; k++)
                sub_mean[k]=feature_matrix.matrix[i*num_features+k]-m_mean_vector.vector[k];

            cblas_dger(CblasColMajor,
                       num_features,num_features,
                       1.0,sub_mean,1,
                       sub_mean,1,
                       cov,num_features);
        }

        SG_FREE(sub_mean);

        // normalize by (num_vectors-1) to get the sample covariance
        for (i=0; i<num_features; i++)
        {
            for (j=0; j<num_features; j++)
                cov[i*num_features+j]/=(num_vectors-1);
        }

        SG_INFO("Computing Eigenvalues ... ");

        // eigen-decomposition: cov is overwritten with the eigenvectors,
        // the eigenvalues are returned in ascending order
        m_eigenvalues_vector.vector = SGMatrix<float64_t>::compute_eigenvectors(cov, num_features, num_features);
        m_eigenvalues_vector.vlen = num_features;
        num_dim=0;

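        // choose the output dimensionality according to the PCA mode; since
        // the eigenvalues are sorted ascending, the loops below walk from
        // the end of the vector (largest eigenvalue) backwards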
        if (m_mode == FIXED_NUMBER)
        {
            ASSERT(m_target_dim <= num_features);
            num_dim = m_target_dim;
        }
        if (m_mode == VARIANCE_EXPLAINED)
        {
            float64_t eig_sum = 0;
            for (i=0; i<num_features; i++)
                eig_sum += m_eigenvalues_vector.vector[i];

            float64_t com_sum = 0;
            for (i=num_features-1; i>-1; i--)
            {
                num_dim++;
                com_sum += m_eigenvalues_vector.vector[i];
                if (com_sum/eig_sum>=thresh)
                    break;
            }
        }
        if (m_mode == THRESHOLD)
        {
            for (i=num_features-1; i>-1; i--)
            {
                if (m_eigenvalues_vector.vector[i]>thresh)
                    num_dim++;
                else
                    break;
            }
        }

        SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim);

        m_transformation_matrix = SGMatrix<float64_t>(num_dim, num_features);
        num_old_dim = num_features;

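        // copy the num_dim leading eigenvectors into the rows of the
        // transformation matrix; with whitening enabled, each eigenvector is
        // scaled by 1/sqrt(eigenvalue) so that every retained dimension of
        // the projected data has unit variance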
        int32_t offs=0;
        for (i=num_features-num_dim; i<num_features; i++)
        {
            for (k=0; k<num_features; k++)
            {
                if (m_whitening)
                    m_transformation_matrix.matrix[offs+k*num_dim] =
                        cov[num_features*i+k]/sqrt(m_eigenvalues_vector.vector[i]);
                else
                    m_transformation_matrix.matrix[offs+k*num_dim] =
                        cov[num_features*i+k];
            }
            offs++;
        }

        SG_FREE(cov);
        m_initialized = true;
        return true;
    }

    return false;
}

void CPCA::cleanup()
{
    m_transformation_matrix = SGMatrix<float64_t>();
}

SGMatrix<float64_t> CPCA::apply_to_feature_matrix(CFeatures* features)
{
    ASSERT(features);
    SGMatrix<float64_t> m = ((CDenseFeatures<float64_t>*) features)->get_feature_matrix();
    int32_t num_vectors = m.num_cols;
    int32_t num_features = m.num_rows;
    SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features);

    if (m.matrix)
    {
        SG_INFO("Preprocessing feature matrix\n");
        float64_t* res = SG_MALLOC(float64_t, num_dim);
        float64_t* sub_mean = SG_MALLOC(float64_t, num_features);

        for (int32_t vec=0; vec<num_vectors; vec++)
        {
            int32_t i;

            // center the current vector around the stored mean
            for (i=0; i<num_features; i++)
                sub_mean[i] = m.matrix[num_features*vec+i] - m_mean_vector.vector[i];

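            // project: res = transformation_matrix * (x - mean), a
            // (num_dim x num_features) times (num_features x 1) product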
            cblas_dgemv(CblasColMajor,CblasNoTrans,
                        num_dim,num_features,
                        1.0,m_transformation_matrix.matrix,num_dim,
                        sub_mean,1,
                        0.0,res,1);

            // write the reduced vector back into the feature matrix in place
            float64_t* m_transformed = &m.matrix[num_dim*vec];

            for (i=0; i<num_dim; i++)
                m_transformed[i] = res[i];
        }
        SG_FREE(res);
        SG_FREE(sub_mean);

        // shrink the feature count; the existing matrix memory is reused
        ((CDenseFeatures<float64_t>*) features)->set_num_features(num_dim);
        ((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
        SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features);
    }

    return m;
}

SGVector<float64_t> CPCA::apply_to_feature_vector(SGVector<float64_t> vector)
{
    float64_t* result = SG_MALLOC(float64_t, num_dim);
    float64_t* sub_mean = SG_MALLOC(float64_t, vector.vlen);

    for (int32_t i=0; i<vector.vlen; i++)
        sub_mean[i]=vector.vector[i]-m_mean_vector.vector[i];

    cblas_dgemv(CblasColMajor,CblasNoTrans,
                num_dim,vector.vlen,
                1.0,m_transformation_matrix.matrix,num_dim,
                sub_mean,1,
                0.0,result,1);

    SG_FREE(sub_mean);
    return SGVector<float64_t>(result,num_dim);
}

SGMatrix<float64_t> CPCA::get_transformation_matrix()
{
    return m_transformation_matrix;
}

SGVector<float64_t> CPCA::get_eigenvalues()
{
    return m_eigenvalues_vector;
}

SGVector<float64_t> CPCA::get_mean()
{
    return m_mean_vector;
}

#endif /* HAVE_LAPACK */
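
For orientation, a minimal usage sketch follows. It is not part of PCA.cpp: the headers shogun/base/init.h and shogun/features/DenseFeatures.h, the init_shogun_with_defaults()/exit_shogun() calls, and the example data are assumptions about a Shogun 2.x build with LAPACK; only the CPCA calls mirror the API implemented above.

#include <shogun/base/init.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/preprocessor/PCA.h>

using namespace shogun;

int main()
{
    init_shogun_with_defaults();

    // hypothetical 3-feature x 5-vector data matrix (column-major,
    // one example per column); values are made up for illustration
    SGMatrix<float64_t> data(3, 5);
    for (int32_t i=0; i<15; i++)
        data.matrix[i] = i % 7;

    CDenseFeatures<float64_t>* feats = new CDenseFeatures<float64_t>(data);
    SG_REF(feats);

    // keep as many components as needed to explain 90% of the
    // variance, without whitening
    CPCA* pca = new CPCA(false, VARIANCE_EXPLAINED, 0.9);
    SG_REF(pca);
    pca->init(feats);
    pca->apply_to_feature_matrix(feats);

    SGVector<float64_t> eigenvalues = pca->get_eigenvalues();

    SG_UNREF(pca);
    SG_UNREF(feats);
    exit_shogun();
    return 0;
}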
