00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/preprocessor/PCA.h>
00013 #ifdef HAVE_LAPACK
00014 #include <shogun/mathematics/lapack.h>
00015 #include <shogun/lib/config.h>
00016 #include <shogun/mathematics/Math.h>
00017 #include <string.h>
00018 #include <stdlib.h>
00019 #include <shogun/lib/common.h>
00020 #include <shogun/preprocessor/DensePreprocessor.h>
00021 #include <shogun/features/Features.h>
00022 #include <shogun/io/SGIO.h>
00023
00024 using namespace shogun;
00025
00026 CPCA::CPCA(bool do_whitening_, EPCAMode mode_, float64_t thresh_)
00027 : CDimensionReductionPreprocessor(), num_dim(0), m_initialized(false),
00028 m_whitening(do_whitening_), m_mode(mode_), thresh(thresh_)
00029 {
00030 init();
00031 }
00032
void CPCA::init()
{
	// Reset model state and register all parameters for
	// serialization / model selection.
	m_transformation_matrix = SGMatrix<float64_t>();
	m_mean_vector = SGVector<float64_t>();
	m_eigenvalues_vector = SGVector<float64_t>();

	SG_ADD(&m_transformation_matrix, "transformation matrix",
		"Transformation matrix (Eigenvectors of covariance matrix).",
		MS_NOT_AVAILABLE);
	SG_ADD(&m_mean_vector, "mean vector", "Mean Vector.", MS_NOT_AVAILABLE);
	SG_ADD(&m_eigenvalues_vector, "eigenvalues vector",
		"Vector with Eigenvalues.", MS_NOT_AVAILABLE);
	// NOTE(review): "initalized" is misspelled, but it is a persisted
	// parameter name -- renaming it would break deserialization of
	// previously saved models, so it is left as-is.
	SG_ADD(&m_initialized, "initalized", "True when initialized.",
		MS_NOT_AVAILABLE);
	SG_ADD(&m_whitening, "whitening", "Whether data shall be whitened.",
		MS_AVAILABLE);
	// m_mode is an enum; registered through machine_int_t for serialization
	SG_ADD((machine_int_t*) &m_mode, "mode", "PCA Mode.", MS_AVAILABLE);
	SG_ADD(&thresh, "thresh", "Cutoff threshold.", MS_AVAILABLE);
}
00052
CPCA::~CPCA()
{
	// Nothing to free explicitly: the SGMatrix/SGVector members
	// release their storage in their own destructors.
}
00056
00057 bool CPCA::init(CFeatures* features)
00058 {
00059 if (!m_initialized)
00060 {
00061
00062 int32_t i,j,k;
00063
00064 ASSERT(features->get_feature_class()==C_DENSE);
00065 ASSERT(features->get_feature_type()==F_DREAL);
00066
00067 int32_t num_vectors=((CDenseFeatures<float64_t>*)features)->get_num_vectors();
00068 int32_t num_features=((CDenseFeatures<float64_t>*)features)->get_num_features();
00069 SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features);
00070
00071 m_mean_vector.vlen = num_features;
00072 m_mean_vector.vector = SG_CALLOC(float64_t, num_features);
00073
00074
00075 SGMatrix<float64_t> feature_matrix = ((CDenseFeatures<float64_t>*)features)->get_feature_matrix();
00076 for (i=0; i<num_vectors; i++)
00077 {
00078 for (j=0; j<num_features; j++)
00079 m_mean_vector.vector[j] += feature_matrix.matrix[i*num_features+j];
00080 }
00081
00082
00083 for (i=0; i<num_features; i++)
00084 m_mean_vector.vector[i] /= num_vectors;
00085
00086 float64_t* cov = SG_CALLOC(float64_t, num_features*num_features);
00087
00088 float64_t* sub_mean = SG_MALLOC(float64_t, num_features);
00089
00090 for (i=0; i<num_vectors; i++)
00091 {
00092 for (k=0; k<num_features; k++)
00093 sub_mean[k]=feature_matrix.matrix[i*num_features+k]-m_mean_vector.vector[k];
00094
00095 cblas_dger(CblasColMajor,
00096 num_features,num_features,
00097 1.0,sub_mean,1,
00098 sub_mean,1,
00099 cov, num_features);
00100 }
00101
00102 SG_FREE(sub_mean);
00103
00104 for (i=0; i<num_features; i++)
00105 {
00106 for (j=0; j<num_features; j++)
00107 cov[i*num_features+j]/=(num_vectors-1);
00108 }
00109
00110 SG_INFO("Computing Eigenvalues ... ") ;
00111
00112 m_eigenvalues_vector.vector = SGMatrix<float64_t>::compute_eigenvectors(cov,num_features,num_features);
00113 m_eigenvalues_vector.vlen = num_features;
00114 num_dim=0;
00115
00116 if (m_mode == FIXED_NUMBER)
00117 {
00118 ASSERT(m_target_dim <= num_features);
00119 num_dim = m_target_dim;
00120 }
00121 if (m_mode == VARIANCE_EXPLAINED)
00122 {
00123 float64_t eig_sum = 0;
00124 for (i=0; i<num_features; i++)
00125 eig_sum += m_eigenvalues_vector.vector[i];
00126
00127 float64_t com_sum = 0;
00128 for (i=num_features-1; i>-1; i--)
00129 {
00130 num_dim++;
00131 com_sum += m_eigenvalues_vector.vector[i];
00132 if (com_sum/eig_sum>=thresh)
00133 break;
00134 }
00135 }
00136 if (m_mode == THRESHOLD)
00137 {
00138 for (i=num_features-1; i>-1; i--)
00139 {
00140 if (m_eigenvalues_vector.vector[i]>thresh)
00141 num_dim++;
00142 else
00143 break;
00144 }
00145 }
00146
00147 SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim) ;
00148
00149 m_transformation_matrix = SGMatrix<float64_t>(num_features,num_dim);
00150 num_old_dim = num_features;
00151
00152 int32_t offs=0;
00153 for (i=num_features-num_dim; i<num_features; i++)
00154 {
00155 for (k=0; k<num_features; k++)
00156 if (m_whitening)
00157 m_transformation_matrix.matrix[offs+k*num_dim] =
00158 cov[num_features*i+k]/sqrt(m_eigenvalues_vector.vector[i]);
00159 else
00160 m_transformation_matrix.matrix[offs+k*num_dim] =
00161 cov[num_features*i+k];
00162 offs++;
00163 }
00164
00165 SG_FREE(cov);
00166 m_initialized = true;
00167 return true;
00168 }
00169
00170 return false;
00171 }
00172
00173 void CPCA::cleanup()
00174 {
00175 m_transformation_matrix=SGMatrix<float64_t>();
00176 }
00177
SGMatrix<float64_t> CPCA::apply_to_feature_matrix(CFeatures* features)
{
	// Project every vector in-place onto the principal subspace:
	// each column x (length num_features) becomes T * (x - mean)
	// (length num_dim), written back into the same buffer.
	ASSERT(m_initialized);
	SGMatrix<float64_t> m = ((CDenseFeatures<float64_t>*) features)->get_feature_matrix();
	int32_t num_vectors = m.num_cols;
	int32_t num_features = m.num_rows;
	SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features);

	if (m.matrix)
	{
		SG_INFO("Preprocessing feature matrix\n");
		float64_t* res = SG_MALLOC(float64_t, num_dim);
		float64_t* sub_mean = SG_MALLOC(float64_t, num_features);

		for (int32_t vec=0; vec<num_vectors; vec++)
		{
			int32_t i;

			// center the current vector against the training mean
			for (i=0; i<num_features; i++)
				sub_mean[i] = m.matrix[num_features*vec+i] - m_mean_vector.vector[i];

			// res = T * sub_mean; T is stored num_dim x num_features,
			// column-major (lda = num_dim)
			cblas_dgemv(CblasColMajor,CblasNoTrans,
				num_dim,num_features,
				1.0,m_transformation_matrix.matrix,num_dim,
				sub_mean,1,
				0.0,res,1);

			// in-place compaction: the write offset num_dim*vec never
			// passes the read offset num_features*vec (num_dim <=
			// num_features, vectors processed in ascending order), and
			// the current vector was already copied into sub_mean
			float64_t* m_transformed = &m.matrix[num_dim*vec];

			for (i=0; i<num_dim; i++)
				m_transformed[i] = res[i];
		}
		SG_FREE(res);
		SG_FREE(sub_mean);

		((CDenseFeatures<float64_t>*) features)->set_num_features(num_dim);
		// NOTE(review): looks like this getter takes num_features /
		// num_vectors by reference and refreshes them to the
		// post-reduction sizes for the log line below (the returned
		// matrix is discarded) -- confirm against CDenseFeatures API
		((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
		SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features);
	}

	// returns the original (now shrunken-in-place) buffer with its old
	// dimensions; callers use the features object for the new shape
	return m;
}
00220
00221 SGVector<float64_t> CPCA::apply_to_feature_vector(SGVector<float64_t> vector)
00222 {
00223 float64_t* result = SG_MALLOC(float64_t, num_dim);
00224 float64_t* sub_mean = SG_MALLOC(float64_t, vector.vlen);
00225
00226 for (int32_t i=0; i<vector.vlen; i++)
00227 sub_mean[i]=vector.vector[i]-m_mean_vector.vector[i];
00228
00229 cblas_dgemv(CblasColMajor,CblasNoTrans,
00230 num_dim,vector.vlen,
00231 1.0,m_transformation_matrix.matrix,m_transformation_matrix.num_cols,
00232 sub_mean,1,
00233 0.0,result,1);
00234
00235 SG_FREE(sub_mean);
00236 return SGVector<float64_t>(result,num_dim);
00237 }
00238
SGMatrix<float64_t> CPCA::get_transformation_matrix()
{
	// Learned projection matrix (eigenvectors of the covariance matrix,
	// whitened if m_whitening was set); empty until init() has run.
	return m_transformation_matrix;
}
00243
SGVector<float64_t> CPCA::get_eigenvalues()
{
	// Eigenvalues of the covariance matrix in ascending order;
	// empty until init() has run.
	return m_eigenvalues_vector;
}
00248
SGVector<float64_t> CPCA::get_mean()
{
	// Feature-wise mean of the training data, subtracted before
	// projection; empty until init() has run.
	return m_mean_vector;
}
00253
00254 #endif