00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/preprocessor/PCA.h>
00013 #ifdef HAVE_LAPACK
00014 #include <shogun/mathematics/lapack.h>
00015 #include <shogun/lib/config.h>
00016 #include <shogun/mathematics/Math.h>
00017 #include <string.h>
00018 #include <stdlib.h>
00019 #include <shogun/lib/common.h>
00020 #include <shogun/preprocessor/SimplePreprocessor.h>
00021 #include <shogun/features/Features.h>
00022 #include <shogun/features/SimpleFeatures.h>
00023 #include <shogun/io/SGIO.h>
00024
00025 using namespace shogun;
00026
00027 CPCA::CPCA(bool do_whitening_, EPCAMode mode_, float64_t thresh_)
00028 : CDimensionReductionPreprocessor(), num_dim(0), m_initialized(false),
00029 m_whitening(do_whitening_), m_mode(mode_), thresh(thresh_)
00030 {
00031 init();
00032 }
00033
00034 void CPCA::init()
00035 {
00036 m_transformation_matrix = SGMatrix<float64_t>(NULL,0,0,true);
00037 m_mean_vector = SGVector<float64_t>(NULL,0,true);
00038 m_eigenvalues_vector = SGVector<float64_t>(NULL,0,true);
00039
00040
00041 m_parameters->add(&m_transformation_matrix,
00042 "transformation matrix", "Transformation matrix (Eigenvectors of covariance matrix).");
00043 m_parameters->add(&m_mean_vector,
00044 "mean vector", "Mean Vector.");
00045 m_parameters->add(&m_eigenvalues_vector,
00046 "eigenvalues vector", "Vector with Eigenvalues.");
00047 m_parameters->add(&m_initialized,
00048 "initalized", "True when initialized.");
00049 m_parameters->add(&m_whitening,
00050 "whitening", "Whether data shall be whitened.");
00051 m_parameters->add((machine_int_t*) &m_mode, "mode",
00052 "PCA Mode.");
00053 m_parameters->add(&thresh,
00054 "thresh", "Cutoff threshold.");
00055 }
00056
00057 CPCA::~CPCA()
00058 {
00059 m_transformation_matrix.destroy_matrix();
00060 m_mean_vector.destroy_vector();
00061 m_eigenvalues_vector.destroy_vector();
00062 }
00063
00064 bool CPCA::init(CFeatures* features)
00065 {
00066 if (!m_initialized)
00067 {
00068
00069 int32_t i,j,k;
00070
00071 ASSERT(features->get_feature_class()==C_SIMPLE);
00072 ASSERT(features->get_feature_type()==F_DREAL);
00073
00074 int32_t num_vectors=((CSimpleFeatures<float64_t>*)features)->get_num_vectors();
00075 int32_t num_features=((CSimpleFeatures<float64_t>*)features)->get_num_features();
00076 SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features);
00077
00078 m_mean_vector.vlen = num_features;
00079 m_mean_vector.vector = SG_CALLOC(float64_t, num_features);
00080
00081
00082 SGMatrix<float64_t> feature_matrix = ((CSimpleFeatures<float64_t>*)features)->get_feature_matrix();
00083 for (i=0; i<num_vectors; i++)
00084 {
00085 for (j=0; j<num_features; j++)
00086 m_mean_vector.vector[j] += feature_matrix.matrix[i*num_features+j];
00087 }
00088
00089
00090 for (i=0; i<num_features; i++)
00091 m_mean_vector.vector[i] /= num_vectors;
00092
00093 float64_t* cov = SG_CALLOC(float64_t, num_features*num_features);
00094
00095 float64_t* sub_mean = SG_MALLOC(float64_t, num_features);
00096
00097 for (i=0; i<num_vectors; i++)
00098 {
00099 for (k=0; k<num_features; k++)
00100 sub_mean[k]=feature_matrix.matrix[i*num_features+k]-m_mean_vector.vector[k];
00101
00102 cblas_dger(CblasColMajor,
00103 num_features,num_features,
00104 1.0,sub_mean,1,
00105 sub_mean,1,
00106 cov, num_features);
00107 }
00108
00109 SG_FREE(sub_mean);
00110
00111 for (i=0; i<num_features; i++)
00112 {
00113 for (j=0; j<num_features; j++)
00114 cov[i*num_features+j]/=(num_vectors-1);
00115 }
00116
00117 SG_INFO("Computing Eigenvalues ... ") ;
00118
00119 m_eigenvalues_vector.vector = CMath::compute_eigenvectors(cov,num_features,num_features);
00120 m_eigenvalues_vector.vlen = num_features;
00121 num_dim=0;
00122
00123 if (m_mode == FIXED_NUMBER)
00124 {
00125 ASSERT(m_target_dim <= num_features);
00126 num_dim = m_target_dim;
00127 }
00128 if (m_mode == VARIANCE_EXPLAINED)
00129 {
00130 float64_t eig_sum = 0;
00131 for (i=0; i<num_features; i++)
00132 eig_sum += m_eigenvalues_vector.vector[i];
00133
00134 float64_t com_sum = 0;
00135 for (i=num_features-1; i>-1; i--)
00136 {
00137 num_dim++;
00138 com_sum += m_eigenvalues_vector.vector[i];
00139 if (com_sum/eig_sum>=thresh)
00140 break;
00141 }
00142 }
00143 if (m_mode == THRESHOLD)
00144 {
00145 for (i=num_features-1; i>-1; i--)
00146 {
00147 if (m_eigenvalues_vector.vector[i]>thresh)
00148 num_dim++;
00149 else
00150 break;
00151 }
00152 }
00153
00154 SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim) ;
00155
00156 m_transformation_matrix = SGMatrix<float64_t>(num_features,num_dim);
00157 num_old_dim = num_features;
00158
00159 int32_t offs=0;
00160 for (i=num_features-num_dim; i<num_features; i++)
00161 {
00162 for (k=0; k<num_features; k++)
00163 if (m_whitening)
00164 m_transformation_matrix.matrix[offs+k*num_dim] =
00165 cov[num_features*i+k]/sqrt(m_eigenvalues_vector.vector[i]);
00166 else
00167 m_transformation_matrix.matrix[offs+k*num_dim] =
00168 cov[num_features*i+k];
00169 offs++;
00170 }
00171
00172 SG_FREE(cov);
00173 m_initialized = true;
00174 return true;
00175 }
00176
00177 return false;
00178 }
00179
00180 void CPCA::cleanup()
00181 {
00182 m_transformation_matrix.destroy_matrix();
00183 m_mean_vector.destroy_vector();
00184 m_eigenvalues_vector.destroy_vector();
00185 }
00186
00187 SGMatrix<float64_t> CPCA::apply_to_feature_matrix(CFeatures* features)
00188 {
00189 ASSERT(m_initialized);
00190 SGMatrix<float64_t> m = ((CSimpleFeatures<float64_t>*) features)->get_feature_matrix();
00191 int32_t num_vectors = m.num_cols;
00192 int32_t num_features = m.num_rows;
00193 SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features);
00194
00195 if (m.matrix)
00196 {
00197 SG_INFO("Preprocessing feature matrix\n");
00198 float64_t* res = SG_MALLOC(float64_t, num_dim);
00199 float64_t* sub_mean = SG_MALLOC(float64_t, num_features);
00200
00201 for (int32_t vec=0; vec<num_vectors; vec++)
00202 {
00203 int32_t i;
00204
00205 for (i=0; i<num_features; i++)
00206 sub_mean[i] = m.matrix[num_features*vec+i] - m_mean_vector.vector[i];
00207
00208 cblas_dgemv(CblasColMajor,CblasNoTrans,
00209 num_dim,num_features,
00210 1.0,m_transformation_matrix.matrix,num_dim,
00211 sub_mean,1,
00212 0.0,res,1);
00213
00214 float64_t* m_transformed = &m.matrix[num_dim*vec];
00215
00216 for (i=0; i<num_dim; i++)
00217 m_transformed[i] = res[i];
00218 }
00219 SG_FREE(res);
00220 SG_FREE(sub_mean);
00221
00222 ((CSimpleFeatures<float64_t>*) features)->set_num_features(num_dim);
00223 ((CSimpleFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
00224 SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features);
00225 }
00226
00227 return m;
00228 }
00229
00230 SGVector<float64_t> CPCA::apply_to_feature_vector(SGVector<float64_t> vector)
00231 {
00232 float64_t* result = SG_MALLOC(float64_t, num_dim);
00233 float64_t* sub_mean = SG_MALLOC(float64_t, vector.vlen);
00234
00235 for (int32_t i=0; i<vector.vlen; i++)
00236 sub_mean[i]=vector.vector[i]-m_mean_vector.vector[i];
00237
00238 cblas_dgemv(CblasColMajor,CblasNoTrans,
00239 num_dim,vector.vlen,
00240 1.0,m_transformation_matrix.matrix,m_transformation_matrix.num_cols,
00241 sub_mean,1,
00242 0.0,result,1);
00243
00244 SG_FREE(sub_mean);
00245 return SGVector<float64_t>(result,num_dim);
00246 }
00247
00248 SGMatrix<float64_t> CPCA::get_transformation_matrix()
00249 {
00250 return m_transformation_matrix;
00251 }
00252
00253 SGVector<float64_t> CPCA::get_eigenvalues()
00254 {
00255 return m_eigenvalues_vector;
00256 }
00257
00258 SGVector<float64_t> CPCA::get_mean()
00259 {
00260 return m_mean_vector;
00261 }
00262
00263 #endif