00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/config.h"
00013 #include "lib/Mathematics.h"
00014
00015 #include <string.h>
00016 #include <stdlib.h>
00017
00018 #ifdef HAVE_LAPACK
00019 #include "lib/lapack.h"
00020
00021 #include "lib/common.h"
00022 #include "preproc/PCACut.h"
00023 #include "preproc/SimplePreProc.h"
00024 #include "features/Features.h"
00025 #include "features/SimpleFeatures.h"
00026 #include "lib/io.h"
00027
00028 using namespace shogun;
00029
00030 CPCACut::CPCACut(int32_t do_whitening_, float64_t thresh_)
00031 : CSimplePreProc<float64_t>("PCACut", "PCAC"), T(NULL), num_dim(0), mean(NULL),
00032 initialized(false), do_whitening(do_whitening_), thresh(thresh_)
00033 {
00034 }
00035
00036 CPCACut::~CPCACut()
00037 {
00038 delete[] T;
00039 delete[] mean;
00040 }
00041
00043 bool CPCACut::init(CFeatures* f)
00044 {
00045 if (!initialized)
00046 {
00047 ASSERT(f->get_feature_class()==C_SIMPLE);
00048 ASSERT(f->get_feature_type()==F_DREAL);
00049
00050 SG_INFO("calling CPCACut::init\n") ;
00051 int32_t num_vectors=((CSimpleFeatures<float64_t>*)f)->get_num_vectors() ;
00052 int32_t num_features=((CSimpleFeatures<float64_t>*)f)->get_num_features() ;
00053 SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features);
00054 delete[] mean ;
00055 mean=new float64_t[num_features+1] ;
00056
00057 int32_t i,j;
00058
00060
00061
00062 for (j=0; j<num_features; j++)
00063 mean[j]=0 ;
00064
00065
00066 for (i=0; i<num_vectors; i++)
00067 {
00068 int32_t len;
00069 bool free;
00070 float64_t* vec=((CSimpleFeatures<float64_t>*) f)->get_feature_vector(i, len, free);
00071 for (j=0; j<num_features; j++)
00072 mean[j]+= vec[j];
00073
00074 ((CSimpleFeatures<float64_t>*) f)->free_feature_vector(vec, i, free);
00075 }
00076
00077
00078 for (j=0; j<num_features; j++)
00079 mean[j]/=num_vectors;
00080
00081 SG_DONE();
00082 SG_DEBUG("Computing covariance matrix... of size %.2f M\n", num_features*num_features/1024.0/1024.0);
00083 float64_t *cov=new float64_t[num_features*num_features];
00084
00085 for (j=0; j<num_features*num_features; j++)
00086 cov[j]=0.0 ;
00087
00088 for (i=0; i<num_vectors; i++)
00089 {
00090 if (!(i % (num_vectors/10+1)))
00091 SG_PROGRESS(i, 0, num_vectors);
00092
00093 int32_t len;
00094 bool free;
00095
00096 float64_t* vec=((CSimpleFeatures<float64_t>*) f)->get_feature_vector(i, len, free) ;
00097
00098 for (int32_t jj=0; jj<num_features; jj++)
00099 vec[jj]-=mean[jj] ;
00100
00102 int nf = (int) num_features;
00103 double* vec_double = (double*) vec;
00104 cblas_dger(CblasColMajor, nf, nf, 1.0, vec_double, 1, vec_double,
00105 1, (double*) cov, nf);
00106
00107
00108
00109
00110
00111 ((CSimpleFeatures<float64_t>*) f)->free_feature_vector(vec, i, free) ;
00112 }
00113
00114 SG_DONE();
00115
00116 for (i=0; i<num_features; i++)
00117 for (j=0; j<num_features; j++)
00118 cov[i*num_features+j]/=num_vectors ;
00119
00120 SG_DONE();
00121
00122 SG_INFO("Computing Eigenvalues ... ") ;
00123 char V='V';
00124 char U='U';
00125 int32_t info;
00126 int32_t ord= num_features;
00127 int32_t lda= num_features;
00128 float64_t* eigenvalues=new float64_t[num_features] ;
00129
00130 for (i=0; i<num_features; i++)
00131 eigenvalues[i]=0;
00132
00133
00134 wrap_dsyev(V, U, (int) ord, (double*) cov, (int) lda,
00135 (double*) eigenvalues, (int*) &info);
00136
00137
00138 num_dim=0;
00139 for (i=0; i<num_features; i++)
00140 {
00141
00142 if (eigenvalues[i]>thresh)
00143 num_dim++ ;
00144 } ;
00145
00146 SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim) ;
00147
00148 delete[] T;
00149 T=new float64_t[num_dim*num_features];
00150 num_old_dim=num_features;
00151
00152 if (do_whitening)
00153 {
00154 int32_t offs=0 ;
00155 for (i=0; i<num_features; i++)
00156 {
00157 if (eigenvalues[i]>thresh)
00158 {
00159 for (int32_t jj=0; jj<num_features; jj++)
00160 T[offs+jj*num_dim]=cov[num_features*i+jj]/sqrt(eigenvalues[i]) ;
00161 offs++ ;
00162 } ;
00163 }
00164 } ;
00165
00166 delete[] eigenvalues;
00167 delete[] cov;
00168 initialized=true;
00169 SG_INFO("Done\n") ;
00170 return true ;
00171 }
00172 return
00173 false;
00174 }
00175
00177 void CPCACut::cleanup()
00178 {
00179 delete[] T ;
00180 T=NULL ;
00181 }
00182
00186 float64_t* CPCACut::apply_to_feature_matrix(CFeatures* f)
00187 {
00188 int32_t num_vectors=0;
00189 int32_t num_features=0;
00190
00191 float64_t* m=((CSimpleFeatures<float64_t>*) f)->get_feature_matrix(num_features, num_vectors);
00192 SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features) ;
00193
00194 if (m)
00195 {
00196 SG_INFO("Preprocessing feature matrix\n");
00197 float64_t* res= new float64_t[num_dim];
00198 float64_t* sub_mean= new float64_t[num_features];
00199
00200 for (int32_t vec=0; vec<num_vectors; vec++)
00201 {
00202 int32_t i;
00203
00204 for (i=0; i<num_features; i++)
00205 sub_mean[i]=m[num_features*vec+i]-mean[i] ;
00206
00207 int nd = (int) num_dim;
00208 cblas_dgemv(CblasColMajor, CblasNoTrans, nd, (int) num_features,
00209 1.0, T, nd, (double*) sub_mean, 1, 0, (double*) res, 1);
00210
00211 float64_t* m_transformed=&m[num_dim*vec];
00212 for (i=0; i<num_dim; i++)
00213 m_transformed[i]=m[i];
00214 }
00215 delete[] res;
00216 delete[] sub_mean;
00217
00218 ((CSimpleFeatures<float64_t>*) f)->set_num_features(num_dim);
00219 ((CSimpleFeatures<float64_t>*) f)->get_feature_matrix(num_features, num_vectors);
00220 SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features);
00221 }
00222
00223 return m;
00224 }
00225
00228 float64_t* CPCACut::apply_to_feature_vector(float64_t* f, int32_t &len)
00229 {
00230 float64_t *ret=new float64_t[num_dim];
00231 float64_t *sub_mean=new float64_t[len];
00232 for (int32_t i=0; i<len; i++)
00233 sub_mean[i]=f[i]-mean[i];
00234
00235 int nd = (int) num_dim;
00236 cblas_dgemv(CblasColMajor, CblasNoTrans, nd, (int) len, 1.0, (double*) T,
00237 nd, (double*) sub_mean, 1, 0, (double*) ret, 1);
00238
00239 delete[] sub_mean ;
00240 len=num_dim ;
00241
00242 return ret;
00243 }
00244 #endif