SHOGUN: PCACut.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 1999-2008 Soeren Sonnenburg
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/config.h"
00013 #include "lib/Mathematics.h"
00014 
00015 #include <string.h>
00016 #include <stdlib.h>
00017 
00018 #ifdef HAVE_LAPACK
00019 #include "lib/lapack.h"
00020 
00021 #include "lib/common.h"
00022 #include "preproc/PCACut.h"
00023 #include "preproc/SimplePreProc.h"
00024 #include "features/Features.h"
00025 #include "features/SimpleFeatures.h"
00026 #include "lib/io.h"
00027 
00028 using namespace shogun;
00029 
00030 CPCACut::CPCACut(int32_t do_whitening_, float64_t thresh_)
00031 : CSimplePreProc<float64_t>("PCACut", "PCAC"), T(NULL), num_dim(0), mean(NULL),
00032     initialized(false), do_whitening(do_whitening_), thresh(thresh_)
00033 {
00034 }
00035 
00036 CPCACut::~CPCACut()
00037 {
00038     delete[] T;
00039     delete[] mean;
00040 }
00041 
00043 bool CPCACut::init(CFeatures* f)
00044 {
00045     if (!initialized)
00046     {
00047         ASSERT(f->get_feature_class()==C_SIMPLE);
00048         ASSERT(f->get_feature_type()==F_DREAL);
00049 
00050         SG_INFO("calling CPCACut::init\n") ;
00051         int32_t num_vectors=((CSimpleFeatures<float64_t>*)f)->get_num_vectors() ;
00052         int32_t num_features=((CSimpleFeatures<float64_t>*)f)->get_num_features() ;
00053         SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features);
00054         delete[] mean ;
00055         mean=new float64_t[num_features+1] ;
00056 
00057         int32_t i,j;
00058 
00060 
00061         // clear
00062         for (j=0; j<num_features; j++)
00063             mean[j]=0 ; 
00064 
00065         // sum 
00066         for (i=0; i<num_vectors; i++)
00067         {
00068             int32_t len;
00069             bool free;
00070             float64_t* vec=((CSimpleFeatures<float64_t>*) f)->get_feature_vector(i, len, free);
00071             for (j=0; j<num_features; j++)
00072                 mean[j]+= vec[j];
00073 
00074             ((CSimpleFeatures<float64_t>*) f)->free_feature_vector(vec, i, free);
00075         }
00076 
00077         //divide
00078         for (j=0; j<num_features; j++)
00079             mean[j]/=num_vectors;
00080 
00081         SG_DONE();
00082         SG_DEBUG("Computing covariance matrix... of size %.2f M\n", num_features*num_features/1024.0/1024.0);
00083         float64_t *cov=new float64_t[num_features*num_features];
00084 
00085         for (j=0; j<num_features*num_features; j++)
00086             cov[j]=0.0 ;
00087 
00088         for (i=0; i<num_vectors; i++)
00089         {
00090             if (!(i % (num_vectors/10+1)))
00091                 SG_PROGRESS(i, 0, num_vectors);
00092 
00093             int32_t len;
00094             bool free;
00095 
00096             float64_t* vec=((CSimpleFeatures<float64_t>*) f)->get_feature_vector(i, len, free) ;
00097 
00098             for (int32_t jj=0; jj<num_features; jj++)
00099                 vec[jj]-=mean[jj] ;
00100 
00102             int nf = (int) num_features; /* calling external lib */
00103             double* vec_double = (double*) vec; /* calling external lib */
00104             cblas_dger(CblasColMajor, nf, nf, 1.0, vec_double, 1, vec_double,
00105                 1, (double*) cov, nf);
00106 
00107             //for (int32_t k=0; k<num_features; k++)
00108             //  for (int32_t l=0; l<num_features; l++)
00109             //          cov[k*num_features+l]+=feature[l]*feature[k] ;
00110 
00111             ((CSimpleFeatures<float64_t>*) f)->free_feature_vector(vec, i, free) ;
00112         }
00113 
00114         SG_DONE();
00115 
00116         for (i=0; i<num_features; i++)
00117             for (j=0; j<num_features; j++)
00118                 cov[i*num_features+j]/=num_vectors ;
00119 
00120         SG_DONE();
00121 
00122         SG_INFO("Computing Eigenvalues ... ") ;
00123         char V='V';
00124         char U='U';
00125         int32_t info;
00126         int32_t ord= num_features;
00127         int32_t lda= num_features;
00128         float64_t* eigenvalues=new float64_t[num_features] ;
00129 
00130         for (i=0; i<num_features; i++)
00131             eigenvalues[i]=0;
00132 
00133         // lapack sym matrix eigenvalues+vectors
00134         wrap_dsyev(V, U, (int) ord, (double*) cov, (int) lda,
00135             (double*) eigenvalues, (int*) &info);
00136 
00137 
00138         num_dim=0;
00139         for (i=0; i<num_features; i++)
00140         {
00141             //    SG_DEBUG( "EV[%i]=%e\n", i, values[i]) ;
00142             if (eigenvalues[i]>thresh)
00143                 num_dim++ ;
00144         } ;
00145 
00146         SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim) ;
00147 
00148         delete[] T;
00149         T=new float64_t[num_dim*num_features];
00150         num_old_dim=num_features;
00151 
00152         if (do_whitening)
00153         {
00154             int32_t offs=0 ;
00155             for (i=0; i<num_features; i++)
00156             {
00157                 if (eigenvalues[i]>thresh)
00158                 {
00159                     for (int32_t jj=0; jj<num_features; jj++)
00160                         T[offs+jj*num_dim]=cov[num_features*i+jj]/sqrt(eigenvalues[i]) ;
00161                     offs++ ;
00162                 } ;
00163             }
00164         } ;
00165 
00166         delete[] eigenvalues;
00167         delete[] cov;
00168         initialized=true;
00169         SG_INFO("Done\n") ;
00170         return true ;
00171     }
00172     return 
00173         false;
00174 }
00175 
00177 void CPCACut::cleanup()
00178 {
00179     delete[] T ;
00180     T=NULL ;
00181 }
00182 
00186 float64_t* CPCACut::apply_to_feature_matrix(CFeatures* f)
00187 {
00188     int32_t num_vectors=0;
00189     int32_t num_features=0;
00190 
00191     float64_t* m=((CSimpleFeatures<float64_t>*) f)->get_feature_matrix(num_features, num_vectors);
00192     SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features) ;
00193 
00194     if (m)
00195     {
00196         SG_INFO("Preprocessing feature matrix\n");
00197         float64_t* res= new float64_t[num_dim];
00198         float64_t* sub_mean= new float64_t[num_features];
00199 
00200         for (int32_t vec=0; vec<num_vectors; vec++)
00201         {
00202             int32_t i;
00203 
00204             for (i=0; i<num_features; i++)
00205                 sub_mean[i]=m[num_features*vec+i]-mean[i] ;
00206 
00207             int nd = (int) num_dim; /* calling external lib */
00208             cblas_dgemv(CblasColMajor, CblasNoTrans, nd, (int) num_features,
00209                 1.0, T, nd, (double*) sub_mean, 1, 0, (double*) res, 1);
00210 
00211             float64_t* m_transformed=&m[num_dim*vec];
00212             for (i=0; i<num_dim; i++)
00213                 m_transformed[i]=m[i];
00214         }
00215         delete[] res;
00216         delete[] sub_mean;
00217 
00218         ((CSimpleFeatures<float64_t>*) f)->set_num_features(num_dim);
00219         ((CSimpleFeatures<float64_t>*) f)->get_feature_matrix(num_features, num_vectors);
00220         SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features);
00221     }
00222 
00223     return m;
00224 }
00225 
00228 float64_t* CPCACut::apply_to_feature_vector(float64_t* f, int32_t &len)
00229 {
00230     float64_t *ret=new float64_t[num_dim];
00231     float64_t *sub_mean=new float64_t[len];
00232     for (int32_t i=0; i<len; i++)
00233         sub_mean[i]=f[i]-mean[i];
00234 
00235     int nd = (int) num_dim;  /* calling external lib */
00236     cblas_dgemv(CblasColMajor, CblasNoTrans, nd, (int) len, 1.0, (double*) T,
00237         nd, (double*) sub_mean, 1, 0, (double*) ret, 1);
00238 
00239     delete[] sub_mean ;
00240     len=num_dim ;
00241     //    SG_DEBUG( "num_dim: %d\n", num_dim);
00242     return ret;
00243 }
00244 #endif