SHOGUN: PruneVarSubMean.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 1999-2009 Soeren Sonnenburg
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/preprocessor/PruneVarSubMean.h>
00013 #include <shogun/preprocessor/SimplePreprocessor.h>
00014 #include <shogun/features/Features.h>
00015 #include <shogun/features/SimpleFeatures.h>
00016 #include <shogun/io/SGIO.h>
00017 #include <shogun/mathematics/Math.h>
00018 
00019 using namespace shogun;
00020 
00021 CPruneVarSubMean::CPruneVarSubMean(bool divide)
00022 : CSimplePreprocessor<float64_t>(), idx(NULL), mean(NULL),
00023     std(NULL), num_idx(0), divide_by_std(divide), initialized(false)
00024 {
00025 }
00026 
00027 CPruneVarSubMean::~CPruneVarSubMean()
00028 {
00029     cleanup();
00030 }
00031 
00033 bool CPruneVarSubMean::init(CFeatures* features)
00034 {
00035     if (!initialized)
00036     {
00037         ASSERT(features->get_feature_class()==C_SIMPLE);
00038         ASSERT(features->get_feature_type()==F_DREAL);
00039 
00040         CSimpleFeatures<float64_t>* simple_features=(CSimpleFeatures<float64_t>*) features;
00041         int32_t num_examples = simple_features->get_num_vectors();
00042         int32_t num_features = simple_features->get_num_features();
00043 
00044         SG_FREE(mean);
00045         SG_FREE(idx);
00046         SG_FREE(std);
00047         mean=NULL;
00048         idx=NULL;
00049         std=NULL;
00050 
00051         mean=SG_MALLOC(float64_t, num_features);
00052         float64_t* var=SG_MALLOC(float64_t, num_features);
00053         int32_t i,j;
00054 
00055         for (i=0; i<num_features; i++)
00056         {
00057             mean[i]=0;
00058             var[i]=0 ;
00059         }
00060 
00061         SGMatrix<float64_t> feature_matrix = simple_features->get_feature_matrix();
00062 
00063         // compute mean
00064         for (i=0; i<num_examples; i++)
00065         {
00066             for (j=0; j<num_features; j++)
00067                 mean[j]+=feature_matrix.matrix[i*num_features+j];
00068         }
00069 
00070         for (j=0; j<num_features; j++)
00071             mean[j]/=num_examples;
00072 
00073         // compute var
00074         for (i=0; i<num_examples; i++)
00075         {
00076             for (j=0; j<num_features; j++)
00077                 var[j]+=CMath::sq(mean[j]-feature_matrix.matrix[i*num_features+j]);
00078         }
00079 
00080         int32_t num_ok=0;
00081         int32_t* idx_ok=SG_MALLOC(int, num_features);
00082 
00083         for (j=0; j<num_features; j++)
00084         {
00085             var[j]/=num_examples;
00086 
00087             if (var[j]>=1e-14) 
00088             {
00089                 idx_ok[num_ok]=j;
00090                 num_ok++ ;
00091             }
00092         }
00093 
00094         SG_INFO( "Reducing number of features from %i to %i\n", num_features, num_ok) ;
00095 
00096         SG_FREE(idx);
00097         idx=SG_MALLOC(int, num_ok);
00098         float64_t* new_mean=SG_MALLOC(float64_t, num_ok);
00099         std=SG_MALLOC(float64_t, num_ok);
00100 
00101         for (j=0; j<num_ok; j++)
00102         {
00103             idx[j]=idx_ok[j] ;
00104             new_mean[j]=mean[idx_ok[j]];
00105             std[j]=sqrt(var[idx_ok[j]]);
00106         }
00107         num_idx = num_ok ;
00108         SG_FREE(idx_ok);
00109         SG_FREE(mean);
00110         SG_FREE(var);
00111         mean = new_mean;
00112 
00113         initialized = true;
00114         return true;
00115     }
00116     else
00117         return false;
00118 }
00119 
00121 void CPruneVarSubMean::cleanup()
00122 {
00123     SG_FREE(idx);
00124     idx=NULL;
00125     SG_FREE(mean);
00126     mean=NULL;
00127     SG_FREE(std);
00128     std=NULL;
00129 }
00130 
00134 SGMatrix<float64_t> CPruneVarSubMean::apply_to_feature_matrix(CFeatures* features)
00135 {
00136     ASSERT(initialized);
00137 
00138     int32_t num_vectors=0;
00139     int32_t num_features=0;
00140     float64_t* m=((CSimpleFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
00141 
00142     SG_INFO( "get Feature matrix: %ix%i\n", num_vectors, num_features);
00143     SG_INFO( "Preprocessing feature matrix\n");
00144     for (int32_t vec=0; vec<num_vectors; vec++)
00145     {
00146         float64_t* v_src=&m[num_features*vec];
00147         float64_t* v_dst=&m[num_idx*vec];
00148 
00149         if (divide_by_std)
00150         {
00151             for (int32_t feat=0; feat<num_idx; feat++)
00152                 v_dst[feat]=(v_src[idx[feat]]-mean[feat])/std[feat];
00153         }
00154         else
00155         {
00156             for (int32_t feat=0; feat<num_idx; feat++)
00157                 v_dst[feat]=(v_src[idx[feat]]-mean[feat]);
00158         }
00159     }
00160 
00161     ((CSimpleFeatures<float64_t>*) features)->set_num_features(num_idx);
00162     ((CSimpleFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
00163     SG_INFO( "new Feature matrix: %ix%i\n", num_vectors, num_features);
00164 
00165     return ((CSimpleFeatures<float64_t>*) features)->get_feature_matrix();
00166 }
00167 
00170 SGVector<float64_t> CPruneVarSubMean::apply_to_feature_vector(SGVector<float64_t> vector)
00171 {
00172     float64_t* ret=NULL;
00173 
00174     if (initialized)
00175     {
00176         ret=SG_MALLOC(float64_t, num_idx);
00177 
00178         if (divide_by_std)
00179         {
00180             for (int32_t i=0; i<num_idx; i++)
00181                 ret[i]=(vector.vector[idx[i]]-mean[i])/std[i];
00182         }
00183         else
00184         {
00185             for (int32_t i=0; i<num_idx; i++)
00186                 ret[i]=(vector.vector[idx[i]]-mean[i]);
00187         }
00188     }
00189     else
00190     {
00191         ret=SG_MALLOC(float64_t, vector.vlen);
00192         for (int32_t i=0; i<vector.vlen; i++)
00193             ret[i]=vector.vector[i];
00194     }
00195 
00196     return SGVector<float64_t>(ret,num_idx);
00197 }