SHOGUN: PruneVarSubMean.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 1999-2009 Soeren Sonnenburg
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/preprocessor/PruneVarSubMean.h>
00013 #include <shogun/preprocessor/DensePreprocessor.h>
00014 #include <shogun/features/Features.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/mathematics/Math.h>
00017 
00018 using namespace shogun;
00019 
00020 CPruneVarSubMean::CPruneVarSubMean(bool divide)
00021 : CDensePreprocessor<float64_t>(), idx(NULL), mean(NULL),
00022     std(NULL), num_idx(0), divide_by_std(divide), initialized(false)
00023 {
00024 }
00025 
00026 CPruneVarSubMean::~CPruneVarSubMean()
00027 {
00028     cleanup();
00029 }
00030 
00032 bool CPruneVarSubMean::init(CFeatures* features)
00033 {
00034     if (!initialized)
00035     {
00036         ASSERT(features->get_feature_class()==C_DENSE);
00037         ASSERT(features->get_feature_type()==F_DREAL);
00038 
00039         CDenseFeatures<float64_t>* simple_features=(CDenseFeatures<float64_t>*) features;
00040         int32_t num_examples = simple_features->get_num_vectors();
00041         int32_t num_features = simple_features->get_num_features();
00042 
00043         SG_FREE(mean);
00044         SG_FREE(idx);
00045         SG_FREE(std);
00046         mean=NULL;
00047         idx=NULL;
00048         std=NULL;
00049 
00050         mean=SG_MALLOC(float64_t, num_features);
00051         float64_t* var=SG_MALLOC(float64_t, num_features);
00052         int32_t i,j;
00053 
00054         for (i=0; i<num_features; i++)
00055         {
00056             mean[i]=0;
00057             var[i]=0 ;
00058         }
00059 
00060         SGMatrix<float64_t> feature_matrix = simple_features->get_feature_matrix();
00061 
00062         // compute mean
00063         for (i=0; i<num_examples; i++)
00064         {
00065             for (j=0; j<num_features; j++)
00066                 mean[j]+=feature_matrix.matrix[i*num_features+j];
00067         }
00068 
00069         for (j=0; j<num_features; j++)
00070             mean[j]/=num_examples;
00071 
00072         // compute var
00073         for (i=0; i<num_examples; i++)
00074         {
00075             for (j=0; j<num_features; j++)
00076                 var[j]+=CMath::sq(mean[j]-feature_matrix.matrix[i*num_features+j]);
00077         }
00078 
00079         int32_t num_ok=0;
00080         int32_t* idx_ok=SG_MALLOC(int, num_features);
00081 
00082         for (j=0; j<num_features; j++)
00083         {
00084             var[j]/=num_examples;
00085 
00086             if (var[j]>=1e-14)
00087             {
00088                 idx_ok[num_ok]=j;
00089                 num_ok++ ;
00090             }
00091         }
00092 
00093         SG_INFO( "Reducing number of features from %i to %i\n", num_features, num_ok) ;
00094 
00095         SG_FREE(idx);
00096         idx=SG_MALLOC(int, num_ok);
00097         float64_t* new_mean=SG_MALLOC(float64_t, num_ok);
00098         std=SG_MALLOC(float64_t, num_ok);
00099 
00100         for (j=0; j<num_ok; j++)
00101         {
00102             idx[j]=idx_ok[j] ;
00103             new_mean[j]=mean[idx_ok[j]];
00104             std[j]=sqrt(var[idx_ok[j]]);
00105         }
00106         num_idx = num_ok ;
00107         SG_FREE(idx_ok);
00108         SG_FREE(mean);
00109         SG_FREE(var);
00110         mean = new_mean;
00111 
00112         initialized = true;
00113         return true;
00114     }
00115     else
00116         return false;
00117 }
00118 
00120 void CPruneVarSubMean::cleanup()
00121 {
00122     SG_FREE(idx);
00123     idx=NULL;
00124     SG_FREE(mean);
00125     mean=NULL;
00126     SG_FREE(std);
00127     std=NULL;
00128 }
00129 
00133 SGMatrix<float64_t> CPruneVarSubMean::apply_to_feature_matrix(CFeatures* features)
00134 {
00135     ASSERT(initialized);
00136 
00137     int32_t num_vectors=0;
00138     int32_t num_features=0;
00139     float64_t* m=((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
00140 
00141     SG_INFO( "get Feature matrix: %ix%i\n", num_vectors, num_features);
00142     SG_INFO( "Preprocessing feature matrix\n");
00143     for (int32_t vec=0; vec<num_vectors; vec++)
00144     {
00145         float64_t* v_src=&m[num_features*vec];
00146         float64_t* v_dst=&m[num_idx*vec];
00147 
00148         if (divide_by_std)
00149         {
00150             for (int32_t feat=0; feat<num_idx; feat++)
00151                 v_dst[feat]=(v_src[idx[feat]]-mean[feat])/std[feat];
00152         }
00153         else
00154         {
00155             for (int32_t feat=0; feat<num_idx; feat++)
00156                 v_dst[feat]=(v_src[idx[feat]]-mean[feat]);
00157         }
00158     }
00159 
00160     ((CDenseFeatures<float64_t>*) features)->set_num_features(num_idx);
00161     ((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
00162     SG_INFO( "new Feature matrix: %ix%i\n", num_vectors, num_features);
00163 
00164     return ((CDenseFeatures<float64_t>*) features)->get_feature_matrix();
00165 }
00166 
00169 SGVector<float64_t> CPruneVarSubMean::apply_to_feature_vector(SGVector<float64_t> vector)
00170 {
00171     float64_t* ret=NULL;
00172 
00173     if (initialized)
00174     {
00175         ret=SG_MALLOC(float64_t, num_idx);
00176 
00177         if (divide_by_std)
00178         {
00179             for (int32_t i=0; i<num_idx; i++)
00180                 ret[i]=(vector.vector[idx[i]]-mean[i])/std[i];
00181         }
00182         else
00183         {
00184             for (int32_t i=0; i<num_idx; i++)
00185                 ret[i]=(vector.vector[idx[i]]-mean[i]);
00186         }
00187     }
00188     else
00189     {
00190         ret=SG_MALLOC(float64_t, vector.vlen);
00191         for (int32_t i=0; i<vector.vlen; i++)
00192             ret[i]=vector.vector[i];
00193     }
00194 
00195     return SGVector<float64_t>(ret,num_idx);
00196 }