Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/preprocessor/PruneVarSubMean.h>
00013 #include <shogun/preprocessor/SimplePreprocessor.h>
00014 #include <shogun/features/Features.h>
00015 #include <shogun/features/SimpleFeatures.h>
00016 #include <shogun/io/SGIO.h>
00017 #include <shogun/mathematics/Math.h>
00018
00019 using namespace shogun;
00020
00021 CPruneVarSubMean::CPruneVarSubMean(bool divide)
00022 : CSimplePreprocessor<float64_t>(), idx(NULL), mean(NULL),
00023 std(NULL), num_idx(0), divide_by_std(divide), initialized(false)
00024 {
00025 }
00026
00027 CPruneVarSubMean::~CPruneVarSubMean()
00028 {
00029 cleanup();
00030 }
00031
00033 bool CPruneVarSubMean::init(CFeatures* features)
00034 {
00035 if (!initialized)
00036 {
00037 ASSERT(features->get_feature_class()==C_SIMPLE);
00038 ASSERT(features->get_feature_type()==F_DREAL);
00039
00040 CSimpleFeatures<float64_t>* simple_features=(CSimpleFeatures<float64_t>*) features;
00041 int32_t num_examples = simple_features->get_num_vectors();
00042 int32_t num_features = simple_features->get_num_features();
00043
00044 SG_FREE(mean);
00045 SG_FREE(idx);
00046 SG_FREE(std);
00047 mean=NULL;
00048 idx=NULL;
00049 std=NULL;
00050
00051 mean=SG_MALLOC(float64_t, num_features);
00052 float64_t* var=SG_MALLOC(float64_t, num_features);
00053 int32_t i,j;
00054
00055 for (i=0; i<num_features; i++)
00056 {
00057 mean[i]=0;
00058 var[i]=0 ;
00059 }
00060
00061 SGMatrix<float64_t> feature_matrix = simple_features->get_feature_matrix();
00062
00063
00064 for (i=0; i<num_examples; i++)
00065 {
00066 for (j=0; j<num_features; j++)
00067 mean[j]+=feature_matrix.matrix[i*num_features+j];
00068 }
00069
00070 for (j=0; j<num_features; j++)
00071 mean[j]/=num_examples;
00072
00073
00074 for (i=0; i<num_examples; i++)
00075 {
00076 for (j=0; j<num_features; j++)
00077 var[j]+=CMath::sq(mean[j]-feature_matrix.matrix[i*num_features+j]);
00078 }
00079
00080 int32_t num_ok=0;
00081 int32_t* idx_ok=SG_MALLOC(int, num_features);
00082
00083 for (j=0; j<num_features; j++)
00084 {
00085 var[j]/=num_examples;
00086
00087 if (var[j]>=1e-14)
00088 {
00089 idx_ok[num_ok]=j;
00090 num_ok++ ;
00091 }
00092 }
00093
00094 SG_INFO( "Reducing number of features from %i to %i\n", num_features, num_ok) ;
00095
00096 SG_FREE(idx);
00097 idx=SG_MALLOC(int, num_ok);
00098 float64_t* new_mean=SG_MALLOC(float64_t, num_ok);
00099 std=SG_MALLOC(float64_t, num_ok);
00100
00101 for (j=0; j<num_ok; j++)
00102 {
00103 idx[j]=idx_ok[j] ;
00104 new_mean[j]=mean[idx_ok[j]];
00105 std[j]=sqrt(var[idx_ok[j]]);
00106 }
00107 num_idx = num_ok ;
00108 SG_FREE(idx_ok);
00109 SG_FREE(mean);
00110 SG_FREE(var);
00111 mean = new_mean;
00112
00113 initialized = true;
00114 return true;
00115 }
00116 else
00117 return false;
00118 }
00119
00121 void CPruneVarSubMean::cleanup()
00122 {
00123 SG_FREE(idx);
00124 idx=NULL;
00125 SG_FREE(mean);
00126 mean=NULL;
00127 SG_FREE(std);
00128 std=NULL;
00129 }
00130
00134 SGMatrix<float64_t> CPruneVarSubMean::apply_to_feature_matrix(CFeatures* features)
00135 {
00136 ASSERT(initialized);
00137
00138 int32_t num_vectors=0;
00139 int32_t num_features=0;
00140 float64_t* m=((CSimpleFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
00141
00142 SG_INFO( "get Feature matrix: %ix%i\n", num_vectors, num_features);
00143 SG_INFO( "Preprocessing feature matrix\n");
00144 for (int32_t vec=0; vec<num_vectors; vec++)
00145 {
00146 float64_t* v_src=&m[num_features*vec];
00147 float64_t* v_dst=&m[num_idx*vec];
00148
00149 if (divide_by_std)
00150 {
00151 for (int32_t feat=0; feat<num_idx; feat++)
00152 v_dst[feat]=(v_src[idx[feat]]-mean[feat])/std[feat];
00153 }
00154 else
00155 {
00156 for (int32_t feat=0; feat<num_idx; feat++)
00157 v_dst[feat]=(v_src[idx[feat]]-mean[feat]);
00158 }
00159 }
00160
00161 ((CSimpleFeatures<float64_t>*) features)->set_num_features(num_idx);
00162 ((CSimpleFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
00163 SG_INFO( "new Feature matrix: %ix%i\n", num_vectors, num_features);
00164
00165 return ((CSimpleFeatures<float64_t>*) features)->get_feature_matrix();
00166 }
00167
00170 SGVector<float64_t> CPruneVarSubMean::apply_to_feature_vector(SGVector<float64_t> vector)
00171 {
00172 float64_t* ret=NULL;
00173
00174 if (initialized)
00175 {
00176 ret=SG_MALLOC(float64_t, num_idx);
00177
00178 if (divide_by_std)
00179 {
00180 for (int32_t i=0; i<num_idx; i++)
00181 ret[i]=(vector.vector[idx[i]]-mean[i])/std[i];
00182 }
00183 else
00184 {
00185 for (int32_t i=0; i<num_idx; i++)
00186 ret[i]=(vector.vector[idx[i]]-mean[i]);
00187 }
00188 }
00189 else
00190 {
00191 ret=SG_MALLOC(float64_t, vector.vlen);
00192 for (int32_t i=0; i<vector.vlen; i++)
00193 ret[i]=vector.vector[i];
00194 }
00195
00196 return SGVector<float64_t>(ret,num_idx);
00197 }