Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <shogun/preprocessor/PruneVarSubMean.h>
00013 #include <shogun/preprocessor/DensePreprocessor.h>
00014 #include <shogun/features/Features.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/mathematics/Math.h>
00017
00018 using namespace shogun;
00019
00020 CPruneVarSubMean::CPruneVarSubMean(bool divide)
00021 : CDensePreprocessor<float64_t>(), idx(NULL), mean(NULL),
00022 std(NULL), num_idx(0), divide_by_std(divide), initialized(false)
00023 {
00024 }
00025
00026 CPruneVarSubMean::~CPruneVarSubMean()
00027 {
00028 cleanup();
00029 }
00030
00032 bool CPruneVarSubMean::init(CFeatures* features)
00033 {
00034 if (!initialized)
00035 {
00036 ASSERT(features->get_feature_class()==C_DENSE);
00037 ASSERT(features->get_feature_type()==F_DREAL);
00038
00039 CDenseFeatures<float64_t>* simple_features=(CDenseFeatures<float64_t>*) features;
00040 int32_t num_examples = simple_features->get_num_vectors();
00041 int32_t num_features = simple_features->get_num_features();
00042
00043 SG_FREE(mean);
00044 SG_FREE(idx);
00045 SG_FREE(std);
00046 mean=NULL;
00047 idx=NULL;
00048 std=NULL;
00049
00050 mean=SG_MALLOC(float64_t, num_features);
00051 float64_t* var=SG_MALLOC(float64_t, num_features);
00052 int32_t i,j;
00053
00054 for (i=0; i<num_features; i++)
00055 {
00056 mean[i]=0;
00057 var[i]=0 ;
00058 }
00059
00060 SGMatrix<float64_t> feature_matrix = simple_features->get_feature_matrix();
00061
00062
00063 for (i=0; i<num_examples; i++)
00064 {
00065 for (j=0; j<num_features; j++)
00066 mean[j]+=feature_matrix.matrix[i*num_features+j];
00067 }
00068
00069 for (j=0; j<num_features; j++)
00070 mean[j]/=num_examples;
00071
00072
00073 for (i=0; i<num_examples; i++)
00074 {
00075 for (j=0; j<num_features; j++)
00076 var[j]+=CMath::sq(mean[j]-feature_matrix.matrix[i*num_features+j]);
00077 }
00078
00079 int32_t num_ok=0;
00080 int32_t* idx_ok=SG_MALLOC(int, num_features);
00081
00082 for (j=0; j<num_features; j++)
00083 {
00084 var[j]/=num_examples;
00085
00086 if (var[j]>=1e-14)
00087 {
00088 idx_ok[num_ok]=j;
00089 num_ok++ ;
00090 }
00091 }
00092
00093 SG_INFO( "Reducing number of features from %i to %i\n", num_features, num_ok) ;
00094
00095 SG_FREE(idx);
00096 idx=SG_MALLOC(int, num_ok);
00097 float64_t* new_mean=SG_MALLOC(float64_t, num_ok);
00098 std=SG_MALLOC(float64_t, num_ok);
00099
00100 for (j=0; j<num_ok; j++)
00101 {
00102 idx[j]=idx_ok[j] ;
00103 new_mean[j]=mean[idx_ok[j]];
00104 std[j]=sqrt(var[idx_ok[j]]);
00105 }
00106 num_idx = num_ok ;
00107 SG_FREE(idx_ok);
00108 SG_FREE(mean);
00109 SG_FREE(var);
00110 mean = new_mean;
00111
00112 initialized = true;
00113 return true;
00114 }
00115 else
00116 return false;
00117 }
00118
00120 void CPruneVarSubMean::cleanup()
00121 {
00122 SG_FREE(idx);
00123 idx=NULL;
00124 SG_FREE(mean);
00125 mean=NULL;
00126 SG_FREE(std);
00127 std=NULL;
00128 }
00129
00133 SGMatrix<float64_t> CPruneVarSubMean::apply_to_feature_matrix(CFeatures* features)
00134 {
00135 ASSERT(initialized);
00136
00137 int32_t num_vectors=0;
00138 int32_t num_features=0;
00139 float64_t* m=((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
00140
00141 SG_INFO( "get Feature matrix: %ix%i\n", num_vectors, num_features);
00142 SG_INFO( "Preprocessing feature matrix\n");
00143 for (int32_t vec=0; vec<num_vectors; vec++)
00144 {
00145 float64_t* v_src=&m[num_features*vec];
00146 float64_t* v_dst=&m[num_idx*vec];
00147
00148 if (divide_by_std)
00149 {
00150 for (int32_t feat=0; feat<num_idx; feat++)
00151 v_dst[feat]=(v_src[idx[feat]]-mean[feat])/std[feat];
00152 }
00153 else
00154 {
00155 for (int32_t feat=0; feat<num_idx; feat++)
00156 v_dst[feat]=(v_src[idx[feat]]-mean[feat]);
00157 }
00158 }
00159
00160 ((CDenseFeatures<float64_t>*) features)->set_num_features(num_idx);
00161 ((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
00162 SG_INFO( "new Feature matrix: %ix%i\n", num_vectors, num_features);
00163
00164 return ((CDenseFeatures<float64_t>*) features)->get_feature_matrix();
00165 }
00166
00169 SGVector<float64_t> CPruneVarSubMean::apply_to_feature_vector(SGVector<float64_t> vector)
00170 {
00171 float64_t* ret=NULL;
00172
00173 if (initialized)
00174 {
00175 ret=SG_MALLOC(float64_t, num_idx);
00176
00177 if (divide_by_std)
00178 {
00179 for (int32_t i=0; i<num_idx; i++)
00180 ret[i]=(vector.vector[idx[i]]-mean[i])/std[i];
00181 }
00182 else
00183 {
00184 for (int32_t i=0; i<num_idx; i++)
00185 ret[i]=(vector.vector[idx[i]]-mean[i]);
00186 }
00187 }
00188 else
00189 {
00190 ret=SG_MALLOC(float64_t, vector.vlen);
00191 for (int32_t i=0; i<vector.vlen; i++)
00192 ret[i]=vector.vector[i];
00193 }
00194
00195 return SGVector<float64_t>(ret,num_idx);
00196 }