SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
PruneVarSubMean.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 1999-2009 Soeren Sonnenburg
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
15 #include <shogun/io/SGIO.h>
17 
18 using namespace shogun;
19 
// NOTE(review): the signature line that precedes this member-initializer list is
// missing from this listing -- presumably the CPruneVarSubMean constructor taking
// `divide`; confirm against the original source.
// Initial state: no index/mean/std buffers (all NULL), zero kept features,
// divide_by_std copied from `divide`, and initialized set to false.
21 : CDensePreprocessor<float64_t>(), idx(NULL), mean(NULL),
22  std(NULL), num_idx(0), divide_by_std(divide), initialized(false)
23 {
24 }
25 
// NOTE(review): the signature line for this body is missing from this listing --
// presumably the CPruneVarSubMean destructor; confirm against the original source.
// The body does nothing but delegate to cleanup().
27 {
28  cleanup();
29 }
30 
32 bool CPruneVarSubMean::init(CFeatures* features)
33 {
34  if (!initialized)
35  {
36  ASSERT(features->get_feature_class()==C_DENSE);
37  ASSERT(features->get_feature_type()==F_DREAL);
38 
39  CDenseFeatures<float64_t>* simple_features=(CDenseFeatures<float64_t>*) features;
40  int32_t num_examples = simple_features->get_num_vectors();
41  int32_t num_features = simple_features->get_num_features();
42 
43  SG_FREE(mean);
44  SG_FREE(idx);
45  SG_FREE(std);
46  mean=NULL;
47  idx=NULL;
48  std=NULL;
49 
50  mean=SG_MALLOC(float64_t, num_features);
51  float64_t* var=SG_MALLOC(float64_t, num_features);
52  int32_t i,j;
53 
54  for (i=0; i<num_features; i++)
55  {
56  mean[i]=0;
57  var[i]=0 ;
58  }
59 
60  SGMatrix<float64_t> feature_matrix = simple_features->get_feature_matrix();
61 
62  // compute mean
63  for (i=0; i<num_examples; i++)
64  {
65  for (j=0; j<num_features; j++)
66  mean[j]+=feature_matrix.matrix[i*num_features+j];
67  }
68 
69  for (j=0; j<num_features; j++)
70  mean[j]/=num_examples;
71 
72  // compute var
73  for (i=0; i<num_examples; i++)
74  {
75  for (j=0; j<num_features; j++)
76  var[j]+=CMath::sq(mean[j]-feature_matrix.matrix[i*num_features+j]);
77  }
78 
79  int32_t num_ok=0;
80  int32_t* idx_ok=SG_MALLOC(int, num_features);
81 
82  for (j=0; j<num_features; j++)
83  {
84  var[j]/=num_examples;
85 
86  if (var[j]>=1e-14)
87  {
88  idx_ok[num_ok]=j;
89  num_ok++ ;
90  }
91  }
92 
93  SG_INFO( "Reducing number of features from %i to %i\n", num_features, num_ok) ;
94 
95  SG_FREE(idx);
96  idx=SG_MALLOC(int, num_ok);
97  float64_t* new_mean=SG_MALLOC(float64_t, num_ok);
98  std=SG_MALLOC(float64_t, num_ok);
99 
100  for (j=0; j<num_ok; j++)
101  {
102  idx[j]=idx_ok[j] ;
103  new_mean[j]=mean[idx_ok[j]];
104  std[j]=sqrt(var[idx_ok[j]]);
105  }
106  num_idx = num_ok ;
107  SG_FREE(idx_ok);
108  SG_FREE(mean);
109  SG_FREE(var);
110  mean = new_mean;
111 
112  initialized = true;
113  return true;
114  }
115  else
116  return false;
117 }
118 
121 {
122  SG_FREE(idx);
123  idx=NULL;
124  SG_FREE(mean);
125  mean=NULL;
126  SG_FREE(std);
127  std=NULL;
128 }
129 
134 {
136 
137  int32_t num_vectors=0;
138  int32_t num_features=0;
139  float64_t* m=((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
140 
141  SG_INFO( "get Feature matrix: %ix%i\n", num_vectors, num_features);
142  SG_INFO( "Preprocessing feature matrix\n");
143  for (int32_t vec=0; vec<num_vectors; vec++)
144  {
145  float64_t* v_src=&m[num_features*vec];
146  float64_t* v_dst=&m[num_idx*vec];
147 
148  if (divide_by_std)
149  {
150  for (int32_t feat=0; feat<num_idx; feat++)
151  v_dst[feat]=(v_src[idx[feat]]-mean[feat])/std[feat];
152  }
153  else
154  {
155  for (int32_t feat=0; feat<num_idx; feat++)
156  v_dst[feat]=(v_src[idx[feat]]-mean[feat]);
157  }
158  }
159 
160  ((CDenseFeatures<float64_t>*) features)->set_num_features(num_idx);
161  ((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
162  SG_INFO( "new Feature matrix: %ix%i\n", num_vectors, num_features);
163 
164  return ((CDenseFeatures<float64_t>*) features)->get_feature_matrix();
165 }
166 
170 {
171  float64_t* ret=NULL;
172 
173  if (initialized)
174  {
176 
177  if (divide_by_std)
178  {
179  for (int32_t i=0; i<num_idx; i++)
180  ret[i]=(vector.vector[idx[i]]-mean[i])/std[i];
181  }
182  else
183  {
184  for (int32_t i=0; i<num_idx; i++)
185  ret[i]=(vector.vector[idx[i]]-mean[i]);
186  }
187  }
188  else
189  {
190  ret=SG_MALLOC(float64_t, vector.vlen);
191  for (int32_t i=0; i<vector.vlen; i++)
192  ret[i]=vector.vector[i];
193  }
194 
195  return SGVector<float64_t>(ret,num_idx);
196 }

SHOGUN Machine Learning Toolbox - Documentation