SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
PruneVarSubMean.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 1999-2009 Soeren Sonnenburg
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
15 #include <shogun/io/SGIO.h>
17 
18 using namespace shogun;
19 
// NOTE(review): the declaration line of this definition is missing from the
// listing. The body reads an argument named `divide`, so this is presumably
// the CPruneVarSubMean constructor - confirm against the class header.
22 {
// Reset the scalar members to their defaults, then register the members.
23  init();
24  register_parameters();
// Must come after init(), which resets m_divide_by_std to false.
25  m_divide_by_std = divide;
26 }
27 
// NOTE(review): the declaration line of this definition is missing from the
// listing; presumably this is the destructor - confirm. The visible body does
// nothing except delegate to cleanup().
29 {
30  cleanup();
31 }
32 
34 bool CPruneVarSubMean::init(CFeatures* features)
35 {
36  if (!m_initialized)
37  {
38  ASSERT(features->get_feature_class()==C_DENSE)
39  ASSERT(features->get_feature_type()==F_DREAL)
40 
41  CDenseFeatures<float64_t>* simple_features=(CDenseFeatures<float64_t>*) features;
42  int32_t num_examples = simple_features->get_num_vectors();
43  int32_t num_features = simple_features->get_num_features();
44 
48 
49  m_mean.resize_vector(num_features);
50  float64_t* var=SG_MALLOC(float64_t, num_features);
51  int32_t i,j;
52 
53  memset(var, 0, num_features*sizeof(float64_t));
54  m_mean.zero();
55 
56  SGMatrix<float64_t> feature_matrix = simple_features->get_feature_matrix();
57 
58  // compute mean
59  for (i=0; i<num_examples; i++)
60  {
61  for (j=0; j<num_features; j++)
62  m_mean[j]+=feature_matrix.matrix[i*num_features+j];
63  }
64 
65  for (j=0; j<num_features; j++)
66  m_mean[j]/=num_examples;
67 
68  // compute var
69  for (i=0; i<num_examples; i++)
70  {
71  for (j=0; j<num_features; j++)
72  var[j]+=CMath::sq(m_mean[j]-feature_matrix.matrix[i*num_features+j]);
73  }
74 
75  int32_t num_ok=0;
76  int32_t* idx_ok=SG_MALLOC(int32_t, num_features);
77 
78  for (j=0; j<num_features; j++)
79  {
80  var[j]/=num_examples;
81 
82  if (var[j]>=1e-14)
83  {
84  idx_ok[num_ok]=j;
85  num_ok++ ;
86  }
87  }
88 
89  SG_INFO("Reducing number of features from %i to %i\n", num_features, num_ok)
90 
91  m_idx.resize_vector(num_ok);
92  SGVector<float64_t> new_mean(num_ok);
93  m_std.resize_vector(num_ok);
94 
95  for (j=0; j<num_ok; j++)
96  {
97  m_idx[j]=idx_ok[j] ;
98  new_mean[j]=m_mean[idx_ok[j]];
99  m_std[j]=CMath::sqrt(var[idx_ok[j]]);
100  }
101  m_num_idx = num_ok;
102  SG_FREE(idx_ok);
103  SG_FREE(var);
104  m_mean = new_mean;
105 
106  m_initialized = true;
107  return true;
108  }
109  else
110  return false;
111 }
112 
// NOTE(review): the declaration line of this definition is missing from the
// listing (the embedded numbering also skips 116-118); presumably this is
// cleanup() - confirm. The visible statement only marks the preprocessor as
// uninitialized.
115 {
119  m_initialized = false;
120 }
121 
126 {
128 
129  int32_t num_vectors=0;
130  int32_t num_features=0;
131  float64_t* m=((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
132 
133  SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features)
134  SG_INFO("Preprocessing feature matrix\n")
135  for (int32_t vec=0; vec<num_vectors; vec++)
136  {
137  float64_t* v_src=&m[num_features*vec];
138  float64_t* v_dst=&m[m_num_idx*vec];
139 
140  if (m_divide_by_std)
141  {
142  for (int32_t feat=0; feat<m_num_idx; feat++)
143  v_dst[feat]=(v_src[m_idx[feat]]-m_mean[feat])/m_std[feat];
144  }
145  else
146  {
147  for (int32_t feat=0; feat<m_num_idx; feat++)
148  v_dst[feat]=(v_src[m_idx[feat]]-m_mean[feat]);
149  }
150  }
151 
152  ((CDenseFeatures<float64_t>*) features)->set_num_features(m_num_idx);
153  ((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
154  SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features)
155 
156  return ((CDenseFeatures<float64_t>*) features)->get_feature_matrix();
157 }
158 
162 {
163  float64_t* ret=NULL;
164 
165  if (m_initialized)
166  {
167  ret=SG_MALLOC(float64_t, m_num_idx);
168 
169  if (m_divide_by_std)
170  {
171  for (int32_t i=0; i<m_num_idx; i++)
172  ret[i]=(vector.vector[m_idx[i]]-m_mean[i])/m_std[i];
173  }
174  else
175  {
176  for (int32_t i=0; i<m_num_idx; i++)
177  ret[i]=(vector.vector[m_idx[i]]-m_mean[i]);
178  }
179  }
180  else
181  {
182  ret=SG_MALLOC(float64_t, vector.vlen);
183  for (int32_t i=0; i<vector.vlen; i++)
184  ret[i]=vector.vector[i];
185  }
186 
187  return SGVector<float64_t>(ret,m_num_idx);
188 }
189 
190 void CPruneVarSubMean::init()
191 {
192  m_initialized = false;
193  m_divide_by_std = false;
194  m_num_idx = 0;
198 }
199 
200 void CPruneVarSubMean::register_parameters()
201 {
202  SG_ADD(&m_initialized, "initialized", "The prerpocessor is initialized", MS_NOT_AVAILABLE);
203  SG_ADD(&m_divide_by_std, "divide_by_std", "Divide by standard deviation", MS_AVAILABLE);
204  SG_ADD(&m_num_idx, "num_idx", "Number of elements in idx_vec", MS_NOT_AVAILABLE);
205  SG_ADD(&m_std, "std_vec", "Standard dev vector", MS_NOT_AVAILABLE);
206  SG_ADD(&m_mean, "mean_vec", "Mean vector", MS_NOT_AVAILABLE);
207  SG_ADD(&m_idx, "idx_vec", "Index vector", MS_NOT_AVAILABLE);
208 }

SHOGUN Machine Learning Toolbox - Documentation