SHOGUN  4.1.0
FeatureSelection.cpp
Go to the documentation of this file.
/*
 * Copyright (c) The Shogun Machine Learning Toolbox
 * Written (w) 2014 Soumyajit De
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are those
 * of the authors and should not be interpreted as representing official policies,
 * either expressed or implied, of the Shogun Development Team.
 */

#include <shogun/labels/Labels.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/features/SparseFeatures.h>
#include <shogun/features/SubsetStack.h>
#include <shogun/mathematics/Math.h>
#include <shogun/preprocessor/FeatureSelection.h>

namespace shogun
{

template <class ST>
CFeatureSelection<ST>::CFeatureSelection()
{
    initialize_parameters();
}

template <class ST>
void CFeatureSelection<ST>::initialize_parameters()
{
    SG_ADD(&m_target_dim, "target_dim", "target dimension",
            MS_NOT_AVAILABLE);
    SG_ADD((machine_int_t*)&m_algorithm, "algorithm",
            "the feature selection algorithm", MS_NOT_AVAILABLE);
    SG_ADD((machine_int_t*)&m_policy, "policy", "feature removal policy",
            MS_NOT_AVAILABLE);
    SG_ADD(&m_num_remove, "num_remove", "number or percentage of features to "
            "be removed", MS_NOT_AVAILABLE);
    SG_ADD((CSGObject**)&m_labels, "labels",
            "the class labels for the features", MS_NOT_AVAILABLE);
    SG_ADD((CSGObject**)&m_subset, "subset",
            "indices of selected features", MS_NOT_AVAILABLE);

    m_target_dim=0;
    m_algorithm=BACKWARD_ELIMINATION;
    m_policy=N_LARGEST;
    m_num_remove=1;
    m_labels=NULL;
    m_subset=new CSubsetStack();
}

template <class ST>
CFeatureSelection<ST>::~CFeatureSelection()
{
    SG_UNREF(m_labels);
    SG_UNREF(m_subset);
}


template <class ST>
void CFeatureSelection<ST>::cleanup()
{
    m_subset->remove_all_subsets();
}

template <class ST>
CFeatures* CFeatureSelection<ST>::apply_backward_elimination(CFeatures* features)
{
    SG_DEBUG("Entering!\n");

    // precompute whenever appropriate for performing the rest of the tasks
    precompute();

    // NULL check for features is handled in get_num_features
    index_t num_features=get_num_features(features);
    SG_DEBUG("Initial number of features %d!\n", num_features);

    // the main loop
    while (num_features>m_target_dim)
    {
        // tune the measurement parameters whenever necessary based on the
        // current features
        adapt_params(features);

        // compute the measures for each of the current dimensions
        SGVector<float64_t> measures(num_features);
        for (index_t i=0; i<num_features; ++i)
            measures[i]=compute_measures(features, i);

        if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
            measures.display_vector("measures");

        // rank the measures
        SGVector<index_t> argsorted=CMath::argsort(measures);

        if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
            argsorted.display_vector("argsorted");

        // make sure that we don't end up with fewer features than the target dim
        index_t to_remove;
        if (m_policy==N_SMALLEST || m_policy==N_LARGEST)
            to_remove=m_num_remove;
        else
            to_remove=num_features*m_num_remove*0.01;

        index_t can_remove=num_features-m_target_dim;

        // if the policy is to remove N features corresponding to the smallest/
        // largest measures, we just replace N with can_remove. if the policy is
        // to remove N% of the features, then we change the policy temporarily
        // and remove a fixed can_remove number of features instead
        index_t orig_remove=m_num_remove;
        EFeatureRemovalPolicy orig_policy=m_policy;

        if (to_remove>can_remove)
        {
            m_num_remove=can_remove;
            SG_DEBUG("Can only remove %d features in this iteration!\n",
                    can_remove);

            if (m_policy==PERCENTILE_SMALLEST)
                m_policy=N_SMALLEST;
            else if (m_policy==PERCENTILE_LARGEST)
                m_policy=N_LARGEST;
        }

        // remove the appropriate number of features based on the measures and
        // the removal policy. this internally updates the subset for the
        // selected features as well
        features=remove_feats(features, argsorted);

        // restore the original removal policy and count if necessary for the
        // sake of consistency
        if (to_remove>can_remove)
        {
            m_policy=orig_policy;
            m_num_remove=orig_remove;
        }

        // update the number of features
        num_features=get_num_features(features);
        SG_DEBUG("Current number of features %d!\n", num_features);
    }

    // sanity check
    ASSERT(m_subset->get_size()==m_target_dim);

    SG_DEBUG("Leaving!\n");
    return features;
}

template <class ST>
CFeatures* CFeatureSelection<ST>::apply(CFeatures* features)
{
    SG_DEBUG("Entering!\n");

    // remove previously computed feature subsets
    m_subset->remove_all_subsets();

    // sanity checks
    REQUIRE(features, "Features cannot be NULL!\n");
    REQUIRE(features->get_num_vectors()>0,
            "Number of feature vectors has to be positive!\n");
    REQUIRE(m_target_dim>0, "Target dimension (%d) has to be positive! Set "
            "a higher number via set_target_dim().\n", m_target_dim);

    index_t num_features=get_num_features(features);
    REQUIRE(num_features>0, "Invalid number of features (%d)! Most likely "
            "feature selection cannot be performed for %s!\n",
            num_features, features->get_name());
    REQUIRE(num_features>m_target_dim,
            "Number of original features (dimensions of the feature vectors) "
            "(%d) has to be greater than the target dimension (%d)!\n",
            num_features, m_target_dim);

    // this method makes a deep copy of the feature object and performs
    // feature selection on it. This is already SG_REF'ed because of the
    // implementation of clone()
    CFeatures* feats_copy=(CFeatures*)features->clone();

    switch (m_algorithm)
    {
        case BACKWARD_ELIMINATION:
            return apply_backward_elimination(feats_copy);
        default:
            SG_ERROR("Specified algorithm not yet supported!\n");
            return features;
    }

    SG_DEBUG("Leaving!\n");
}

template <class ST>
void CFeatureSelection<ST>::precompute()
{
}

template <class ST>
void CFeatureSelection<ST>::adapt_params(CFeatures* features)
{
}

template <class ST>
SGVector<index_t> CFeatureSelection<ST>::get_selected_feats()
{
    ASSERT(m_subset);

    SGVector<index_t> inds;
    if (m_subset->has_subsets())
    {
        inds=SGVector<index_t>(m_subset->get_size());
        for (index_t i=0; i<inds.vlen; ++i)
            inds[i]=m_subset->subset_idx_conversion(i);
        CMath::qsort(inds);
    }

    return inds;
}

template <class ST>
index_t CFeatureSelection<ST>::get_num_features(CFeatures* features) const
{
    REQUIRE(features, "Features not initialized!\n");

    EFeatureClass f_class=features->get_feature_class();

    switch (f_class)
    {
        case C_DENSE:
        {
            CDenseFeatures<ST>* d_feats=dynamic_cast<CDenseFeatures<ST>*>(features);
            REQUIRE(d_feats, "Type mismatch for dense features!\n");
            return d_feats->get_num_features();
        }
        case C_SPARSE:
        {
            CSparseFeatures<ST>* s_feats=dynamic_cast<CSparseFeatures<ST>*>(features);
            REQUIRE(s_feats, "Type mismatch for sparse features!\n");
            return s_feats->get_num_features();
        }
        default:
            SG_ERROR("Number of features not available for %s!\n",
                    features->get_name());
            break;
    }

    return 0;
}

template <class ST>
void CFeatureSelection<ST>::set_target_dim(index_t target_dim)
{
    m_target_dim=target_dim;
}

template <class ST>
index_t CFeatureSelection<ST>::get_target_dim() const
{
    return m_target_dim;
}

template <class ST>
EFeatureSelectionAlgorithm CFeatureSelection<ST>::get_algorithm() const
{
    return m_algorithm;
}

template <class ST>
EFeatureRemovalPolicy CFeatureSelection<ST>::get_policy() const
{
    return m_policy;
}

template <class ST>
void CFeatureSelection<ST>::set_num_remove(index_t num_remove)
{
    m_num_remove=num_remove;
}

template <class ST>
index_t CFeatureSelection<ST>::get_num_remove() const
{
    return m_num_remove;
}

template <class ST>
void CFeatureSelection<ST>::set_labels(CLabels* labels)
{
    SG_REF(labels);
    SG_UNREF(m_labels);
    m_labels=labels;
}

template <class ST>
CLabels* CFeatureSelection<ST>::get_labels() const
{
    SG_REF(m_labels);
    return m_labels;
}

template <class ST>
EFeatureClass CFeatureSelection<ST>::get_feature_class()
{
    return C_ANY;
}

template <class ST>
EPreprocessorType CFeatureSelection<ST>::get_type() const
{
    return P_UNKNOWN;
}

template<>
EFeatureType CFeatureSelection<floatmax_t>::get_feature_type()
{
    return F_LONGREAL;
}

template<>
EFeatureType CFeatureSelection<float64_t>::get_feature_type()
{
    return F_DREAL;
}

template<>
EFeatureType CFeatureSelection<float32_t>::get_feature_type()
{
    return F_SHORTREAL;
}

template<>
EFeatureType CFeatureSelection<int16_t>::get_feature_type()
{
    return F_SHORT;
}

template<>
EFeatureType CFeatureSelection<uint16_t>::get_feature_type()
{
    return F_WORD;
}

template<>
EFeatureType CFeatureSelection<char>::get_feature_type()
{
    return F_CHAR;
}

template<>
EFeatureType CFeatureSelection<int8_t>::get_feature_type()
{
    return F_CHAR;
}

template<>
EFeatureType CFeatureSelection<uint8_t>::get_feature_type()
{
    return F_BYTE;
}

template<>
EFeatureType CFeatureSelection<int32_t>::get_feature_type()
{
    return F_INT;
}

template<>
EFeatureType CFeatureSelection<uint32_t>::get_feature_type()
{
    return F_UINT;
}

template<>
EFeatureType CFeatureSelection<int64_t>::get_feature_type()
{
    return F_LONG;
}

template<>
EFeatureType CFeatureSelection<uint64_t>::get_feature_type()
{
    return F_ULONG;
}

template<>
EFeatureType CFeatureSelection<bool>::get_feature_type()
{
    return F_BOOL;
}

template class CFeatureSelection<bool>;
template class CFeatureSelection<char>;
template class CFeatureSelection<int8_t>;
template class CFeatureSelection<uint8_t>;
template class CFeatureSelection<int16_t>;
template class CFeatureSelection<uint16_t>;
template class CFeatureSelection<int32_t>;
template class CFeatureSelection<uint32_t>;
template class CFeatureSelection<int64_t>;
template class CFeatureSelection<uint64_t>;
template class CFeatureSelection<float32_t>;
template class CFeatureSelection<float64_t>;
template class CFeatureSelection<floatmax_t>;

}
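
For orientation, here is a minimal usage sketch of the base-class API implemented above. It assumes a concrete subclass such as CBAHSIC (any subclass that implements compute_measures() and remove_feats() behaves the same way through this interface), an assumed header location shogun/preprocessor/BAHSIC.h, and made-up toy data and dimensions; it only exercises the methods defined in this file and omits any measure-specific configuration the subclass itself may require.

#include <shogun/base/init.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/preprocessor/BAHSIC.h>  // assumed concrete subclass of CFeatureSelection<float64_t>

using namespace shogun;

int main()
{
    init_shogun_with_defaults();

    // toy data: 10 vectors with 5 dimensions each (values are arbitrary)
    SGMatrix<float64_t> data(5, 10);
    SGVector<float64_t> lab(10);
    for (index_t i=0; i<10; ++i)
    {
        for (index_t j=0; j<5; ++j)
            data(j, i)=CMath::random(0.0, 1.0);
        lab[i]=i<5 ? 0 : 1;
    }

    CDenseFeatures<float64_t>* feats=new CDenseFeatures<float64_t>(data);
    CMulticlassLabels* labels=new CMulticlassLabels(lab);

    // configure the selector through the base-class API defined above.
    // NOTE: a real BAHSIC run would also need its measure-specific state
    // (e.g. the kernels used by the HSIC dependence measure) configured
    // before apply(); that part is omitted in this sketch.
    CBAHSIC* selector=new CBAHSIC();
    selector->set_labels(labels);   // labels used by the dependence measure
    selector->set_target_dim(2);    // keep 2 of the 5 original dimensions
    selector->set_num_remove(1);    // remove one feature per iteration

    // apply() clones the features and runs backward elimination on the copy
    CFeatures* reduced=selector->apply(feats);

    // indices of the surviving features, in sorted order
    SGVector<index_t> selected=selector->get_selected_feats();
    selected.display_vector("selected");

    SG_UNREF(reduced);
    SG_UNREF(selector);
    SG_UNREF(feats);

    exit_shogun();
    return 0;
}

Note that apply() never modifies the features passed in: it deep-copies them via clone() and records the selected dimensions in the internal CSubsetStack, which get_selected_feats() exposes as a sorted index vector.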

SHOGUN Machine Learning Toolbox - Project Documentation