SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
DependenceMaximization.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2014 Soumyajit De
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  */
30 
31 #include <shogun/lib/SGMatrix.h>
32 #include <shogun/labels/Labels.h>
37 
38 using namespace shogun;
39 
// NOTE(review): the signature of this definition (listing lines 40-41) is
// missing from this extract. The visible body only calls the argument-less
// init(), so this is presumably the CDependenceMaximization constructor --
// confirm against the original file.
42 {
43  init();
44 }
45 
46 void CDependenceMaximization::init()
47 {
48  SG_ADD((CSGObject**)&m_estimator, "estimator",
49  "the estimator for computing measures", MS_NOT_AVAILABLE);
50  SG_ADD((CSGObject**)&m_labels_feats, "labels_feats",
51  "the features based on labels", MS_NOT_AVAILABLE);
52 
53  m_estimator=NULL;
54  m_labels_feats=NULL;
55 }
56 
// NOTE(review): the signature (listing line 57) and the statements on listing
// lines 59-60 are missing from this extract, so neither the name nor the
// behaviour of this definition can be read from here. Given its position it
// is presumably the destructor -- TODO confirm against the original file.
58 {
61 }
62 
63 bool CDependenceMaximization::init(CFeatures* features)
64 {
65  REQUIRE(features, "Features are not initialized!\n");
66  REQUIRE(features->get_feature_class()==C_DENSE ||
67  features->get_feature_class()==C_SPARSE,
68  "Only allowed for dense/sparse features! Provided an instance of "
69  "%s which is of class %d!\n",
70  features->get_name(), features->get_feature_class());
71  REQUIRE(features->get_feature_type()==F_DREAL, "Only allowed for "
72  "features of double type! Provided %d!\n",
73  features->get_feature_type());
74 
75  return true;
76 }
77 
// Builds a copy of `features` that contains every dimension except the one at
// position `idx` (X\X_i in the comments below) and returns the object produced
// by CFeatures::copy_dimension_subset().
//
// idx must satisfy 0 <= idx < get_num_features(features); both bounds are
// enforced with REQUIRE.
//
// NOTE(review): the first signature line (listing line 78) is missing from
// this extract. According to the cross-reference index of this page the
// declaration is
//   virtual CFeatures * create_transformed_copy(CFeatures *features, index_t idx)
// -- confirm against the original file.
79  index_t idx)
80 {
81  SG_DEBUG("Entering!\n");
82 
83  // remove the dimension specified by the index, i.e. get X\X_i
84  // NULL check is handled in CFeatureSelection::get_num_features call
85  index_t num_features=get_num_features(features);
 // index_t is a signed 32-bit integer: a negative idx would pass the
 // upper-bound check below, and the copy loop would then store num_features
 // entries in a vector that only holds num_features-1 of them
 REQUIRE(idx>=0, "Specified dimension to remove (%d) must be "
 "non-negative!\n", idx);
86  REQUIRE(num_features>idx, "Specified dimension to remove (%d) is greater "
87  "than the total number of current features (%d)!\n",
88  idx, num_features);
89 
 // collect the indices of all dimensions that are kept, in ascending order
90  SGVector<index_t> dims(num_features-1);
91  index_t n_dims=0;
92  for (index_t i=0; i<num_features; ++i)
93  {
94  if (i!=idx)
95  dims[n_dims++]=i;
96  }
97 
 // NOTE(review): listing line 98 is missing from this extract; the indentation
 // of the original suggests the following call is guarded by a condition
 // (presumably a log-level check) -- confirm against the original file
99  dims.display_vector("dims");
100 
101  // the following already does a SG_REF on the newly created feature
102  SG_DEBUG("Leaving!\n");
103  return features->copy_dimension_subset(dims);
104 }
105 
// Computes the measure associated with the feature at position `idx`:
// a copy of `features` without that dimension is created through
// create_transformed_copy(), handed to m_estimator via set_p(), and the
// resulting value `statistic` is logged and returned. The temporary reduced
// feature object is released with SG_UNREF before returning.
//
// NOTE(review): the first signature line (listing line 106) is missing from
// this extract; the cross-reference index of this page lists
//   virtual float64_t compute_measures(CFeatures *features, index_t idx)
// NOTE(review): listing line 120 is missing as well. `statistic` is not
// declared on any visible line; it is presumably a float64_t holding the
// result of m_estimator->compute_statistic() -- confirm against the original.
107  index_t idx)
108 {
109  SG_DEBUG("Entering!\n");
110 
111  // remove the dimension (feat) specified by the index idx
112  CFeatures* reduced_feats=create_transformed_copy(features, idx);
113  ASSERT(reduced_feats);
114 
115  // perform an independence test for X\X_i ~ p and Y ~ q with
116  // H_0: P(X\X_i, Y) = P(X\X_i) * P(Y)
117  // the test statistic can then be used as a measure of dependence
118  // See CIndependenceTest class documentation for details
 // m_estimator is dereferenced without a visible NULL check here
119  m_estimator->set_p(reduced_feats);
121 
122  SG_DEBUG("statistic = %f!\n", statistic);
123 
 // the reduced copy was only needed for this computation
124  SG_UNREF(reduced_feats);
125 
126  SG_DEBUG("Leaving!\n");
127  return statistic;
128 }
129 
// Drops the trailing entries of `argsorted` and returns a copy of `features`
// restricted to the remaining dimensions.
//
// Steps visible in this listing:
//  - validate m_num_remove, `features` and `argsorted` with REQUIRE,
//  - derive `threshold`, the number of dimensions to drop, from m_num_remove,
//  - keep the first (argsorted.vlen - threshold) indices of `argsorted` and
//    sort them ascending so the kept dimensions appear in their original order,
//  - copy those dimensions with copy_dimension_subset(), add the same indices
//    to m_subset, and SG_UNREF the `features` object that was passed in.
//
// Returns the reduced feature object created by copy_dimension_subset().
//
// NOTE(review): listing lines 130 (start of the signature), 137, 148, 163 and
// 170 are missing from this extract and are NOT reconstructed here. The
// cross-reference index of this page lists the declaration as
//   virtual CFeatures * remove_feats(CFeatures *features, SGVector< index_t > ranks)
131  SGVector<index_t> argsorted)
132 {
133  SG_DEBUG("Entering!\n");
134 
135  REQUIRE(m_num_remove>0, "Number or percentage of features to be removed is "
136  "not set! Please use set_num_remove() to set this!\n");
 // NOTE(review): listing line 137 is missing; the two lines below are the
 // message of a REQUIRE whose condition is not visible (presumably a check of
 // m_policy, compare set_policy()) -- confirm against the original file
138  "Only N_LARGEST and PERCENTILE_LARGEST removal policy can work "
139  "with %s!\n", get_name());
140  REQUIRE(features, "Features is not intialized!\n");
141  REQUIRE(argsorted.vector, "The argsorted vector is not initialized!\n");
 // the message has two %d conversions, so both values have to be supplied:
 // the expected feature count first, then the actual vector length
142  REQUIRE(get_num_features(features)==argsorted.vlen,
143  "argsorted vector should be equal to the number of features (%d)! "
144  "But it was %d!\n", get_num_features(features), argsorted.vlen);
145 
146  // compute a threshold to remove for both the policies
147  index_t threshold=m_num_remove;
 // NOTE(review): listing line 148 is missing; the scaling below turns
 // m_num_remove into a share of argsorted.vlen in percent and is presumably
 // applied only for the PERCENTILE_LARGEST policy -- confirm
149  threshold*=argsorted.vlen*0.01;
150 
151  // make sure that the threshold is valid given the current number of feats
152  REQUIRE(threshold<argsorted.vlen, "The threshold of removal is too high "
153  "(asked to remove %d features out of %d)! Please use a smaller "
154  "number for removal using set_num_remove() call",
155  threshold, argsorted.vlen);
156 
157  // remove the highest rank holders by storing indices
158  SGVector<index_t> inds(argsorted.vlen-threshold);
159  memcpy(inds.vector, argsorted.vector, sizeof(index_t)*inds.vlen);
160 
161  // sorting the indices to get the original order
162  CMath::qsort(inds);
 // NOTE(review): listing line 163 is missing (presumably a condition guarding
 // the debug output below) -- confirm
164  inds.display_vector("selected feats");
165 
166  // copy rest of the features and SG_UNREF the original feat obj
167  CFeatures* reduced_feats=features->copy_dimension_subset(inds);
168 
169  // add the selected features to the subset
 // NOTE(review): listing line 170 is missing -- confirm what precedes this call
171  m_subset->add_subset(inds);
172 
173  SG_UNREF(features);
174 
175  SG_DEBUG("Leaving!\n");
176  return reduced_feats;
177 }
178 
// Stores the feature removal policy in m_policy. Only N_LARGEST and
// PERCENTILE_LARGEST are accepted; any other value fails the REQUIRE below,
// whose message names this class through get_name().
//
// NOTE(review): the signature (listing line 179) is missing from this extract;
// the cross-reference index of this page lists
//   virtual void set_policy(EFeatureRemovalPolicy policy)
180 {
181  REQUIRE(policy==N_LARGEST || policy==PERCENTILE_LARGEST,
182  "Only N_LARGEST and PERCENTILE_LARGEST removal policy can work "
183  "with %s!\n", get_name());
184  m_policy=policy;
185 }
186 
// Converts the labels held in m_labels into a feature object: a matrix with
// one row and get_num_labels() columns is filled with the label values and
// wrapped in a newly allocated CDenseFeatures<float64_t>, which is stored in
// m_labels_feats.
//
// NOTE(review): this listing is incomplete. The signature (listing line 187)
// and the statements on listing lines 190, 193, 200, 203 and 204 are missing
// from this extract, so the steps announced by the comments on lines 189, 192
// and 202 are not visible here. The cross-reference index of this page lists
//   virtual void set_labels(CLabels *labels)
// -- confirm the missing statements against the original file.
188 {
189  // NULL check is handled in base class CFeatureSelection
191 
192  // convert the CLabels object to CDenseFeatures
194 
 // with a single row, linear element i of the matrix is column i
195  SGMatrix<float64_t> labels_matrix(1, m_labels->get_num_labels());
196  for (index_t i=0; i<labels_matrix.num_cols; ++i)
197  labels_matrix.matrix[i]=m_labels->get_value(i);
198 
199  m_labels_feats=new CDenseFeatures<float64_t>(labels_matrix);
201 
202  // we need to set this to the estimator which is set internally
205 }
virtual const char * get_name() const =0
virtual float64_t get_value(int32_t idx)
Definition: Labels.cpp:59
int32_t index_t
Definition: common.h:62
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
virtual int32_t get_num_labels() const =0
virtual const char * get_name() const
#define REQUIRE(x,...)
Definition: SGIO.h:206
virtual void set_p(CFeatures *p)
virtual float64_t compute_measures(CFeatures *features, index_t idx)
#define SG_REF(x)
Definition: SGObject.h:54
Template class CFeatureSelection, base class for all feature selection preprocessors which select a s...
static void qsort(T *output, int32_t size)
Definition: Math.h:1313
void display_vector(const char *name="vector", const char *prefix="") const
Definition: SGVector.cpp:354
virtual void set_labels(CLabels *labels)
index_t vlen
Definition: SGVector.h:494
virtual void add_subset(SGVector< index_t > subset)
Definition: SubsetStack.cpp:80
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:115
double float64_t
Definition: common.h:50
virtual EFeatureClass get_feature_class() const =0
virtual bool init(CFeatures *features)
EMessageType get_loglevel() const
Definition: SGIO.cpp:285
#define SG_UNREF(x)
Definition: SGObject.h:55
virtual CFeatures * remove_feats(CFeatures *features, SGVector< index_t > ranks)
#define SG_DEBUG(...)
Definition: SGIO.h:107
virtual CFeatures * copy_dimension_subset(SGVector< index_t > dims)
Definition: Features.cpp:348
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
index_t get_num_features(CFeatures *features) const
The class Features is the base class of all feature objects.
Definition: Features.h:68
virtual void set_labels(CLabels *labels)
virtual CFeatures * create_transformed_copy(CFeatures *features, index_t idx)
virtual void set_policy(EFeatureRemovalPolicy policy)
#define SG_ADD(...)
Definition: SGObject.h:84
virtual float64_t compute_statistic()=0
virtual EFeatureType get_feature_type() const =0
virtual void set_q(CFeatures *q)

SHOGUN Machine Learning Toolbox - Documentation