SHOGUN  6.1.3
DataFetcher.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2016 - 2017 Soumyajit De
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  */
30 
31 #include <algorithm>
32 #include <numeric>
35 
36 using namespace shogun;
37 using namespace internal;
38 
39 DataFetcher::DataFetcher() : m_num_samples(0), train_test_mode(false),
40  train_mode(false), m_samples(nullptr), features_shuffled(false)
41 {
42 }
43 
44 DataFetcher::DataFetcher(CFeatures* samples) : train_test_mode(false),
45  train_mode(false), m_samples(samples), features_shuffled(false)
46 {
47  REQUIRE(m_samples!=nullptr, "Samples cannot be null!\n");
48  SG_REF(m_samples);
49  m_num_samples=m_samples->get_num_vectors();
50 }
51 
52 DataFetcher::~DataFetcher()
53 {
54  SG_UNREF(m_samples);
55 }
56 
57 void DataFetcher::set_blockwise(bool blockwise)
58 {
59  if (blockwise)
60  {
61  m_block_details=last_blockwise_details;
62  SG_SDEBUG("Restoring the blockwise details!\n");
63  m_block_details.m_full_data=false;
64  }
65  else
66  {
67  last_blockwise_details=m_block_details;
68  SG_SDEBUG("Saving the blockwise details!\n");
69  m_block_details=BlockwiseDetails();
70  }
71 }
72 
73 void DataFetcher::set_train_test_mode(bool on)
74 {
75  train_test_mode=on;
76 }
77 
78 bool DataFetcher::is_train_test_mode() const
79 {
80  return train_test_mode;
81 }
82 
83 void DataFetcher::set_train_mode(bool on)
84 {
85  train_mode=on;
86 }
87 
88 bool DataFetcher::is_train_mode() const
89 {
90  return train_mode;
91 }
92 
93 void DataFetcher::set_train_test_ratio(float64_t ratio)
94 {
95  train_test_ratio=ratio;
96 }
97 
98 float64_t DataFetcher::get_train_test_ratio() const
99 {
100  return train_test_ratio;
101 }
102 
103 void DataFetcher::shuffle_features()
104 {
105  REQUIRE(train_test_mode, "This method is allowed only when Train/Test method is active!\n");
106  if (features_shuffled)
107  {
108  SG_SWARNING("Features are already shuffled! Call to shuffle_features() has no effect."
109  "If you want to reshuffle, please call unshuffle_features() first and then call this method!\n");
110  }
111  else
112  {
113  const index_t size=m_samples->get_num_vectors();
114  SG_SDEBUG("Current number of feature vectors = %d\n", size);
115  if (shuffle_subset.size()<size)
116  {
117  SG_SDEBUG("Resizing the shuffle indices vector (from %d to %d)\n", shuffle_subset.size(), size);
118  shuffle_subset=SGVector<index_t>(size);
119  }
120  std::iota(shuffle_subset.data(), shuffle_subset.data()+shuffle_subset.size(), 0);
121  CMath::permute(shuffle_subset);
122 // shuffle_subset.display_vector("shuffle_subset");
123 
124  SG_SDEBUG("Shuffling %d feature vectors\n", size);
125  m_samples->add_subset(shuffle_subset);
126 
127  features_shuffled=true;
128  }
129 }
130 
131 void DataFetcher::unshuffle_features()
132 {
133  REQUIRE(train_test_mode, "This method is allowed only when Train/Test method is active!\n");
134  if (features_shuffled)
135  {
136  m_samples->remove_subset();
137  features_shuffled=false;
138  }
139  else
140  {
141  SG_SWARNING("Features are NOT shuffled! Call to unshuffle_features() has no effect."
142  "If you want to reshuffle, please call shuffle_features() instead!\n");
143  }
144 }
145 
146 void DataFetcher::use_fold(index_t idx)
147 {
148  allocate_active_subset();
149  auto num_samples_per_fold=get_num_samples()/get_num_folds();
150  auto start_idx=idx*num_samples_per_fold;
151  if (train_mode)
152  {
153  std::iota(active_subset.data(), active_subset.data()+active_subset.size(), 0);
154  if (start_idx<active_subset.size())
155  {
156  std::for_each(active_subset.data()+start_idx, active_subset.data()+active_subset.size(),
157  [&num_samples_per_fold](index_t& val)
158  {
159  val+=num_samples_per_fold;
160  });
161  }
162  }
163  else
164  std::iota(active_subset.data(), active_subset.data()+active_subset.size(), start_idx);
165 // active_subset.display_vector("active_subset");
166 }
167 
168 void DataFetcher::init_active_subset()
169 {
170  allocate_active_subset();
171  index_t start_index=0;
172  if (!train_mode)
173  start_index=m_samples->get_num_vectors()*train_test_ratio/(train_test_ratio+1);
174  std::iota(active_subset.data(), active_subset.data()+active_subset.size(), start_index);
175 // active_subset.display_vector("active_subset");
176 }
177 
178 void DataFetcher::start()
179 {
180  REQUIRE(get_num_samples()>0, "Number of samples is 0!\n");
181  if (train_test_mode)
182  {
183  m_samples->add_subset(active_subset);
184  SG_SDEBUG("Added active subset!\n");
185  SG_SINFO("Currently active number of samples is %d\n", get_num_samples());
186  }
187 
188  if (m_block_details.m_full_data || m_block_details.m_blocksize>get_num_samples())
189  {
190  SG_SINFO("Fetching entire data (%d samples)!\n", get_num_samples());
191  m_block_details.with_blocksize(get_num_samples());
192  }
193  m_block_details.m_total_num_blocks=get_num_samples()/m_block_details.m_blocksize;
194  reset();
195 }
196 
197 CFeatures* DataFetcher::next()
198 {
199  CFeatures* next_samples=nullptr;
200  // figure out how many samples to fetch in this burst
201  auto num_already_fetched=m_block_details.m_next_block_index*m_block_details.m_blocksize;
202  auto num_more_samples=get_num_samples()-num_already_fetched;
203  if (num_more_samples>0)
204  {
205  // create a shallow copy and add proper index subset
206  next_samples=m_samples->shallow_subset_copy();
207  auto num_samples_this_burst=std::min(m_block_details.m_max_num_samples_per_burst, num_more_samples);
208  if (num_samples_this_burst<next_samples->get_num_vectors())
209  {
210  SGVector<index_t> inds(num_samples_this_burst);
211  std::iota(inds.vector, inds.vector+inds.vlen, num_already_fetched);
212  next_samples->add_subset(inds);
213  }
214  m_block_details.m_next_block_index+=m_block_details.m_num_blocks_per_burst;
215  }
216  return next_samples;
217 }
218 
219 void DataFetcher::reset()
220 {
221  m_block_details.m_next_block_index=0;
222 }
223 
224 void DataFetcher::end()
225 {
226  if (train_test_mode)
227  {
228  m_samples->remove_subset();
229  SG_SDEBUG("Removed active subset!\n");
230  SG_SINFO("Currently active number of samples is %d\n", get_num_samples());
231  }
232 }
233 
234 index_t DataFetcher::get_num_samples() const
235 {
236  if (train_test_mode)
237  {
238  if (train_mode)
239  return m_num_samples*train_test_ratio/(train_test_ratio+1);
240  else
241  return m_num_samples/(train_test_ratio+1);
242  }
243  return m_samples->get_num_vectors();
244 }
245 
246 index_t DataFetcher::get_num_folds() const
247 {
248  return 1+ceil(get_train_test_ratio());
249 }
250 
251 index_t DataFetcher::get_num_training_samples() const
252 {
253  return get_num_samples()*get_train_test_ratio()/(get_train_test_ratio()+1);
254 }
255 
256 index_t DataFetcher::get_num_testing_samples() const
257 {
258  return get_num_samples()/(get_train_test_ratio()+1);
259 }
260 
261 BlockwiseDetails& DataFetcher::fetch_blockwise()
262 {
263  m_block_details.m_full_data=false;
264  return m_block_details;
265 }
266 
267 void DataFetcher::allocate_active_subset()
268 {
269  REQUIRE(train_test_mode, "This method is allowed only when Train/Test method is active!\n");
270  index_t num_active_samples=0;
271  if (train_mode)
272  {
273  num_active_samples=m_samples->get_num_vectors()*train_test_ratio/(train_test_ratio+1);
274  SG_SINFO("Using %d number of samples for this fold as training samples!\n", num_active_samples);
275  }
276  else
277  {
278  num_active_samples=m_samples->get_num_vectors()/(train_test_ratio+1);
279  SG_SINFO("Using %d number of samples for this fold as testing samples!\n", num_active_samples);
280  }
281 
282  ASSERT(num_active_samples>0);
283  if (active_subset.size()!=num_active_samples)
284  {
285  SG_SDEBUG("Resizing the active subset from %d to %d\n", active_subset.size(), num_active_samples);
286  active_subset=SGVector<index_t>(num_active_samples);
287  }
288 }
virtual CFeatures * shallow_subset_copy()
Definition: Features.h:353
static void permute(SGVector< T > v, CRandom *rand=NULL)
Definition: Math.h:962
int32_t index_t
Definition: common.h:72
#define SG_SWARNING(...)
Definition: SGIO.h:163
#define REQUIRE(x,...)
Definition: SGIO.h:181
#define SG_REF(x)
Definition: SGObject.h:52
#define ASSERT(x)
Definition: SGIO.h:176
double float64_t
Definition: common.h:60
#define SG_UNREF(x)
Definition: SGObject.h:53
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
#define SG_SDEBUG(...)
Definition: SGIO.h:153
virtual void remove_subset()
Definition: Features.cpp:322
The class Features is the base class of all feature objects.
Definition: Features.h:69
#define SG_SINFO(...)
Definition: SGIO.h:158
Class that holds block-details for the data-fetchers. There is one instance of this class per fetche...
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:310

SHOGUN Machine Learning Toolbox - Documentation