SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
HashedDenseFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
12 #include <shogun/base/Parameter.h>
13 #include <shogun/lib/Hash.h>
15 #include <shogun/io/SGIO.h>
17 
18 #include <shogun/lib/SGVector.h>
20 
21 #include <string.h>
22 
23 namespace shogun {
/**
 * Constructs an object without underlying dense features: `size` is forwarded
 * to the CDotFeatures base, and init() is called with a NULL feature object
 * and a target dimension of 0.
 *
 * @param size forwarded unchanged to the CDotFeatures constructor
 * @param use_quadr stored as use_quadratic by init()
 * @param keep_lin_terms stored as keep_linear_terms by init()
 */
24 template <class ST>
25 CHashedDenseFeatures<ST>::CHashedDenseFeatures(int32_t size, bool use_quadr, bool keep_lin_terms)
26 : CDotFeatures(size)
27 {
28  init(NULL, 0, use_quadr, keep_lin_terms);
29 }
30 
/**
 * Constructor that wraps an existing feature object: default-constructs the
 * CDotFeatures base and forwards `feats`, `d` and both flags to init().
 *
 * NOTE(review): listing line 32, which carries the start of this signature,
 * is absent. Presumably it declares `feats` as CDenseFeatures<ST>* and `d` as
 * int32_t, matching the parameters of init() -- confirm against the header.
 */
31 template <class ST>
33  bool use_quadr, bool keep_lin_terms) : CDotFeatures()
34 {
35  init(feats, d, use_quadr, keep_lin_terms);
36 }
37 
/**
 * Constructor that builds the underlying CDenseFeatures<ST> from `matrix`
 * and hands it to init() together with `d` and both flags.
 *
 * NOTE(review): listing line 39, which carries the start of this signature
 * (including the declarations of `matrix`, `d` and `use_quadr`), is absent --
 * confirm the parameter types against the header.
 */
38 template <class ST>
40  bool keep_lin_terms) : CDotFeatures()
41 {
// The new object is handed to init(), which stores it and takes a reference.
42  CDenseFeatures<ST>* feats = new CDenseFeatures<ST>(matrix);
43  init(feats, d, use_quadr, keep_lin_terms);
44 }
45 
/**
 * Constructor that builds the underlying CDenseFeatures<ST> from a raw
 * buffer and hands it to init().
 *
 * @param src forwarded to the CDenseFeatures<ST> constructor
 * @param num_feat forwarded to the CDenseFeatures<ST> constructor
 * @param num_vec forwarded to the CDenseFeatures<ST> constructor
 * @param d stored as dim by init()
 * @param use_quadr stored as use_quadratic by init()
 * @param keep_lin_terms stored as keep_linear_terms by init()
 */
46 template <class ST>
47 CHashedDenseFeatures<ST>::CHashedDenseFeatures(ST* src, int32_t num_feat, int32_t num_vec,
48  int32_t d, bool use_quadr, bool keep_lin_terms) : CDotFeatures()
49 {
// The new object is handed to init(), which stores it and takes a reference.
50  CDenseFeatures<ST>* feats = new CDenseFeatures<ST>(src, num_feat, num_vec);
51  init(feats, d, use_quadr, keep_lin_terms);
52 }
53 
/**
 * Constructor that loads the underlying dense features from a file: `loader`
 * is passed to the CDotFeatures base, then `feats->load(loader)` is called
 * and the result is handed to init().
 *
 * NOTE(review): listing line 58, which must declare and create `feats`, is
 * absent -- presumably a freshly allocated CDenseFeatures<ST>; confirm.
 *
 * @param loader file object passed to the base class and to load()
 * @param d stored as dim by init()
 * @param use_quadr stored as use_quadratic by init()
 * @param keep_lin_terms stored as keep_linear_terms by init()
 */
54 template <class ST>
55 CHashedDenseFeatures<ST>::CHashedDenseFeatures(CFile* loader, int32_t d, bool use_quadr,
56  bool keep_lin_terms) : CDotFeatures(loader)
57 {
59  feats->load(loader);
60  init(feats, d, use_quadr, keep_lin_terms);
61 }
62 
/**
 * Shared initialisation used by every constructor: stores the arguments in
 * the members, takes a reference on the feature object, registers the
 * members through SG_ADD and calls set_generic<ST>().
 *
 * NOTE(review): listing lines 74, 76 and 79 (the closing arguments of three
 * of the SG_ADD calls) are absent; presumably they pass MS_NOT_AVAILABLE like
 * the complete call for "dim" -- confirm.
 *
 * @param feats dense features to hash; NULL when called from the size-only
 *        constructor
 * @param d dimension of the hashed feature space
 * @param use_quadr value for use_quadratic
 * @param keep_lin_terms value for keep_linear_terms
 */
63 template <class ST>
64 void CHashedDenseFeatures<ST>::init(CDenseFeatures<ST>* feats, int32_t d, bool use_quadr,
65  bool keep_lin_terms)
66 {
67  dim = d;
68  dense_feats = feats;
// This reference is the one released by SG_UNREF(dense_feats) further down the file.
69  SG_REF(dense_feats);
70  use_quadratic = use_quadr;
71  keep_linear_terms = keep_lin_terms;
72 
73  SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features",
75  SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not",
77  SG_ADD(&dim, "dim", "Dimension of new feature space", MS_NOT_AVAILABLE);
78  SG_ADD((CSGObject** ) &dense_feats, "dense_feats", "Dense features to work on",
80 
81  set_generic<ST>();
82 }
83 
/**
 * Copy constructor: copy-constructs the CDotFeatures base from `orig` and
 * re-runs init() with orig's members. The copy therefore points at the same
 * dense_feats object as `orig`, with init() taking an additional reference.
 *
 * NOTE(review): listing line 85, which carries the signature, is absent;
 * presumably it takes `const CHashedDenseFeatures& orig` -- confirm.
 */
84 template <class ST>
86 : CDotFeatures(orig)
87 {
88  init(orig.dense_feats, orig.dim, orig.use_quadratic, orig.keep_linear_terms);
89 }
90 
/**
 * Releases the reference on dense_feats that init() took with SG_REF.
 *
 * NOTE(review): listing line 92, which carries the signature, is absent;
 * presumably this is the destructor -- confirm.
 */
91 template <class ST>
93 {
94  SG_UNREF(dense_feats);
95 }
96 
/**
 * Returns a newly allocated copy of this object, created through the copy
 * constructor.
 *
 * NOTE(review): listing line 98, which carries the signature, is absent;
 * presumably this defines `CFeatures* duplicate() const` as declared in the
 * class -- confirm.
 */
97 template <class ST>
99 {
100  return new CHashedDenseFeatures<ST>(*this);
101 }
102 
/**
 * Returns dim, the dimension of the hashed feature space set in init().
 *
 * NOTE(review): listing line 104, which carries the signature, is absent;
 * presumably this is get_dim_feature_space() -- confirm.
 */
103 template <class ST>
105 {
106  return dim;
107 }
108 
/**
 * Dot product between the hashed form of vector `vec_idx1` of this object
 * and the hashed form of vector `vec_idx2` of `df`.
 *
 * `df` must be non-NULL and report the same feature type, feature class and
 * name as this object, and both objects must report the same feature-space
 * dimension; each condition is checked with ASSERT.
 *
 * NOTE(review): listing lines 110 (start of the signature) and 118 (the
 * declaration of `feats`) are absent. Presumably the signature is
 * `float64_t dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)` and
 * `feats` is `df` cast to CHashedDenseFeatures<ST>* -- confirm.
 */
109 template <class ST>
111  int32_t vec_idx2)
112 {
113  ASSERT(df)
114  ASSERT(df->get_feature_type() == get_feature_type())
115  ASSERT(df->get_feature_class() == get_feature_class())
116  ASSERT(strcmp(df->get_name(), get_name())==0)
117 
119  ASSERT(feats->get_dim_feature_space() == get_dim_feature_space())
120 
121  SGSparseVector<ST> vec_1 = get_hashed_feature_vector(vec_idx1);
122 
// When both operands are the same vector of the same object, reuse vec_1
// instead of hashing that vector a second time.
123  bool same_vec = (df == this) && (vec_idx1 == vec_idx2);
124  SGSparseVector<ST> vec_2 = same_vec ? vec_1 : feats->get_hashed_feature_vector(vec_idx2);
125  float64_t result = vec_1.sparse_dot(vec_2);
126 
127  return result;
128 }
129 
130 template <class ST>
132  int32_t vec2_len)
133 {
134  ASSERT(vec2_len == dim)
135 
136  SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx1);
137 
138  float64_t result = 0;
139 
140  int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
141  SGVector<uint32_t> hash_cache(hash_cache_size);
142 
143  for (index_t i=0; i<vec.vlen; i++)
144  {
145  uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
146  if (use_quadratic)
147  hash_cache[i] = h_idx;
148 
149  if ( (!use_quadratic) || keep_linear_terms)
150  result += vec2[h_idx % dim] * vec[i];
151  }
152 
153  if (use_quadratic)
154  {
155  for (index_t i=0; i<vec.size(); i++)
156  {
157  int32_t n_idx = i * vec.size() + i;
158  uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
159  result += vec2[idx] * vec[i] * vec[i];
160 
161  for (index_t j=i+1; j<vec.size(); j++)
162  {
163  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
164  result += vec2[idx] * vec[i] * vec[j];
165  }
166  }
167  }
168 
169  dense_feats->free_feature_vector(vec, vec_idx1);
170  return result;
171 }
172 
173 template <class ST>
175  float64_t* vec2, int32_t vec2_len, bool abs_val)
176 {
177  float64_t val = abs_val ? CMath::abs(alpha) : alpha;
178  ASSERT(vec2_len == dim)
179 
180  SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx1);
181 
182  int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
183  SGVector<uint32_t> hash_cache(hash_cache_size);
184 
185  for (index_t i=0; i<vec.vlen; i++)
186  {
187  uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
188 
189  if (use_quadratic)
190  hash_cache[i] = h_idx;
191 
192  if ( (!use_quadratic) || keep_linear_terms)
193  vec2[h_idx % dim] += val * vec[i];
194  }
195 
196  if (use_quadratic)
197  {
198  for (index_t i=0; i<vec.size(); i++)
199  {
200  int32_t n_idx = i * vec.size() + i;
201  uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
202  vec2[idx] += val * vec[i] * vec[i];
203 
204  for (index_t j=i+1; j<vec.size(); j++)
205  {
206  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
207  vec2[idx] += val * vec[i] * vec[j];
208  }
209  }
210  }
211  dense_feats->free_feature_vector(vec, vec_idx1);
212 }
213 
/**
 * Returns dim without looking at any individual vector.
 *
 * NOTE(review): listing line 215, which carries the signature, is absent;
 * presumably this is get_nnz_features_for_vector(int32_t num). If so, the
 * value is the size of the hashed space rather than a count of non-zero
 * entries -- confirm the intent.
 */
214 template <class ST>
216 {
217  return dim;
218 }
219 
/**
 * Feature-iterator stub: always returns NULL.
 *
 * NOTE(review): listing lines 221 (signature) and 223 (first body statement)
 * are absent; presumably this is get_feature_iterator(int32_t vector_index)
 * and the missing statement is SG_NOTIMPLEMENTED -- confirm.
 */
220 template <class ST>
222 {
224  return NULL;
225 }
/**
 * Feature-iterator stub: always returns false.
 *
 * NOTE(review): listing lines 227 (start of the signature) and 230 (first
 * body statement) are absent; presumably this is
 * get_next_feature(int32_t& index, float64_t& value, void* iterator) and the
 * missing statement is SG_NOTIMPLEMENTED -- confirm.
 */
226 template <class ST>
228  void* iterator)
229 {
231  return false;
232 }
/**
 * Feature-iterator stub with no visible statements.
 *
 * NOTE(review): listing lines 234 (signature) and 236 (the only body
 * statement) are absent; presumably this is
 * free_feature_iterator(void* iterator) and the missing statement is
 * SG_NOTIMPLEMENTED -- confirm.
 */
233 template <class ST>
235 {
237 }
238 
/**
 * Returns the fixed string "HashedDenseFeatures". dot() compares this value
 * with strcmp() to check that both operands are of this class.
 *
 * NOTE(review): listing line 240, which carries the signature, is absent;
 * presumably this is `const char* get_name() const` -- confirm.
 */
239 template <class ST>
241 {
242  return "HashedDenseFeatures";
243 }
244 
/**
 * Returns F_UINT for every instantiation; the value does not depend on ST.
 *
 * NOTE(review): listing line 246, which carries the signature, is absent;
 * presumably this is `EFeatureType get_feature_type() const` -- confirm.
 */
245 template <class ST>
247 {
248  return F_UINT;
249 }
250 
/**
 * Returns C_SPARSE.
 *
 * NOTE(review): listing line 252, which carries the signature, is absent;
 * presumably this is `EFeatureClass get_feature_class() const` -- confirm.
 */
251 template <class ST>
253 {
254  return C_SPARSE;
255 }
256 
/**
 * Returns the number of vectors reported by the underlying dense features.
 * dense_feats is dereferenced without a NULL check, and the size-only
 * constructor leaves it NULL.
 *
 * NOTE(review): listing line 258, which carries the signature, is absent;
 * presumably this is `int32_t get_num_vectors() const` -- confirm.
 */
257 template <class ST>
259 {
260  return dense_feats->get_num_vectors();
261 }
262 
/**
 * Returns the hashed sparse form of one stored vector: fetches the dense
 * vector from dense_feats, hashes it with this object's dim, use_quadratic
 * and keep_linear_terms, and releases the dense vector before returning.
 *
 * NOTE(review): listing lines 264 (signature) and 267 (start of the
 * statement that defines `hashed_vec`) are absent; presumably the signature
 * is `SGSparseVector<ST> get_hashed_feature_vector(int32_t vec_idx)` and
 * `hashed_vec` is the result of hash_vector(...) -- confirm.
 */
263 template <class ST>
265 {
266  SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx);
268  vec, dim, use_quadratic, keep_linear_terms);
269  dense_feats->free_feature_vector(vec, vec_idx);
270  return hashed_vec;
271 }
272 
273 template <class ST>
275  bool use_quadratic, bool keep_linear_terms)
276 {
277  SGVector<ST> h_vec(dim);
278  SGVector<ST>::fill_vector(h_vec.vector, dim, 0);
279 
280  int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
281  SGVector<uint32_t> hash_cache(hash_cache_size);
282 
283  for (index_t i=0; i<vec.size(); i++)
284  {
285  uint32_t hash = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
286  if (use_quadratic)
287  hash_cache[i] = hash;
288 
289  if ( (!use_quadratic) || keep_linear_terms)
290  h_vec[hash % dim] += vec[i];
291  }
292 
293  if (use_quadratic)
294  {
295  for (index_t i=0; i<vec.size(); i++)
296  {
297  index_t n_idx = i * vec.size() + i;
298  uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
299  h_vec[idx] += vec[i] * vec[i];
300 
301  for (index_t j=i+1; j<vec.size(); j++)
302  {
303  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
304  h_vec[idx] += vec[i] * vec[j];
305  }
306  }
307  }
308 
309  int32_t num_nnz_feats = 0;
310  for(index_t i=0; i<dim; i++)
311  {
312  if (h_vec[i]!=0)
313  num_nnz_feats++;
314  }
315 
316  SGSparseVector<ST> hashed_vector(num_nnz_feats);
317 
318  int32_t sparse_feat_index = 0;
319  for (index_t i=0; i<dim; i++)
320  {
321  if (h_vec[i]!=0)
322  {
323  hashed_vector.features[sparse_feat_index].feat_index = i;
324  hashed_vector.features[sparse_feat_index++].entry = h_vec[i];
325  }
326  }
327 
328  return hashed_vector;
329 }
330 
// Explicit instantiations: the member definitions above live in this .cpp
// file, so each element type that should be usable is instantiated here.
331 template class CHashedDenseFeatures<bool>;
332 template class CHashedDenseFeatures<char>;
333 template class CHashedDenseFeatures<int8_t>;
334 template class CHashedDenseFeatures<uint8_t>;
335 template class CHashedDenseFeatures<int16_t>;
336 template class CHashedDenseFeatures<uint16_t>;
337 template class CHashedDenseFeatures<int32_t>;
338 template class CHashedDenseFeatures<uint32_t>;
339 template class CHashedDenseFeatures<int64_t>;
340 template class CHashedDenseFeatures<uint64_t>;
341 template class CHashedDenseFeatures<float32_t>;
342 template class CHashedDenseFeatures<float64_t>;
343 template class CHashedDenseFeatures<floatmax_t>;
344 }
virtual const char * get_name() const =0
CDenseFeatures< ST > * dense_feats
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:223
T sparse_dot(const SGSparseVector< T > &v)
The class DenseFeatures implements dense feature matrices.
Definition: LDA.h:41
virtual void load(CFile *loader)
int32_t index_t
Definition: common.h:62
static SGSparseVector< ST > hash_vector(SGVector< ST > vec, int32_t dim, bool use_quadratic=false, bool keep_linear_terms=true)
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:139
This class is identical to the CDenseFeatures class except that it hashes each dimension to a new feature space.
virtual CFeatures * duplicate() const
Features that support dot products among other operations.
Definition: DotFeatures.h:44
#define SG_REF(x)
Definition: SGObject.h:51
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
static uint32_t MurmurHash3(uint8_t *data, int32_t len, uint32_t seed)
Definition: Hash.cpp:366
virtual const char * get_name() const
int32_t size() const
Definition: SGVector.h:115
index_t vlen
Definition: SGVector.h:494
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:112
shogun vector
double float64_t
Definition: common.h:50
A File access base class.
Definition: File.h:34
SGSparseVector< ST > get_hashed_feature_vector(int32_t vec_idx)
virtual EFeatureClass get_feature_class() const =0
CHashedDenseFeatures(int32_t size=0, bool use_quadr=false, bool keep_lin_terms=true)
virtual int32_t get_num_vectors() const
virtual bool get_next_feature(int32_t &index, float64_t &value, void *iterator)
virtual EFeatureClass get_feature_class() const
SGSparseVectorEntry< T > * features
virtual void free_feature_iterator(void *iterator)
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
virtual float64_t dot(int32_t vec_idx1, CDotFeatures *df, int32_t vec_idx2)
#define SG_UNREF(x)
Definition: SGObject.h:52
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
virtual float64_t dense_dot(int32_t vec_idx1, const float64_t *vec2, int32_t vec2_len)
virtual int32_t get_dim_feature_space() const
The class Features is the base class of all feature objects.
Definition: Features.h:68
virtual int32_t get_nnz_features_for_vector(int32_t num)
virtual EFeatureType get_feature_type() const
virtual void add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t *vec2, int32_t vec2_len, bool abs_val=false)
#define SG_ADD(...)
Definition: SGObject.h:81
virtual void * get_feature_iterator(int32_t vector_index)
virtual EFeatureType get_feature_type() const =0
static T abs(T a)
Definition: Math.h:179

SHOGUN Machine Learning Toolbox - Documentation