SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashedDenseFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
12 #include <shogun/base/Parameter.h>
13 #include <shogun/lib/Hash.h>
15 #include <shogun/io/SGIO.h>
17 
18 #include <string.h>
19 
20 namespace shogun {
// Constructor taking only a size and the two hashing flags.
// `size` is forwarded to the CDotFeatures base constructor. init() is called
// with a NULL feature object and a target dimension of 0, so this constructor
// attaches no dense features; use_quadr and keep_lin_terms are passed through
// to init() unchanged.
21 template <class ST>
22 CHashedDenseFeatures<ST>::CHashedDenseFeatures(int32_t size, bool use_quadr, bool keep_lin_terms)
23 : CDotFeatures(size)
24 {
25  init(NULL, 0, use_quadr, keep_lin_terms);
26 }
27 
// NOTE(review): listing line 29 (the first line of this signature) is absent
// from this listing; only the tail of the parameter list is visible.
// `feats` and `d` are presumably declared on the missing line - confirm
// against the class header.
// The base class is default-constructed and all arguments are forwarded to
// init().
28 template <class ST>
30  bool use_quadr, bool keep_lin_terms) : CDotFeatures()
31 {
32  init(feats, d, use_quadr, keep_lin_terms);
33 }
34 
// NOTE(review): listing line 36 (the first line of this signature) is absent
// from this listing. `matrix`, `d` and `use_quadr` are presumably declared
// there - confirm against the class header.
// Wraps `matrix` in a newly allocated CDenseFeatures<ST> and hands that object
// to init(), which stores the pointer and calls SG_REF on it.
35 template <class ST>
37  bool keep_lin_terms) : CDotFeatures()
38 {
39  CDenseFeatures<ST>* feats = new CDenseFeatures<ST>(matrix);
40  init(feats, d, use_quadr, keep_lin_terms);
41 }
42 
// Constructor from a raw buffer.
// Builds a new CDenseFeatures<ST> from (src, num_feat, num_vec) and passes it
// to init() together with the target dimension d and the two hashing flags.
// init() stores the pointer and calls SG_REF on it.
// NOTE(review): whether CDenseFeatures copies or adopts `src` is not visible
// here - confirm before relying on the buffer's lifetime.
43 template <class ST>
44 CHashedDenseFeatures<ST>::CHashedDenseFeatures(ST* src, int32_t num_feat, int32_t num_vec,
45  int32_t d, bool use_quadr, bool keep_lin_terms) : CDotFeatures()
46 {
47  CDenseFeatures<ST>* feats = new CDenseFeatures<ST>(src, num_feat, num_vec);
48  init(feats, d, use_quadr, keep_lin_terms);
49 }
50 
// Constructor that reads the underlying features through a CFile loader.
// `loader` is passed to the CDotFeatures base constructor and then to
// feats->load(); the loaded object is handed to init() with the target
// dimension d and the two hashing flags.
// NOTE(review): listing line 55 is absent from this listing; `feats` is
// presumably declared there (likely a newly created CDenseFeatures<ST>) -
// confirm.
51 template <class ST>
52 CHashedDenseFeatures<ST>::CHashedDenseFeatures(CFile* loader, int32_t d, bool use_quadr,
53  bool keep_lin_terms) : CDotFeatures(loader)
54 {
56  feats->load(loader);
57  init(feats, d, use_quadr, keep_lin_terms);
58 }
59 
// Shared initialisation used by every constructor visible in this file.
//   feats          - stored in dense_feats; may be NULL (the size-only
//                    constructor passes NULL)
//   d              - stored in dim
//   use_quadr      - stored in use_quadratic
//   keep_lin_terms - stored in keep_linear_terms
// After storing the values it takes a reference on dense_feats, registers the
// four members with SG_ADD and calls set_generic<ST>().
// NOTE(review): listing lines 71, 73 and 76 are absent from this listing, so
// three of the SG_ADD calls below appear without their final argument and
// closing parenthesis.
60 template <class ST>
61 void CHashedDenseFeatures<ST>::init(CDenseFeatures<ST>* feats, int32_t d, bool use_quadr,
62  bool keep_lin_terms)
63 {
64  dim = d;
65  dense_feats = feats;
// Presumably SG_REF tolerates a NULL pointer - confirm, since feats can be NULL.
66  SG_REF(dense_feats);
67  use_quadratic = use_quadr;
68  keep_linear_terms = keep_lin_terms;
69 
70  SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features",
72  SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not",
74  SG_ADD(&dim, "dim", "Dimension of new feature space", MS_NOT_AVAILABLE);
75  SG_ADD((CSGObject** ) &dense_feats, "dense_feats", "Dense features to work on",
77 
78  set_generic<ST>();
79 }
80 
// NOTE(review): listing line 82 (the signature) is absent from this listing;
// from the use of `orig` this is presumably the copy constructor - confirm.
// Passes `orig` to the CDotFeatures base and re-runs init() with orig's
// dense_feats pointer, dim and flags. init() stores that pointer and calls
// SG_REF on it, so the same dense_feats pointer is held by both objects.
81 template <class ST>
83 : CDotFeatures(orig)
84 {
85  init(orig.dense_feats, orig.dim, orig.use_quadratic, orig.keep_linear_terms);
86 }
87 
// NOTE(review): listing line 89 (the signature) is absent from this listing;
// presumably this is the destructor - confirm.
// Calls SG_UNREF on dense_feats, the counterpart of the SG_REF in init().
88 template <class ST>
90 {
91  SG_UNREF(dense_feats);
92 }
93 
// NOTE(review): listing line 95 (the signature) is absent from this listing;
// presumably this is duplicate() - confirm.
// Returns a heap-allocated CHashedDenseFeatures<ST> constructed from *this.
94 template <class ST>
96 {
97  return new CHashedDenseFeatures<ST>(*this);
98 }
99 
// NOTE(review): listing line 101 (the signature) is absent from this listing;
// presumably this is get_dim_feature_space() - confirm.
// Returns the member dim.
100 template <class ST>
102 {
103  return dim;
104 }
105 
// Dot product between one hashed vector of this object and one hashed vector
// of another feature object.
// NOTE(review): listing lines 107 (first line of the signature) and 115 are
// absent from this listing. `vec_idx1` and `df` are presumably parameters on
// line 107, and `feats` is presumably declared on line 115 (likely `df` cast
// to this class) - confirm.
106 template <class ST>
108  int32_t vec_idx2)
109 {
// df must be non-NULL and report the same feature type, feature class and
// name as this object.
110  ASSERT(df)
111  ASSERT(df->get_feature_type() == get_feature_type())
112  ASSERT(df->get_feature_class() == get_feature_class())
113  ASSERT(strcmp(df->get_name(), get_name())==0)
114 
// Both objects must report the same feature-space dimension.
116  ASSERT(feats->get_dim_feature_space() == get_dim_feature_space())
117 
118  SGSparseVector<ST> vec_1 = get_hashed_feature_vector(vec_idx1);
119 
// When both operands are the same vector of the same object, vec_1 is reused
// instead of hashing a second time.
120  bool same_vec = (df == this) && (vec_idx1 == vec_idx2);
121  SGSparseVector<ST> vec_2 = same_vec ? vec_1 : feats->get_hashed_feature_vector(vec_idx2);
122  float64_t result = vec_1.sparse_dot(vec_2);
123 
124  return result;
125 }
126 
// Dot product between the hashed form of feature vector vec_idx1 and the dense
// array vec2, computed without building the hashed vector.
// NOTE(review): listing line 128 (first line of the signature) is absent from
// this listing; `vec_idx1` and `vec2` are presumably declared there - confirm.
// NOTE(review): dim is used as a modulus below. The size-only constructor
// calls init(NULL, 0, ...), which sets dim to 0 and dense_feats to NULL, so
// this function is not usable on an object built that way.
127 template <class ST>
129  int32_t vec2_len)
130 {
// vec2 must have exactly one entry per hashed dimension.
131  ASSERT(vec2_len == dim)
132 
133  SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx1);
134 
135  float64_t result = 0;
136 
// The per-index hashes are only needed for the cross terms, so the cache has
// length 0 when quadratic features are disabled.
137  int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
138  SGVector<uint32_t> hash_cache(hash_cache_size);
139 
// Linear pass: hash each index i (the bytes of i, with i also passed as the
// third argument - presumably the seed) and map it to bucket h_idx % dim.
140  for (index_t i=0; i<vec.vlen; i++)
141  {
142  uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
143  if (use_quadratic)
144  hash_cache[i] = h_idx;
145 
// Linear terms contribute when quadratic features are off, or when they are
// on and keep_linear_terms is set.
146  if ( (!use_quadratic) || keep_linear_terms)
147  result += vec2[h_idx % dim] * vec[i];
148  }
149 
150  if (use_quadratic)
151  {
152  for (index_t i=0; i<vec.size(); i++)
153  {
// Squared term vec[i]*vec[i]: bucket comes from hashing n_idx = i*size + i.
154  int32_t n_idx = i * vec.size() + i;
155  uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
156  result += vec2[idx] * vec[i] * vec[i];
157 
// Cross terms vec[i]*vec[j] for j > i: bucket is the XOR of the two cached
// per-index hashes, reduced modulo dim.
158  for (index_t j=i+1; j<vec.size(); j++)
159  {
160  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
161  result += vec2[idx] * vec[i] * vec[j];
162  }
163  }
164  }
165 
// Hand the vector back to dense_feats before returning.
166  dense_feats->free_feature_vector(vec, vec_idx1);
167  return result;
168 }
169 
// Adds a scaled copy of the hashed form of feature vector vec_idx1 into the
// dense array vec2, using the same bucket scheme as the other hashing code in
// this file.
// NOTE(review): listing line 171 (first line of the signature) is absent from
// this listing; `alpha` and `vec_idx1` are presumably declared there - confirm.
// NOTE(review): dim is used as a modulus below. The size-only constructor
// calls init(NULL, 0, ...), which sets dim to 0 and dense_feats to NULL, so
// this function is not usable on an object built that way.
170 template <class ST>
172  float64_t* vec2, int32_t vec2_len, bool abs_val)
173 {
// Scale factor: |alpha| when abs_val is set, alpha otherwise.
174  float64_t val = abs_val ? CMath::abs(alpha) : alpha;
// vec2 must have exactly one entry per hashed dimension.
175  ASSERT(vec2_len == dim)
176 
177  SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx1);
178 
// The per-index hashes are only needed for the cross terms, so the cache has
// length 0 when quadratic features are disabled.
179  int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
180  SGVector<uint32_t> hash_cache(hash_cache_size);
181 
// Linear pass: hash each index i and map it to bucket h_idx % dim.
182  for (index_t i=0; i<vec.vlen; i++)
183  {
184  uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
185 
186  if (use_quadratic)
187  hash_cache[i] = h_idx;
188 
// Linear terms are added when quadratic features are off, or when they are on
// and keep_linear_terms is set.
189  if ( (!use_quadratic) || keep_linear_terms)
190  vec2[h_idx % dim] += val * vec[i];
191  }
192 
193  if (use_quadratic)
194  {
195  for (index_t i=0; i<vec.size(); i++)
196  {
// Squared term vec[i]*vec[i]: bucket comes from hashing n_idx = i*size + i.
197  int32_t n_idx = i * vec.size() + i;
198  uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
199  vec2[idx] += val * vec[i] * vec[i];
200 
// Cross terms vec[i]*vec[j] for j > i: bucket is the XOR of the two cached
// per-index hashes, reduced modulo dim.
201  for (index_t j=i+1; j<vec.size(); j++)
202  {
203  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
204  vec2[idx] += val * vec[i] * vec[j];
205  }
206  }
207  }
// Hand the vector back to dense_feats.
208  dense_feats->free_feature_vector(vec, vec_idx1);
209 }
210 
// NOTE(review): listing line 212 (the signature) is absent from this listing,
// so the function cannot be identified from the visible text; presumably it is
// get_nnz_features_for_vector() - confirm.
// The visible body returns the member dim.
211 template <class ST>
213 {
214  return dim;
215 }
216 
// NOTE(review): listing lines 218 (the signature) and 220 (one body line) are
// absent from this listing; presumably this is get_feature_iterator() -
// confirm.
// The visible body returns NULL.
217 template <class ST>
219 {
221  return NULL;
222 }
// NOTE(review): listing lines 224 (first line of the signature) and 227 (one
// body line) are absent from this listing; presumably this is
// get_next_feature() - confirm.
// The visible body does not use `iterator` and returns false.
223 template <class ST>
225  void* iterator)
226 {
228  return false;
229 }
// NOTE(review): listing lines 231 (the signature) and 233 (one body line) are
// absent from this listing; presumably this is free_feature_iterator() -
// confirm.
// No statements of the body are visible.
230 template <class ST>
232 {
234 }
235 
// NOTE(review): listing line 237 (the signature) is absent from this listing;
// presumably this is get_name(), whose result dot() compares with strcmp -
// confirm.
// Returns the string literal "HashedDenseFeatures".
236 template <class ST>
238 {
239  return "HashedDenseFeatures";
240 }
241 
// NOTE(review): listing line 243 (the signature) is absent from this listing;
// presumably this is get_feature_type() - confirm.
// Returns F_UINT for every instantiation; the value does not depend on ST.
242 template <class ST>
244 {
245  return F_UINT;
246 }
247 
// NOTE(review): listing line 249 (the signature) is absent from this listing;
// presumably this is get_feature_class() - confirm.
// Returns C_SPARSE.
248 template <class ST>
250 {
251  return C_SPARSE;
252 }
253 
// NOTE(review): listing line 255 (the signature) is absent from this listing;
// presumably this is get_num_vectors() - confirm.
// Returns dense_feats->get_num_vectors(). dense_feats is dereferenced without
// a NULL check, and the size-only constructor leaves it NULL.
254 template <class ST>
256 {
257  return dense_feats->get_num_vectors();
258 }
259 
// Produces the hashed vector for one feature vector of dense_feats.
// NOTE(review): listing lines 261 (the signature) and 264 are absent from this
// listing. `vec_idx` is presumably the parameter, and `hashed_vec` is
// presumably declared on line 264 as the result of a hashing call whose
// argument list continues on line 265 - confirm.
260 template <class ST>
262 {
263  SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx);
265  vec, dim, use_quadratic, keep_linear_terms);
// Hand the vector back to dense_feats before returning the hashed result.
266  dense_feats->free_feature_vector(vec, vec_idx);
267  return hashed_vec;
268 }
269 
// Hashes a dense vector into `dim` buckets and returns the non-zero buckets as
// an SGSparseVector<ST>.
// NOTE(review): listing line 271 (first line of the signature) is absent from
// this listing; `vec` and `dim` are presumably parameters declared there -
// confirm.
// NOTE(review): dim is used as a modulus below, so it must not be 0.
270 template <class ST>
272  bool use_quadratic, bool keep_linear_terms)
273 {
// Dense accumulator with one slot per bucket, zero-filled.
274  SGVector<ST> h_vec(dim);
275  SGVector<ST>::fill_vector(h_vec.vector, dim, 0);
276 
// The per-index hashes are only needed for the cross terms, so the cache has
// length 0 when quadratic features are disabled.
277  int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
278  SGVector<uint32_t> hash_cache(hash_cache_size);
279 
// Linear pass: hash each index i and map it to bucket hash % dim.
280  for (index_t i=0; i<vec.size(); i++)
281  {
282  uint32_t hash = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
283  if (use_quadratic)
284  hash_cache[i] = hash;
285 
// Linear terms are accumulated when quadratic features are off, or when they
// are on and keep_linear_terms is set.
286  if ( (!use_quadratic) || keep_linear_terms)
287  h_vec[hash % dim] += vec[i];
288  }
289 
290  if (use_quadratic)
291  {
292  for (index_t i=0; i<vec.size(); i++)
293  {
// Squared term vec[i]*vec[i]: bucket comes from hashing n_idx = i*size + i.
294  index_t n_idx = i * vec.size() + i;
295  uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
296  h_vec[idx] += vec[i] * vec[i];
297 
// Cross terms vec[i]*vec[j] for j > i: bucket is the XOR of the two cached
// per-index hashes, reduced modulo dim.
298  for (index_t j=i+1; j<vec.size(); j++)
299  {
300  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
301  h_vec[idx] += vec[i] * vec[j];
302  }
303  }
304  }
305 
// First pass over the buckets: count the non-zero entries so the sparse vector
// can be created with that many entries.
306  int32_t num_nnz_feats = 0;
307  for(index_t i=0; i<dim; i++)
308  {
309  if (h_vec[i]!=0)
310  num_nnz_feats++;
311  }
312 
313  SGSparseVector<ST> hashed_vector(num_nnz_feats);
314 
// Second pass: copy each non-zero bucket as (feat_index, entry), in increasing
// bucket order.
315  int32_t sparse_feat_index = 0;
316  for (index_t i=0; i<dim; i++)
317  {
318  if (h_vec[i]!=0)
319  {
320  hashed_vector.features[sparse_feat_index].feat_index = i;
321  hashed_vector.features[sparse_feat_index++].entry = h_vec[i];
322  }
323  }
324 
325  return hashed_vector;
326 }
327 
// Explicit instantiations of CHashedDenseFeatures, one per element type
// listed below, so the member definitions above are emitted from this file
// for each of these types.
328 template class CHashedDenseFeatures<bool>;
329 template class CHashedDenseFeatures<char>;
330 template class CHashedDenseFeatures<int8_t>;
331 template class CHashedDenseFeatures<uint8_t>;
332 template class CHashedDenseFeatures<int16_t>;
333 template class CHashedDenseFeatures<uint16_t>;
334 template class CHashedDenseFeatures<int32_t>;
335 template class CHashedDenseFeatures<uint32_t>;
336 template class CHashedDenseFeatures<int64_t>;
337 template class CHashedDenseFeatures<uint64_t>;
338 template class CHashedDenseFeatures<float32_t>;
339 template class CHashedDenseFeatures<float64_t>;
340 template class CHashedDenseFeatures<floatmax_t>;
341 }

SHOGUN Machine Learning Toolbox - Documentation