SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
HashedSparseFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
13 #include <shogun/base/Parameter.h>
14 #include <shogun/lib/Hash.h>
15 #include <shogun/io/SGIO.h>
19 #include <string.h>
20 #include <iostream>
21 
22 namespace shogun {
23 
24 template <class ST>
26  bool keep_lin_terms) : CDotFeatures(size)
27 {
28  init(NULL, 0, use_quadr, keep_lin_terms);
29 }
30 
31 template <class ST>
33  bool use_quadr, bool keep_lin_terms) : CDotFeatures()
34 {
35  init(feats, d, use_quadr, keep_lin_terms);
36 }
37 
38 template <class ST>
40  bool use_quadr, bool keep_lin_terms) : CDotFeatures()
41 {
42  CSparseFeatures<ST>* feats = new CSparseFeatures<ST>(matrix);
43  init(feats, d, use_quadr, keep_lin_terms);
44 }
45 
46 template <class ST>
47 CHashedSparseFeatures<ST>::CHashedSparseFeatures(CFile* loader, int32_t d, bool use_quadr,
48  bool keep_lin_terms) : CDotFeatures(loader)
49 {
51  feats->load(loader);
52  init(feats, d, use_quadr, keep_lin_terms);
53 }
54 
55 template <class ST>
56 void CHashedSparseFeatures<ST>::init(CSparseFeatures<ST>* feats, int32_t d, bool use_quadr,
57  bool keep_lin_terms)
58 {
59  dim = d;
60  use_quadratic = use_quadr;
61  keep_linear_terms = keep_lin_terms;
62  sparse_feats = feats;
63  SG_REF(sparse_feats);
64 
65  SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features",
67  SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not",
69  SG_ADD(&dim, "dim", "Dimension of new feature space", MS_NOT_AVAILABLE);
70  SG_ADD((CSGObject** ) &sparse_feats, "sparse_feats", "Sparse features to work on",
72 
73  set_generic<ST>();
74 }
75 
76 template <class ST>
78 : CDotFeatures(orig)
79 {
80  init(orig.sparse_feats, orig.dim, orig.use_quadratic, orig.keep_linear_terms);
81 }
82 
83 template <class ST>
85 {
86  SG_UNREF(sparse_feats);
87 }
88 
89 template <class ST>
91 {
92  return new CHashedSparseFeatures<ST>(*this);
93 }
94 
95 template <class ST>
97 {
98  return dim;
99 }
100 
101 template <class ST>
103  int32_t vec_idx) const
104 {
105  return CHashedSparseFeatures<ST>::hash_vector(sparse_feats->get_sparse_feature_vector(vec_idx),
106  dim, use_quadratic, keep_linear_terms);
107 }
108 
109 template <class ST>
111  bool use_quadratic, bool keep_linear_terms)
112 {
113  return CHashedDenseFeatures<ST>::hash_vector(vec, dim, use_quadratic, keep_linear_terms);
114 }
115 
116 template <class ST>
118  bool use_quadratic, bool keep_linear_terms)
119 {
120  SGVector<ST> h_vec(dim);
121  SGVector<ST>::fill_vector(h_vec, dim, 0);
122 
123  int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
124  SGVector<uint32_t> hash_cache(hash_cache_size);
125 
126  for (index_t i=0; i<vec.num_feat_entries; i++)
127  {
128  uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
129  vec.features[i].feat_index);
130 
131  if (use_quadratic)
132  hash_cache[i] = hash;
133 
134  if ( (!use_quadratic) || keep_linear_terms )
135  h_vec[hash % dim] += vec.features[i].entry;
136  }
137 
138  if (use_quadratic)
139  {
140  for (index_t i=0; i<vec.num_feat_entries; i++)
141  {
142  index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
143  index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof(index_t),
144  vec.features[i].feat_index) % dim;
145 
146  h_vec[idx] += vec.features[i].entry * vec.features[i].entry;
147 
148  for (index_t j=i+1; j<vec.num_feat_entries; j++)
149  {
150  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
151  h_vec[idx] += vec.features[i].entry * vec.features[j].entry;
152  }
153  }
154  }
155 
156  int32_t num_nnz_features = 0;
157  for (index_t i=0; i<dim; i++)
158  {
159  if (h_vec[i]!=0)
160  num_nnz_features++;
161  }
162 
163  SGSparseVector<ST> sv(num_nnz_features);
164 
165  int32_t sparse_index = 0;
166  for (index_t i=0; i<dim; i++)
167  {
168  if (h_vec[i]!=0)
169  {
170  sv.features[sparse_index].entry = h_vec[i];
171  sv.features[sparse_index++].feat_index = i;
172  }
173  }
174 
175  return sv;
176 }
177 
178 template <class ST>
180  int32_t vec_idx2)
181 {
182  ASSERT(df)
183  ASSERT(df->get_feature_type() == get_feature_type())
184  ASSERT(df->get_feature_class() == get_feature_class())
185  ASSERT(strcmp(df->get_name(), get_name())==0)
186 
188  SGSparseVector<ST> vec_1 = get_hashed_feature_vector(vec_idx1);
189  SGSparseVector<ST> vec_2 = feats->get_hashed_feature_vector(vec_idx2);
190 
191  float64_t result = vec_1.sparse_dot(vec_2);
192  return result;
193 }
194 
195 template <class ST>
197  int32_t vec2_len)
198 {
199  ASSERT(vec2_len == dim)
200 
201  SGSparseVector<ST> vec = sparse_feats->get_sparse_feature_vector(vec_idx1);
202 
203  int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
204  SGVector<uint32_t> hash_cache(hash_cache_size);
205 
206  float64_t result = 0;
207  for (index_t i=0; i<vec.num_feat_entries; i++)
208  {
209  uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
210  vec.features[i].feat_index);
211 
212  if (use_quadratic)
213  hash_cache[i] = hash;
214 
215  if ( (!use_quadratic) || keep_linear_terms)
216  result += vec2[hash % dim] * vec.features[i].entry;
217  }
218 
219  if (use_quadratic)
220  {
221  for (index_t i=0; i<vec.num_feat_entries; i++)
222  {
223  index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
224  index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t),
225  vec.features[i].feat_index) % dim;
226 
227  result += vec2[idx] * vec.features[i].entry * vec.features[i].entry;
228 
229  for (index_t j=i+1; j<vec.num_feat_entries; j++)
230  {
231  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
232  result += vec2[idx] * vec.features[i].entry * vec.features[j].entry;
233  }
234  }
235  }
236 
237  sparse_feats ->free_feature_vector(vec_idx1);
238  return result;
239 }
240 
241 template <class ST>
243  float64_t* vec2, int32_t vec2_len, bool abs_val)
244 {
245  float64_t val = abs_val ? CMath::abs(alpha) : alpha;
246  ASSERT(vec2_len == dim)
247 
248  SGSparseVector<ST> vec = sparse_feats->get_sparse_feature_vector(vec_idx1);
249 
250  int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
251  SGVector<uint32_t> hash_cache(hash_cache_size);
252 
253  for (index_t i=0; i<vec.num_feat_entries; i++)
254  {
255  uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
256  vec.features[i].feat_index);
257  if (use_quadratic)
258  hash_cache[i] = hash;
259 
260  if ( (!use_quadratic) || keep_linear_terms)
261  vec2[hash % dim] += val * vec.features[i].entry;
262  }
263 
264  if (use_quadratic)
265  {
266  for (index_t i=0; i<vec.num_feat_entries; i++)
267  {
268  index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
269  index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t),
270  vec.features[i].feat_index) % dim;
271 
272  vec2[idx] += val * vec.features[i].entry * vec.features[i].entry;
273 
274  for (index_t j=i+1; j<vec.num_feat_entries; j++)
275  {
276  idx = (hash_cache[i] ^ hash_cache[j]) % dim;
277  vec2[idx] += val * vec.features[i].entry * vec.features[j].entry;
278  }
279  }
280  }
281  sparse_feats ->free_feature_vector(vec_idx1);
282 }
283 
284 template <class ST>
286 {
287  return dim;
288 }
289 
290 template <class ST>
292 {
294  return NULL;
295 }
296 template <class ST>
298  void* iterator)
299 {
301  return false;
302 }
303 template <class ST>
305 {
307 }
308 
309 template <class ST>
311 {
312  return "HashedSparseFeatures";
313 }
314 
315 template <class ST>
317 {
318  return F_UINT;
319 }
320 
321 template <class ST>
323 {
324  return C_SPARSE;
325 }
326 
327 template <class ST>
329 {
330  return sparse_feats ->get_num_vectors();
331 }
332 
333 template class CHashedSparseFeatures <bool>;
334 template class CHashedSparseFeatures <char>;
335 template class CHashedSparseFeatures <int8_t>;
336 template class CHashedSparseFeatures <uint8_t>;
337 template class CHashedSparseFeatures <int16_t>;
338 template class CHashedSparseFeatures <uint16_t>;
339 template class CHashedSparseFeatures <int32_t>;
340 template class CHashedSparseFeatures <uint32_t>;
341 template class CHashedSparseFeatures <int64_t>;
342 template class CHashedSparseFeatures <uint64_t>;
343 template class CHashedSparseFeatures <float32_t>;
344 template class CHashedSparseFeatures <float64_t>;
346 }
virtual const char * get_name() const =0
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:223
T sparse_dot(const SGSparseVector< T > &v)
int32_t index_t
Definition: common.h:62
CSparseFeatures< ST > * sparse_feats
virtual int32_t get_dim_feature_space() const
static SGSparseVector< ST > hash_vector(SGVector< ST > vec, int32_t dim, bool use_quadratic=false, bool keep_linear_terms=true)
Template class SparseFeatures implements sparse matrices.
CHashedSparseFeatures(int32_t size=0, bool use_quadr=false, bool keep_lin_terms=true)
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:139
virtual EFeatureType get_feature_type() const
Features that support dot products among other operations.
Definition: DotFeatures.h:44
#define SG_REF(x)
Definition: SGObject.h:51
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
static uint32_t MurmurHash3(uint8_t *data, int32_t len, uint32_t seed)
Definition: Hash.cpp:366
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:112
shogun vector
virtual int32_t get_nnz_features_for_vector(int32_t num)
double float64_t
Definition: common.h:50
This class is identical to the CDenseFeatures class except that it hashes each dimension to a new fea...
virtual float64_t dot(int32_t vec_idx1, CDotFeatures *df, int32_t vec_idx2)
A File access base class.
Definition: File.h:34
virtual void free_feature_iterator(void *iterator)
virtual EFeatureClass get_feature_class() const =0
virtual void add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t *vec2, int32_t vec2_len, bool abs_val=false)
void load(CFile *loader)
SGSparseVectorEntry< T > * features
virtual const char * get_name() const
virtual CFeatures * duplicate() const
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
#define SG_UNREF(x)
Definition: SGObject.h:52
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
virtual int32_t get_num_vectors() const
The class Features is the base class of all feature objects.
Definition: Features.h:68
virtual EFeatureClass get_feature_class() const
SGSparseVector< ST > get_hashed_feature_vector(int32_t vec_idx) const
virtual void * get_feature_iterator(int32_t vector_index)
virtual bool get_next_feature(int32_t &index, float64_t &value, void *iterator)
#define SG_ADD(...)
Definition: SGObject.h:81
virtual float64_t dense_dot(int32_t vec_idx1, const float64_t *vec2, int32_t vec2_len)
static SGSparseVector< ST > hash_vector(SGVector< ST > vec, int32_t dim, bool use_quadratic=false, bool keep_linear_terms=true)
virtual EFeatureType get_feature_type() const =0
static T abs(T a)
Definition: Math.h:179

SHOGUN Machine Learning Toolbox - Documentation