SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
HashedDocDotFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
13 #include <shogun/lib/Hash.h>
15 
16 namespace shogun
17 {
19  CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips, int32_t size) : CDotFeatures(size)
20 {
21  if (n_grams < 1)
22  n_grams = 1;
23 
24  if ( (n_grams==1 && skips!=0) || (skips<0))
25  skips = 0;
26 
27  init(hash_bits, docs, tzer, normalize, n_grams, skips);
28 }
29 
31 : CDotFeatures(orig)
32 {
33  init(orig.num_bits, orig.doc_collection, orig.tokenizer, orig.should_normalize,
34  orig.ngrams, orig.tokens_to_skip);
35 }
36 
38 {
	// NOTE(review): the destructor signature (embedded line 37) and body
	// (embedded line 39) are missing from this extracted listing; the body
	// presumably releases the references taken in init() on doc_collection
	// and/or tokenizer -- confirm against the original source file.
40 }
41 
42 void CHashedDocDotFeatures::init(int32_t hash_bits, CStringFeatures<char>* docs,
43  CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips)
44 {
45  num_bits = hash_bits;
46  ngrams = n_grams;
47  tokens_to_skip = skips;
48  doc_collection = docs;
49  tokenizer = tzer;
50  should_normalize = normalize;
51 
52  if (!tokenizer)
53  {
55  ((CDelimiterTokenizer* )tokenizer)->init_for_whitespace();
56  }
57 
58  SG_ADD(&num_bits, "num_bits", "Number of bits of hash", MS_NOT_AVAILABLE);
59  SG_ADD(&ngrams, "ngrams", "Number of tokens to combine for quadratic feature support",
61  SG_ADD(&tokens_to_skip, "tokens_to_skip", "Number of tokens to skip when combining features",
63  SG_ADD((CSGObject**) &doc_collection, "doc_collection", "Document collection",
65  SG_ADD((CSGObject**) &tokenizer, "tokenizer", "Document tokenizer",
67  SG_ADD(&should_normalize, "should_normalize", "Normalize or not the dot products",
69 
72 }
73 
75 {
78 }
79 
81 {
82  return CMath::pow(2, num_bits);
83 }
84 
85 float64_t CHashedDocDotFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
86 {
87  ASSERT(df)
88  ASSERT(df->get_name() == get_name())
89 
91 
93  SGVector<char> sv2 = hddf->doc_collection->get_feature_vector(vec_idx2);
94 
97  SGSparseVector<float64_t> cv1 = converter->apply(sv1);
98  SGSparseVector<float64_t> cv2 = converter->apply(sv2);
100 
101  doc_collection->free_feature_vector(sv1, vec_idx1);
102  hddf->doc_collection->free_feature_vector(sv2, vec_idx2);
103  SG_UNREF(converter);
104 
105  return result;
106 }
107 
109 {
110  return dense_dot(vec_idx1, vec2.vector, vec2.vlen);
111 }
112 
113 float64_t CHashedDocDotFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
114 {
115  ASSERT(vec2_len == CMath::pow(2,num_bits))
116 
118 
122  index_t hashes_start = 0;
123  index_t hashes_end = 0;
124  int32_t len = hashes.vlen - 1;
125 
128  SGVector<index_t> hashed_indices((ngrams-1)*(tokens_to_skip+1) + 1);
129 
130  float64_t result = 0;
131  CTokenizer* local_tzer = tokenizer->get_copy();
132 
134  const int32_t seed = 0xdeadbeaf;
135  local_tzer->set_text(sv);
136  index_t start = 0;
137  while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
138  {
139  index_t end = local_tzer->next_token_idx(start);
140  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
141  hashes[hashes_end++] = token_hash;
142  }
143 
145  while (local_tzer->has_next())
146  {
147  index_t end = local_tzer->next_token_idx(start);
148  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
149  hashes[hashes_end] = token_hash;
150 
151  CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start, len, hashed_indices,
153 
154  for (index_t i=0; i<hashed_indices.vlen; i++)
155  result += vec2[hashed_indices[i]];
156 
157  hashes_start++;
158  hashes_end++;
159  if (hashes_end==hashes.vlen)
160  hashes_end = 0;
161  if (hashes_start==hashes.vlen)
162  hashes_start = 0;
163  }
164 
165  if (ngrams>1)
166  {
167  while (hashes_start!=hashes_end)
168  {
169  len--;
170  index_t max_idx = CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start,
171  len, hashed_indices, num_bits, ngrams, tokens_to_skip);
172 
173  for (index_t i=0; i<max_idx; i++)
174  result += vec2[hashed_indices[i]];
175 
176  hashes_start++;
177  if (hashes_start==hashes.vlen)
178  hashes_start = 0;
179  }
180  }
181  doc_collection->free_feature_vector(sv, vec_idx1);
182  SG_UNREF(local_tzer);
183  return should_normalize ? result / CMath::sqrt((float64_t) sv.size()) : result;
184 }
185 
187  float64_t* vec2, int32_t vec2_len, bool abs_val)
188 {
189  ASSERT(vec2_len == CMath::pow(2,num_bits))
190 
191  if (abs_val)
192  alpha = CMath::abs(alpha);
193 
195  const float64_t value = should_normalize ? alpha / CMath::sqrt((float64_t) sv.size()) : alpha;
196 
200  index_t hashes_start = 0;
201  index_t hashes_end = 0;
202  index_t len = hashes.vlen - 1;
203 
206  SGVector<index_t> hashed_indices((ngrams-1)*(tokens_to_skip+1) + 1);
207 
208  CTokenizer* local_tzer = tokenizer->get_copy();
209 
211  const int32_t seed = 0xdeadbeaf;
212  local_tzer->set_text(sv);
213  index_t start = 0;
214  while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
215  {
216  index_t end = local_tzer->next_token_idx(start);
217  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
218  hashes[hashes_end++] = token_hash;
219  }
220 
221  while (local_tzer->has_next())
222  {
223  index_t end = local_tzer->next_token_idx(start);
224  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
225  hashes[hashes_end] = token_hash;
226 
227  CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start, len, hashed_indices,
229 
230  for (index_t i=0; i<hashed_indices.vlen; i++)
231  vec2[hashed_indices[i]] += value;
232 
233  hashes_start++;
234  hashes_end++;
235  if (hashes_end==hashes.vlen)
236  hashes_end = 0;
237  if (hashes_start==hashes.vlen)
238  hashes_start = 0;
239  }
240 
241  if (ngrams>1)
242  {
243  while (hashes_start!=hashes_end)
244  {
245  len--;
247  hashes_start, len, hashed_indices, num_bits, ngrams, tokens_to_skip);
248 
249  for (index_t i=0; i<max_idx; i++)
250  vec2[hashed_indices[i]] += value;
251 
252  hashes_start++;
253  if (hashes_start==hashes.vlen)
254  hashes_start = 0;
255  }
256  }
257 
258  doc_collection->free_feature_vector(sv, vec_idx1);
259  SG_UNREF(local_tzer);
260 }
261 
263  int32_t length, int32_t num_bits, uint32_t seed)
264 {
265  int32_t hash = CHash::MurmurHash3((uint8_t* ) token, length, seed);
266  return hash & ((1 << num_bits) - 1);
267 }
268 
270 {
272  doc_collection = docs;
273 }
274 
276 {
278  int32_t num_nnz_features = sv.size();
280  return num_nnz_features;
281 }
282 
284 {
286  return NULL;
287 }
288 
289 bool CHashedDocDotFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
290 {
292  return false;
293 }
294 
296 {
298 }
299 
301 {
302  return "HashedDocDotFeatures";
303 }
304 
306 {
307  return new CHashedDocDotFeatures(*this);
308 }
309 
311 {
312  return F_UINT;
313 }
314 
316 {
317  return C_SPARSE;
318 }
319 
321 {
	// NOTE(review): this member's signature (embedded line 320) and body
	// (embedded line 322) were lost during extraction and cannot be
	// identified from the visible cross-references; restore them from the
	// original HashedDocDotFeatures.cpp.
323 }
324 }
virtual const char * get_name() const =0
SGVector< ST > get_feature_vector(int32_t num)
virtual bool get_next_feature(int32_t &index, float64_t &value, void *iterator)
T sparse_dot(const SGSparseVector< T > &v)
int32_t index_t
Definition: common.h:62
virtual void free_feature_iterator(void *iterator)
virtual void set_text(SGVector< char > txt)
Definition: Tokenizer.cpp:17
virtual int32_t get_num_vectors() const
static index_t generate_ngram_hashes(SGVector< uint32_t > &hashes, index_t hashes_start, index_t len, SGVector< index_t > &ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
virtual EFeatureClass get_feature_class() const
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:139
virtual int32_t get_dim_feature_space() const
virtual float64_t dense_dot(int32_t vec_idx1, const float64_t *vec2, int32_t vec2_len)
virtual void * get_feature_iterator(int32_t vector_index)
virtual const char * get_name() const
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
Features that support dot products among other operations.
Definition: DotFeatures.h:44
#define SG_REF(x)
Definition: SGObject.h:51
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
static uint32_t MurmurHash3(uint8_t *data, int32_t len, uint32_t seed)
Definition: Hash.cpp:366
virtual float64_t dense_dot_sgvec(int32_t vec_idx1, const SGVector< float64_t > vec2)
int32_t size() const
Definition: SGVector.h:115
index_t vlen
Definition: SGVector.h:494
This class can be used to convert a document collection contained in a CStringFeatures&lt;char&gt; object into a hashed Bag-of-Words representation. (Brief truncated in extraction; see the CHashedDocConverter class documentation for the full description.)
virtual void add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t *vec2, int32_t vec2_len, bool abs_val=false)
#define ASSERT(x)
Definition: SGIO.h:201
CStringFeatures< char > * doc_collection
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:29
double float64_t
Definition: common.h:50
A File access base class.
Definition: File.h:34
virtual CFeatures * duplicate() const
virtual float64_t dot(int32_t vec_idx1, CDotFeatures *df, int32_t vec_idx2)
virtual bool has_next()=0
static uint32_t calculate_token_hash(char *token, int32_t length, int32_t num_bits, uint32_t seed)
void set_doc_collection(CStringFeatures< char > *docs)
virtual int32_t get_num_vectors() const
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
#define SG_UNREF(x)
Definition: SGObject.h:52
CHashedDocDotFeatures(int32_t hash_bits=0, CStringFeatures< char > *docs=NULL, CTokenizer *tzer=NULL, bool normalize=true, int32_t n_grams=1, int32_t skips=0, int32_t size=0)
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
The class CDelimiterTokenizer is used to tokenize a SGVector into tokens using custom chars as ...
virtual index_t next_token_idx(index_t &start)=0
The class Features is the base class of all feature objects.
Definition: Features.h:68
virtual CTokenizer * get_copy()=0
virtual EFeatureType get_feature_type() const
#define SG_ADD(...)
Definition: SGObject.h:81
static float32_t sqrt(float32_t x)
Definition: Math.h:459
virtual CFeatures * apply(CFeatures *features)
This class can be used to provide on-the-fly vectorization of a document collection. Like in the standard Bag-of-Words representation, this class considers each document as a collection of tokens, which are then hashed into a new feature space of a specified dimension. This class is very flexible and allows the user to specify the tokenizer used to tokenize each document, specify whether the results should be normalized with regards to the sqrt of the document size, as well as to specify whether he wants to combine different tokens. The latter implements a k-skip n-grams approach, meaning that you can combine up to n tokens, while skipping up to k. Eg. for the tokens ["a", "b", "c", "d"], with n_grams = 2 and skips = 2, one would get the following combinations : ["a", "ab", "ac" (skipped 1), "ad" (skipped 2), "b", "bc", "bd" (skipped 1), "c", "cd", "d"].
virtual int32_t get_nnz_features_for_vector(int32_t num)
static int32_t pow(bool x, int32_t n)
Definition: Math.h:535
static T abs(T a)
Definition: Math.h:179

SHOGUN Machine Learning Toolbox - Documentation