SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashedDocDotFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
13 #include <shogun/lib/Hash.h>
15 
16 namespace shogun
17 {
19  CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips, int32_t size) : CDotFeatures(size)
20 {
// Constructor body. The first line of the signature is absent from this
// listing; `size` is forwarded to the CDotFeatures base class.
// An n-gram count below 1 is replaced by 1.
21  if (n_grams < 1)
22  n_grams = 1;
23 
// The skip count is reset to 0 when it is negative, or when it is non-zero
// while n_grams is 1.
24  if ( (n_grams==1 && skips!=0) || (skips<0))
25  skips = 0;
26 
// NOTE(review): hash_bits and docs are presumably parameters declared on the
// missing first signature line - confirm against the header.
27  init(hash_bits, docs, tzer, normalize, n_grams, skips);
28 }
29 
31 : CDotFeatures(orig)
32 {
// Copy-style constructor body (signature line absent from this listing).
// Calls init() with the settings read from `orig`: bit count, document
// collection pointer, tokenizer pointer, normalization flag, n-gram count
// and number of tokens to skip.
33  init(orig.num_bits, orig.doc_collection, orig.tokenizer, orig.should_normalize,
34  orig.ngrams, orig.tokens_to_skip);
35 }
36 
38 {
40 }
41 
// Stores the given settings in the corresponding members and registers the
// members through SG_ADD.
// Several lines of this function are absent from this listing (the statement
// at the start of the `if (!tokenizer)` body, the trailing argument lines of
// most SG_ADD calls, and the statements just before the closing brace).
42 void CHashedDocDotFeatures::init(int32_t hash_bits, CStringFeatures<char>* docs,
43  CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips)
44 {
45  num_bits = hash_bits;
46  ngrams = n_grams;
47  tokens_to_skip = skips;
48  doc_collection = docs;
49  tokenizer = tzer;
50  should_normalize = normalize;
51 
// Fallback when no tokenizer was supplied.
// NOTE(review): the missing line presumably assigns a new CDelimiterTokenizer
// to `tokenizer` before it is configured for whitespace - confirm.
52  if (!tokenizer)
53  {
55  ((CDelimiterTokenizer* )tokenizer)->init_for_whitespace();
56  }
57 
// Parameter registration; only the first line of most calls is visible here.
58  SG_ADD(&num_bits, "num_bits", "Number of bits of hash", MS_NOT_AVAILABLE);
59  SG_ADD(&ngrams, "ngrams", "Number of tokens to combine for quadratic feature support",
61  SG_ADD(&tokens_to_skip, "tokens_to_skip", "Number of tokens to skip when combining features",
63  SG_ADD((CSGObject**) &doc_collection, "doc_collection", "Document collection",
65  SG_ADD((CSGObject**) &tokenizer, "tokenizer", "Document tokenizer",
67  SG_ADD(&should_normalize, "should_normalize", "Normalize or not the dot products",
69 
72 }
73 
75 {
78 }
79 
81 {
// Returns 2 raised to the power num_bits (signature line absent from this
// listing).
82  return CMath::pow(2, num_bits);
83 }
84 
// Dot product between document vec_idx1 of this object and document vec_idx2
// of `df`. Both documents are run through `converter` to obtain sparse
// vectors. The declarations of hddf, sv1, converter and result are on lines
// that are absent from this listing.
85 float64_t CHashedDocDotFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
86 {
87  ASSERT(df)
// NOTE(review): if get_name() returns a C string, this compares pointer
// values rather than string contents - confirm the intended check.
88  ASSERT(df->get_name() == get_name())
89 
91 
93  SGVector<char> sv2 = hddf->doc_collection->get_feature_vector(vec_idx2);
94 
// Convert both raw documents into sparse vectors.
97  SGSparseVector<float64_t> cv1 = converter->apply(sv1);
98  SGSparseVector<float64_t> cv2 = converter->apply(sv2);
100 
// Hand the raw documents back to their collections and drop the converter
// reference before returning.
101  doc_collection->free_feature_vector(sv1, vec_idx1);
102  hddf->doc_collection->free_feature_vector(sv2, vec_idx2);
103  SG_UNREF(converter);
104 
// NOTE(review): `result` is presumably computed from cv1 and cv2 on the
// missing line - confirm.
105  return result;
106 }
107 
109 {
// Forwards to dense_dot() using vec2's raw data pointer and length
// (signature line absent from this listing).
110  return dense_dot(vec_idx1, vec2.vector, vec2.vlen);
111 }
112 
// Accumulates entries of the dense vector vec2 at the hashed indices derived
// from the tokens of document vec_idx1, optionally normalising the sum.
// vec2_len must equal 2^num_bits.
// The declarations of sv, hashes and hashed_indices, and the statements that
// fill hashed_indices, are on lines that are absent from this listing.
113 float64_t CHashedDocDotFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
114 {
115  ASSERT(vec2_len == CMath::pow(2,num_bits))
117 
120 
121  float64_t result = 0;
// Work on a copy of the tokenizer; it is released at the end of the function.
122  CTokenizer* local_tzer = tokenizer->get_copy();
123 
// NOTE(review): 0xdeadbeaf does not fit in a positive int32_t, so the stored
// value depends on the integer conversion - confirm this is intended.
124  const int32_t seed = 0xdeadbeaf;
125  local_tzer->set_text(sv);
126  index_t start = 0;
127  int32_t n = 0;
// Priming phase: hash up to ngrams-1+tokens_to_skip tokens into `hashes`
// without touching `result`.
128  while (n<ngrams-1+tokens_to_skip && local_tzer->has_next())
129  {
// NOTE(review): `start` is presumably updated by next_token_idx() to the
// first index of the token - confirm against CTokenizer.
130  index_t end = local_tzer->next_token_idx(start);
131  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
132  hashes->append_element(token_hash);
133  n++;
134  }
135 
// Main phase: for every remaining token, append its hash, add up vec2 at the
// indices held in hashed_indices, then drop the oldest hash.
136  while (local_tzer->has_next())
137  {
138  index_t end = local_tzer->next_token_idx(start);
139  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
140  hashes->append_element(token_hash);
143 
144  for (index_t i=0; i<hashed_indices->get_num_elements(); i++)
145  result += vec2[hashed_indices->get_element(i)];
146 
147  hashes->delete_element(0);
148  }
149 
// Drain phase (only when ngrams > 1): keep accumulating while hashes remain,
// removing the oldest hash each round.
150  if (ngrams>1)
151  {
152  while (hashes->get_num_elements()>0)
153  {
156 
157  for (index_t i=0; i<hashed_indices->get_num_elements(); i++)
158  result += vec2[hashed_indices->get_element(i)];
159 
160  hashes->delete_element(0);
161  }
162  }
163  doc_collection->free_feature_vector(sv, vec_idx1);
164  SG_UNREF(local_tzer);
165  SG_UNREF(hashes);
166  SG_UNREF(hashed_indices);
// With normalisation enabled the sum is divided by sqrt(sv.size()).
// NOTE(review): sv.size() is read after free_feature_vector(sv, ...) -
// confirm that the length is still valid at this point.
167  return should_normalize ? result / CMath::sqrt((float64_t) sv.size()) : result;
168 }
169 
171  float64_t* vec2, int32_t vec2_len, bool abs_val)
172 {
// Adds a constant `value` to vec2 at every hashed index derived from the
// tokens of a document. vec2_len must equal 2^num_bits.
// The first signature line, the declarations of sv, hashes and
// hashed_indices, and the statements that fill hashed_indices are absent
// from this listing.
// NOTE(review): alpha and vec_idx1 are presumably parameters declared on the
// missing signature line - confirm against the header.
173  ASSERT(vec2_len == CMath::pow(2,num_bits))
174 
175  if (abs_val)
176  alpha = CMath::abs(alpha);
177 
// With normalisation enabled, alpha is divided by sqrt(sv.size()).
179  const float64_t value = should_normalize ? alpha / CMath::sqrt((float64_t) sv.size()) : alpha;
180 
183 
// Work on a copy of the tokenizer; it is released at the end of the function.
184  CTokenizer* local_tzer = tokenizer->get_copy();
185 
186  const int32_t seed = 0xdeadbeaf;
187  local_tzer->set_text(sv);
188  index_t start = 0;
189  int32_t n = 0;
// Priming phase: hash up to ngrams-1+tokens_to_skip tokens into `hashes`
// without modifying vec2.
190  while (n<ngrams-1+tokens_to_skip && local_tzer->has_next())
191  {
192  index_t end = local_tzer->next_token_idx(start);
193  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
194  hashes->append_element(token_hash);
195  n++;
196  }
197 
// Main phase: for every remaining token, append its hash, add `value` to vec2
// at the indices held in hashed_indices, then drop the oldest hash.
198  while (local_tzer->has_next())
199  {
200  index_t end = local_tzer->next_token_idx(start);
201  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
202  hashes->append_element(token_hash);
205 
206  for (index_t i=0; i<hashed_indices->get_num_elements(); i++)
207  vec2[hashed_indices->get_element(i)] += value;
208 
209  hashes->delete_element(0);
210  }
211 
// Drain phase (only when ngrams > 1): keep updating vec2 while hashes remain,
// removing the oldest hash each round.
212  if (ngrams>1)
213  {
214  while (hashes->get_num_elements()>0)
215  {
218 
219  for (index_t i=0; i<hashed_indices->get_num_elements(); i++)
220  vec2[hashed_indices->get_element(i)] += value;
221 
222  hashes->delete_element(0);
223  }
224  }
225 
226  doc_collection->free_feature_vector(sv, vec_idx1);
227  SG_UNREF(local_tzer);
228  SG_UNREF(hashes);
229  SG_UNREF(hashed_indices);
230 }
231 
233  int32_t length, int32_t num_bits, uint32_t seed)
234 {
// Hashes `length` bytes starting at `token` with MurmurHash3 and keeps only
// the lowest num_bits bits of the result (first signature line absent from
// this listing).
235  int32_t hash = CHash::MurmurHash3((uint8_t* ) token, length, seed);
// NOTE(review): the mask is built with a signed `1 << num_bits`; for
// num_bits >= 31 the shift/subtraction overflows int - confirm the permitted
// range of num_bits.
236  return hash & ((1 << num_bits) - 1);
237 }
238 
240 {
// Replaces the doc_collection member with `docs`. The signature and the
// statement preceding the assignment are absent from this listing.
242  doc_collection = docs;
243 }
244 
246 {
// Returns sv.size(). The signature, the declaration of sv and the statement
// between the two visible lines are absent from this listing.
248  int32_t num_nnz_features = sv.size();
250  return num_nnz_features;
251 }
252 
254 {
// Always returns NULL. The signature and the statement before the return are
// absent from this listing.
256  return NULL;
257 }
258 
// Always returns false; index and value are not written by any visible line.
// The statement before the return is absent from this listing.
259 bool CHashedDocDotFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
260 {
262  return false;
263 }
264 
266 {
268 }
269 
271 {
// Returns the literal class name string (signature line absent from this
// listing).
272  return "HashedDocDotFeatures";
273 }
274 
276 {
// Returns a newly allocated CHashedDocDotFeatures constructed from *this
// (signature line absent from this listing).
277  return new CHashedDocDotFeatures(*this);
278 }
279 
281 {
// Returns the constant F_UINT (signature line absent from this listing).
282  return F_UINT;
283 }
284 
286 {
// Returns the constant C_SPARSE (signature line absent from this listing).
287  return C_SPARSE;
288 }
289 
291 {
293 }
294 }

SHOGUN Machine Learning Toolbox - Documentation