SHOGUN v3.0.0
HashedDocConverter.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2013 Evangelos Anagnostopoulos
 * Copyright (C) 2013 Evangelos Anagnostopoulos
 */

#include <shogun/converter/HashedDocConverter.h>
#include <shogun/lib/DelimiterTokenizer.h>
#include <shogun/lib/DynamicArray.h>
#include <shogun/lib/Hash.h>
#include <shogun/mathematics/Math.h>
#include <shogun/features/StringFeatures.h>
#include <shogun/features/SparseFeatures.h>

using namespace shogun;

namespace shogun
{
CHashedDocConverter::CHashedDocConverter() : CConverter()
{
	init(NULL, 16, false, 1, 0);
}

CHashedDocConverter::CHashedDocConverter(int32_t hash_bits, bool normalize,
	int32_t n_grams, int32_t skips) : CConverter()
{
	init(NULL, hash_bits, normalize, n_grams, skips);
}

CHashedDocConverter::CHashedDocConverter(CTokenizer* tzer,
	int32_t hash_bits, bool normalize, int32_t n_grams, int32_t skips) : CConverter()
{
	init(tzer, hash_bits, normalize, n_grams, skips);
}

CHashedDocConverter::~CHashedDocConverter()
{
	SG_UNREF(tokenizer);
}

void CHashedDocConverter::init(CTokenizer* tzer, int32_t hash_bits, bool normalize,
	int32_t n_grams, int32_t skips)
{
	num_bits = hash_bits;
	should_normalize = normalize;
	ngrams = n_grams;
	tokens_to_skip = skips;

	/* if no tokenizer was specified, fall back to a whitespace/tab tokenizer */
	if (tzer==NULL)
	{
		CDelimiterTokenizer* tk = new CDelimiterTokenizer();
		tk->delimiters[(uint8_t) ' '] = 1;
		tk->delimiters[(uint8_t) '\t'] = 1;
		tokenizer = tk;
	}
	else
		tokenizer = tzer;

	SG_REF(tokenizer);
	SG_ADD(&num_bits, "num_bits", "Number of bits of the hash",
		MS_NOT_AVAILABLE);
	SG_ADD(&ngrams, "ngrams", "Number of consecutive tokens",
		MS_NOT_AVAILABLE);
	SG_ADD(&tokens_to_skip, "tokens_to_skip", "Number of tokens to skip",
		MS_NOT_AVAILABLE);
	SG_ADD(&should_normalize, "should_normalize", "Whether to normalize vectors or not",
		MS_NOT_AVAILABLE);
	m_parameters->add((CSGObject**) &tokenizer, "tokenizer",
		"Tokenizer");
}

const char* CHashedDocConverter::get_name() const
{
	return "HashedDocConverter";
}

CFeatures* CHashedDocConverter::apply(CFeatures* features)
{
	ASSERT(features);
	if (strcmp(features->get_name(), "StringFeatures")!=0)
		SG_ERROR("CHashedDocConverter::apply() : CFeatures object passed is not of type CStringFeatures.");

	CStringFeatures<char>* s_features = (CStringFeatures<char>*) features;

	int32_t dim = CMath::pow(2, num_bits);
	SGSparseMatrix<float64_t> matrix(dim, features->get_num_vectors());
	for (index_t vec_idx=0; vec_idx<s_features->get_num_vectors(); vec_idx++)
	{
		SGVector<char> doc = s_features->get_feature_vector(vec_idx);
		matrix[vec_idx] = apply(doc);
		s_features->free_feature_vector(doc, vec_idx);
	}

	return (CFeatures*) new CSparseFeatures<float64_t>(matrix);
}

SGSparseVector<float64_t> CHashedDocConverter::apply(SGVector<char> document)
{
	ASSERT(document.size()>0)
	const int32_t array_size = 1024*1024;
	CDynamicArray<uint32_t> hashed_indices(array_size);

	/* cached_hashes holds the hashes of the tokens currently in the n-gram window,
	 * ngram_indices receives the n-gram hashes generated from that window */
	CDynamicArray<uint32_t>* cached_hashes = new CDynamicArray<uint32_t>(ngrams+tokens_to_skip);
	CDynamicArray<index_t>* ngram_indices = new CDynamicArray<index_t>(1+(ngrams-1)*(tokens_to_skip+1));

	const int32_t seed = 0xdeadbeaf;
	tokenizer->set_text(document);
	index_t token_start = 0;
	int32_t n = 0;

	/* hash the first ngrams-1+tokens_to_skip tokens to fill the window */
	while (n<ngrams-1+tokens_to_skip && tokenizer->has_next())
	{
		index_t end = tokenizer->next_token_idx(token_start);
		uint32_t token_hash = CHash::MurmurHash3((uint8_t*) &document.vector[token_start],
				end-token_start, seed);
		cached_hashes->append_element(token_hash);
		n++;
	}

	/* hash every remaining token and collect the n-gram hashes of the current window */
	while (tokenizer->has_next())
	{
		index_t end = tokenizer->next_token_idx(token_start);
		uint32_t token_hash = CHash::MurmurHash3((uint8_t*) &document.vector[token_start],
				end-token_start, seed);
		cached_hashes->append_element(token_hash);
		CHashedDocConverter::generate_ngram_hashes(cached_hashes, ngram_indices, num_bits,
				ngrams, tokens_to_skip);

		for (index_t i=0; i<ngram_indices->get_num_elements(); i++)
			hashed_indices.append_element(ngram_indices->get_element(i));

		cached_hashes->delete_element(0);
	}

	/* flush the n-gram hashes of the tokens still left in the window */
	if (ngrams>1)
	{
		while (cached_hashes->get_num_elements()>0)
		{
			CHashedDocConverter::generate_ngram_hashes(cached_hashes, ngram_indices, num_bits,
					ngrams, tokens_to_skip);

			for (index_t i=0; i<ngram_indices->get_num_elements(); i++)
				hashed_indices.append_element(ngram_indices->get_element(i));

			cached_hashes->delete_element(0);
		}
	}
	SG_UNREF(cached_hashes);
	SG_UNREF(ngram_indices);

	SGSparseVector<float64_t> sparse_doc_rep = create_hashed_representation(hashed_indices);

	/* if requested, normalize each entry by the square root of the document length */
	if (should_normalize)
	{
		float64_t norm_const = CMath::sqrt((float64_t) document.size());
		for (index_t i=0; i<sparse_doc_rep.num_feat_entries; i++)
			sparse_doc_rep.features[i].entry /= norm_const;
	}

	return sparse_doc_rep;
}

SGSparseVector<float64_t> CHashedDocConverter::create_hashed_representation(CDynamicArray<uint32_t>& hashed_indices)
{
	int32_t num_nnz_features = count_distinct_indices(hashed_indices);

	SGSparseVector<float64_t> sparse_doc_rep(num_nnz_features);
	index_t sparse_idx = 0;
	for (index_t i=0; i<hashed_indices.get_num_elements(); i++)
	{
		sparse_doc_rep.features[sparse_idx].feat_index = hashed_indices[i];
		sparse_doc_rep.features[sparse_idx].entry = 1;
		while ( (i+1<hashed_indices.get_num_elements()) &&
				(hashed_indices[i+1]==hashed_indices[i]) )
		{
			sparse_doc_rep.features[sparse_idx].entry++;
			i++;
		}
		sparse_idx++;
	}
	return sparse_doc_rep;
}

void CHashedDocConverter::generate_ngram_hashes(CDynamicArray<uint32_t>* hashes,
	CDynamicArray<index_t>* ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
{
	while (ngram_hashes->get_num_elements()>0)
		ngram_hashes->delete_element(0);

	/* the unigram hash of the oldest token, truncated to num_bits bits */
	ngram_hashes->append_element(hashes->get_element(0) & ((1 << num_bits) - 1));
	for (index_t n=1; n<ngrams; n++)
	{
		for (index_t s=0; s<=tokens_to_skip; s++)
		{
			if ( n+s >= hashes->get_num_elements())
				break;

			/* XOR the first hash with the hashes of tokens 1+s .. n+s, then truncate */
			uint32_t ngram_hash = hashes->get_element(0);
			for (index_t i=1+s; i<=n+s; i++)
				ngram_hash = ngram_hash ^ hashes->get_element(i);
			ngram_hash = ngram_hash & ((1 << num_bits) - 1);
			ngram_hashes->append_element(ngram_hash);
		}
	}
}
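
As an aside, the hashing scheme implemented above is easy to reproduce in isolation: every k-skip-n-gram index is the XOR of the MurmurHash values of the participating tokens, truncated to num_bits bits, which is why apply() allocates a feature space of CMath::pow(2, num_bits) dimensions. A minimal standalone sketch (plain C++, independent of the Shogun API; the token hashes and parameter values below are made up for illustration):

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
	// hypothetical MurmurHash values for a three-token window
	std::vector<uint32_t> hashes = {0x3f2a1c55u, 0x9d004e71u, 0x1b77aa02u};
	const int num_bits = 8;        // 2^8 = 256 feature dimensions
	const int ngrams = 2;          // emit unigrams and bigrams
	const int tokens_to_skip = 1;  // allow at most one skipped token
	const uint32_t mask = (1u << num_bits) - 1;

	// unigram index: the first token's hash, truncated to num_bits bits
	std::printf("unigram index: %u\n", (unsigned) (hashes[0] & mask));

	// n-gram indices: XOR the first hash with the hashes of tokens 1+s .. n+s
	for (int n = 1; n < ngrams; n++)
		for (int s = 0; s <= tokens_to_skip; s++)
		{
			if (n + s >= (int) hashes.size())
				break;
			uint32_t h = hashes[0];
			for (int i = 1 + s; i <= n + s; i++)
				h ^= hashes[i];
			std::printf("%d-gram (skip %d) index: %u\n", n + 1, s, (unsigned) (h & mask));
		}
	return 0;
}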

int32_t CHashedDocConverter::count_distinct_indices(CDynamicArray<uint32_t>& hashed_indices)
{
	CMath::qsort(hashed_indices.get_array(), hashed_indices.get_num_elements());

	/* count the number of distinct indices in the sorted array */
	int32_t num_nnz_features = 0;
	for (index_t i=0; i<hashed_indices.get_num_elements(); i++)
	{
		num_nnz_features++;
		while ( (i+1<hashed_indices.get_num_elements()) &&
				(hashed_indices[i+1]==hashed_indices[i]) )
		{
			i++;
		}
	}
	return num_nnz_features;
}

void CHashedDocConverter::set_normalization(bool normalize)
{
	should_normalize = normalize;
}

void CHashedDocConverter::set_k_skip_n_grams(int32_t k, int32_t n)
{
	tokens_to_skip = k;
	ngrams = n;
}
}
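
For context, a rough usage sketch of this converter (not part of the file above; it assumes Shogun is initialized, that string_feats already holds the documents as CStringFeatures<char>, and the parameter values are only illustrative):

#include <shogun/converter/HashedDocConverter.h>
#include <shogun/features/StringFeatures.h>
#include <shogun/features/SparseFeatures.h>

using namespace shogun;

// Convert raw documents into hashed bag-of-n-gram features of dimension 2^10.
CSparseFeatures<float64_t>* hash_documents(CStringFeatures<char>* string_feats)
{
	// 10 hash bits, normalize vectors, use bigrams, skip no tokens
	CHashedDocConverter* converter = new CHashedDocConverter(10, true, 2, 0);
	SG_REF(converter);

	// apply() returns CFeatures*; per the implementation above it is a CSparseFeatures<float64_t>
	CSparseFeatures<float64_t>* hashed_feats =
		(CSparseFeatures<float64_t>*) converter->apply(string_feats);

	SG_UNREF(converter);
	return hashed_feats;
}

The same converter also exposes set_normalization() and set_k_skip_n_grams() for changing these options after construction.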
