SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
HashedDocConverter.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
13 #include <shogun/lib/Hash.h>
18 
19 using namespace shogun;
20 
21 namespace shogun
22 {
24 {
25  init(NULL, 16, false, 1, 0);
26 }
27 
28 CHashedDocConverter::CHashedDocConverter(int32_t hash_bits, bool normalize,
29  int32_t n_grams, int32_t skips) : CConverter()
30 {
31  init(NULL, hash_bits, normalize, n_grams, skips);
32 }
33 
35  int32_t hash_bits, bool normalize, int32_t n_grams, int32_t skips) : CConverter()
36 {
37  init(tzer, hash_bits, normalize, n_grams, skips);
38 }
39 
41 {
43 }
44 
45 void CHashedDocConverter::init(CTokenizer* tzer, int32_t hash_bits, bool normalize,
46  int32_t n_grams, int32_t skips)
47 {
48  num_bits = hash_bits;
49  should_normalize = normalize;
50  ngrams = n_grams;
51  tokens_to_skip = skips;
52 
53  if (tzer==NULL)
54  {
56  tk->delimiters[(uint8_t) ' '] = 1;
57  tk->delimiters[(uint8_t) '\t'] = 1;
58  tokenizer = tk;
59  }
60  else
61  tokenizer = tzer;
62 
64  SG_ADD(&num_bits, "num_bits", "Number of bits of the hash",
66  SG_ADD(&ngrams, "ngrams", "Number of consecutive tokens",
68  SG_ADD(&tokens_to_skip, "tokens_to_skip", "Number of tokens to skip",
70  SG_ADD(&should_normalize, "should_normalize", "Whether to normalize vectors or not",
72  m_parameters->add((CSGObject**) &tokenizer, "tokenizer",
73  "Tokenizer");
74 }
75 
76 const char* CHashedDocConverter::get_name() const
77 {
78  return "HashedDocConverter";
79 }
80 
82 {
83  ASSERT(features);
84  if (strcmp(features->get_name(), "StringFeatures")!=0)
85  SG_ERROR("CHashedConverter::apply() : CFeatures object passed is not of type CStringFeatures.");
86 
87  CStringFeatures<char>* s_features = (CStringFeatures<char>*) features;
88 
89  int32_t dim = CMath::pow(2, num_bits);
90  SGSparseMatrix<float64_t> matrix(dim,features->get_num_vectors());
91  for (index_t vec_idx=0; vec_idx<s_features->get_num_vectors(); vec_idx++)
92  {
93  SGVector<char> doc = s_features->get_feature_vector(vec_idx);
94  matrix[vec_idx] = apply(doc);
95  s_features->free_feature_vector(doc, vec_idx);
96  }
97 
98  return (CFeatures*) new CSparseFeatures<float64_t>(matrix);
99 }
100 
102 {
103  ASSERT(document.size()>0)
104  const int32_t array_size = 1024*1024;
106  CDynamicArray<uint32_t> hashed_indices(array_size);
107 
111  index_t hashes_start = 0;
112  index_t hashes_end = 0;
113  int32_t len = cached_hashes.vlen - 1;
114 
117  SGVector<index_t> ngram_indices((ngrams-1)*(tokens_to_skip+1) + 1);
118 
120  const int32_t seed = 0xdeadbeaf;
121  tokenizer->set_text(document);
122  index_t token_start = 0;
123  while (hashes_end<ngrams-1+tokens_to_skip && tokenizer->has_next())
124  {
125  index_t end = tokenizer->next_token_idx(token_start);
126  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &document.vector[token_start],
127  end-token_start, seed);
128  cached_hashes[hashes_end++] = token_hash;
129  }
130 
132  while (tokenizer->has_next())
133  {
134  index_t end = tokenizer->next_token_idx(token_start);
135  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &document.vector[token_start],
136  end-token_start, seed);
137  cached_hashes[hashes_end] = token_hash;
138 
139  CHashedDocConverter::generate_ngram_hashes(cached_hashes, hashes_start, len,
140  ngram_indices, num_bits, ngrams, tokens_to_skip);
141 
142  for (index_t i=0; i<ngram_indices.vlen; i++)
143  hashed_indices.append_element(ngram_indices[i]);
144 
145  hashes_start++;
146  hashes_end++;
147  if (hashes_end==cached_hashes.vlen)
148  hashes_end = 0;
149  if (hashes_start==cached_hashes.vlen)
150  hashes_start = 0;
151  }
152 
154  if (ngrams>1)
155  {
156  while (hashes_start!=hashes_end)
157  {
158  len--;
159  index_t max_idx = CHashedDocConverter::generate_ngram_hashes(cached_hashes, hashes_start,
160  len, ngram_indices, num_bits, ngrams, tokens_to_skip);
161 
162  for (index_t i=0; i<max_idx; i++)
163  hashed_indices.append_element(ngram_indices[i]);
164 
165  hashes_start++;
166  if (hashes_start==cached_hashes.vlen)
167  hashes_start = 0;
168  }
169  }
170 
171  SGSparseVector<float64_t> sparse_doc_rep = create_hashed_representation(hashed_indices);
172 
174  if (should_normalize)
175  {
176  float64_t norm_const = CMath::sqrt((float64_t) document.size());
177  for (index_t i=0; i<sparse_doc_rep.num_feat_entries; i++)
178  sparse_doc_rep.features[i].entry /= norm_const;
179  }
180 
181  return sparse_doc_rep;
182 }
183 
185 {
186  int32_t num_nnz_features = count_distinct_indices(hashed_indices);
187 
188  SGSparseVector<float64_t> sparse_doc_rep(num_nnz_features);
189  index_t sparse_idx = 0;
190  for (index_t i=0; i<hashed_indices.get_num_elements(); i++)
191  {
192  sparse_doc_rep.features[sparse_idx].feat_index = hashed_indices[i];
193  sparse_doc_rep.features[sparse_idx].entry = 1;
194  while ( (i+1<hashed_indices.get_num_elements()) &&
195  (hashed_indices[i+1]==hashed_indices[i]) )
196  {
197  sparse_doc_rep.features[sparse_idx].entry++;
198  i++;
199  }
200  sparse_idx++;
201  }
202  return sparse_doc_rep;
203 }
204 
206  index_t len, SGVector<index_t>& ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
207 {
208  index_t h_idx = 0;
209  ngram_hashes[h_idx++] = hashes[hashes_start] & ((1 << num_bits) -1);
210 
211  for (index_t n=1; n<ngrams; n++)
212  {
213  for (index_t s=0; s<=tokens_to_skip; s++)
214  {
215  if ( n+s > len)
216  break;
217 
218  uint32_t ngram_hash = hashes[hashes_start];
219  for (index_t i=hashes_start+1+s; i<=hashes_start+n+s; i++)
220  ngram_hash = ngram_hash ^ hashes[i % hashes.vlen];
221  ngram_hash = ngram_hash & ((1 << num_bits) - 1);
222  ngram_hashes[h_idx++] = ngram_hash;
223  }
224  }
225  return h_idx;
226 }
227 
229 {
230  CMath::qsort(hashed_indices.get_array(), hashed_indices.get_num_elements());
231 
233  int32_t num_nnz_features = 0;
234  for (index_t i=0; i<hashed_indices.get_num_elements(); i++)
235  {
236  num_nnz_features++;
237  while ( (i+1<hashed_indices.get_num_elements()) &&
238  (hashed_indices[i+1]==hashed_indices[i]) )
239  {
240  i++;
241  }
242  }
243  return num_nnz_features;
244 }
245 
247 {
248  should_normalize = normalize;
249 }
250 
251 void CHashedDocConverter::set_k_skip_n_grams(int32_t k, int32_t n)
252 {
253  tokens_to_skip = k;
254  ngrams = n;
255 }
256 }
virtual const char * get_name() const =0
class Converter used to convert data
Definition: Converter.h:26
SGVector< ST > get_feature_vector(int32_t num)
void init(CTokenizer *tzer, int32_t d, bool normalize, int32_t n_grams, int32_t skips)
int32_t index_t
Definition: common.h:62
virtual void set_text(SGVector< char > txt)
Definition: Tokenizer.cpp:17
virtual int32_t get_num_vectors() const
static index_t generate_ngram_hashes(SGVector< uint32_t > &hashes, index_t hashes_start, index_t len, SGVector< index_t > &ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
int32_t count_distinct_indices(CDynamicArray< uint32_t > &hashed_indices)
virtual int32_t get_num_vectors() const =0
T * get_array() const
Definition: DynamicArray.h:408
#define SG_ERROR(...)
Definition: SGIO.h:129
Parameter * m_parameters
Definition: SGObject.h:546
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
Template Dynamic array class that creates an array that can be used like a list or an array...
Definition: DynArray.h:22
#define SG_REF(x)
Definition: SGObject.h:54
static uint32_t MurmurHash3(uint8_t *data, int32_t len, uint32_t seed)
Definition: Hash.cpp:366
static void qsort(T *output, int32_t size)
Definition: Math.h:1313
int32_t size() const
Definition: SGVector.h:113
void add(bool *param, const char *name, const char *description="")
Definition: Parameter.cpp:37
virtual const char * get_name() const
index_t vlen
Definition: SGVector.h:494
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:115
SGSparseVector< float64_t > create_hashed_representation(CDynamicArray< uint32_t > &hashed_indices)
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:29
double float64_t
Definition: common.h:50
virtual bool has_next()=0
SGSparseVectorEntry< T > * features
#define SG_UNREF(x)
Definition: SGObject.h:55
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
The class CDelimiterTokenizer is used to tokenize a SGVector into tokens using custom chars as ...
virtual index_t next_token_idx(index_t &start)=0
The class Features is the base class of all feature objects.
Definition: Features.h:68
void set_k_skip_n_grams(int32_t k, int32_t n)
int32_t get_num_elements() const
Definition: DynamicArray.h:200
#define SG_ADD(...)
Definition: SGObject.h:84
static float32_t sqrt(float32_t x)
Definition: Math.h:459
virtual CFeatures * apply(CFeatures *features)
void set_normalization(bool normalize)
static int32_t pow(bool x, int32_t n)
Definition: Math.h:535

SHOGUN Machine Learning Toolbox - Documentation