20 using namespace shogun;
26 init(NULL, 16,
false, 1, 0);
32 init(NULL, hash_bits, normalize, n_grams, skips);
36 int32_t hash_bits,
bool normalize, int32_t n_grams, int32_t skips) :
CConverter()
38 init(tzer, hash_bits, normalize, n_grams, skips);
46 void CHashedDocConverter::init(
CTokenizer* tzer, int32_t hash_bits,
bool normalize,
47 int32_t n_grams, int32_t skips)
67 SG_ADD(&
ngrams,
"ngrams",
"Number of consecutive tokens",
79 return "HashedDocConverter";
85 if (strcmp(features->
get_name(),
"StringFeatures")!=0)
86 SG_ERROR(
"CHashedConverter::apply() : CFeatures object passed is not of type CStringFeatures.");
95 matrix[vec_idx] =
apply(doc);
105 const int32_t array_size = 1024*1024;
110 const int32_t seed = 0xdeadbeaf;
116 while (n<ngrams-1+tokens_to_skip && tokenizer->has_next())
120 end-token_start, seed);
130 end-token_start, seed);
136 hashed_indices.append_element(ngram_indices->
get_element(i));
150 hashed_indices.append_element(ngram_indices->
get_element(i));
168 return sparse_doc_rep;
182 (hashed_indices[i+1]==hashed_indices[i]) )
189 return sparse_doc_rep;
207 for (
index_t i=1+s; i<=n+s; i++)
209 ngram_hash = ngram_hash & ((1 <<
num_bits) - 1);
220 int32_t num_nnz_features = 0;
225 (hashed_indices[i+1]==hashed_indices[i]) )
230 return num_nnz_features;