25 init(NULL, 16,
false, 1, 0);
31 init(NULL, hash_bits, normalize, n_grams, skips);
35 int32_t hash_bits,
bool normalize, int32_t n_grams, int32_t skips) :
CConverter()
37 init(tzer, hash_bits, normalize, n_grams, skips);
45 void CHashedDocConverter::init(
CTokenizer* tzer, int32_t hash_bits,
bool normalize,
46 int32_t n_grams, int32_t skips)
66 SG_ADD(&
ngrams,
"ngrams",
"Number of consecutive tokens",
78 return "HashedDocConverter";
84 if (strcmp(features->
get_name(),
"StringFeatures")!=0)
85 SG_ERROR(
"CHashedConverter::apply() : CFeatures object passed is not of type CStringFeatures.");
94 matrix[vec_idx] =
apply(doc);
104 const int32_t array_size = 1024*1024;
113 int32_t len = cached_hashes.
vlen - 1;
120 const int32_t seed = 0xdeadbeaf;
123 while (hashes_end<ngrams-1+tokens_to_skip && tokenizer->has_next())
127 end-token_start, seed);
128 cached_hashes[hashes_end++] = token_hash;
136 end-token_start, seed);
137 cached_hashes[hashes_end] = token_hash;
143 hashed_indices.append_element(ngram_indices[i]);
147 if (hashes_end==cached_hashes.
vlen)
149 if (hashes_start==cached_hashes.
vlen)
156 while (hashes_start!=hashes_end)
162 for (
index_t i=0; i<max_idx; i++)
163 hashed_indices.append_element(ngram_indices[i]);
166 if (hashes_start==cached_hashes.
vlen)
181 return sparse_doc_rep;
195 (hashed_indices[i+1]==hashed_indices[i]) )
202 return sparse_doc_rep;
209 ngram_hashes[h_idx++] = hashes[hashes_start] & ((1 <<
num_bits) -1);
218 uint32_t ngram_hash = hashes[hashes_start];
219 for (
index_t i=hashes_start+1+s; i<=hashes_start+n+s; i++)
220 ngram_hash = ngram_hash ^ hashes[i % hashes.
vlen];
221 ngram_hash = ngram_hash & ((1 << num_bits) - 1);
222 ngram_hashes[h_idx++] = ngram_hash;
233 int32_t num_nnz_features = 0;
238 (hashed_indices[i+1]==hashed_indices[i]) )
243 return num_nnz_features;
virtual const char * get_name() const =0
class Converter used to convert data
SGVector< ST > get_feature_vector(int32_t num)
virtual ~CHashedDocConverter()
void init(CTokenizer *tzer, int32_t d, bool normalize, int32_t n_grams, int32_t skips)
virtual void set_text(SGVector< char > txt)
virtual int32_t get_num_vectors() const
static index_t generate_ngram_hashes(SGVector< uint32_t > &hashes, index_t hashes_start, index_t len, SGVector< index_t > &ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
int32_t count_distinct_indices(CDynamicArray< uint32_t > &hashed_indices)
virtual int32_t get_num_vectors() const =0
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
Template Dynamic array class that creates an array that can be used like a list or an array...
static uint32_t MurmurHash3(uint8_t *data, int32_t len, uint32_t seed)
static void qsort(T *output, int32_t size)
void add(bool *param, const char *name, const char *description="")
virtual const char * get_name() const
Class SGObject is the base class of all shogun objects.
SGSparseVector< float64_t > create_hashed_representation(CDynamicArray< uint32_t > &hashed_indices)
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
virtual bool has_next()=0
SGSparseVectorEntry< T > * features
All classes and functions are contained in the shogun namespace.
The class CDelimiterTokenizer is used to tokenize a SGVector into tokens using custom chars as ...
SGVector< bool > delimiters
virtual index_t next_token_idx(index_t &start)=0
The class Features is the base class of all feature objects.
void set_k_skip_n_grams(int32_t k, int32_t n)
int32_t get_num_elements() const
static float32_t sqrt(float32_t x)
virtual CFeatures * apply(CFeatures *features)
void set_normalization(bool normalize)
static int32_t pow(bool x, int32_t n)