11 #ifndef _HASHEDDOCCONVERTER__H__
12 #define _HASHEDDOCCONVERTER__H__
53 CHashedDocConverter(int32_t hash_bits,
bool normalize =
false, int32_t n_grams = 1, int32_t skips = 0);
105 virtual const char*
get_name()
const;
124 void init(
CTokenizer* tzer, int32_t d,
bool normalize, int32_t n_grams, int32_t skips);
Class Converter, used to convert data.
virtual ~CHashedDocConverter()
void init(CTokenizer *tzer, int32_t d, bool normalize, int32_t n_grams, int32_t skips)
static index_t generate_ngram_hashes(SGVector< uint32_t > &hashes, index_t hashes_start, index_t len, SGVector< index_t > &ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
int32_t count_distinct_indices(CDynamicArray< uint32_t > &hashed_indices)
Template class SparseFeatures implements sparse matrices.
Template Dynamic array class that creates an array that can be used like a list or an array...
virtual const char * get_name() const
This class can be used to convert a document collection contained in a CStringFeatures object w...
SGSparseVector< float64_t > create_hashed_representation(CDynamicArray< uint32_t > &hashed_indices)
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
All classes and functions are contained in the shogun namespace.
The class Features is the base class of all feature objects.
void set_k_skip_n_grams(int32_t k, int32_t n)
virtual CFeatures * apply(CFeatures *features)
void set_normalization(bool normalize)