24     if ( (n_grams==1 && skips!=0) || (skips<0))
 
   27     init(hash_bits, docs, tzer, normalize, n_grams, skips);
 
   43     CTokenizer* tzer, 
bool normalize, int32_t n_grams, int32_t skips)
 
   59     SG_ADD(&
ngrams, 
"ngrams", 
"Number of tokens to combine for quadratic feature support",
 
   93     SGVector<char> sv2 = hddf->doc_collection->get_feature_vector(vec_idx2);
 
  102     hddf->doc_collection->free_feature_vector(sv2, vec_idx2);
 
  124     int32_t len = hashes.
vlen - 1;
 
  134     const int32_t seed = 0xdeadbeaf;
 
  137     while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
 
  141         hashes[hashes_end++] = token_hash;
 
  149         hashes[hashes_end] = token_hash;
 
  155             result += vec2[hashed_indices[i]];
 
  159         if (hashes_end==hashes.
vlen)
 
  161         if (hashes_start==hashes.
vlen)
 
  167         while (hashes_start!=hashes_end)
 
  173             for (
index_t i=0; i<max_idx; i++)
 
  174                 result += vec2[hashed_indices[i]];
 
  177             if (hashes_start==hashes.
vlen)
 
  187     float64_t* vec2, int32_t vec2_len, 
bool abs_val)
 
  211     const int32_t seed = 0xdeadbeaf;
 
  214     while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
 
  218         hashes[hashes_end++] = token_hash;
 
  225         hashes[hashes_end] = token_hash;
 
  231             vec2[hashed_indices[i]] += value;
 
  235         if (hashes_end==hashes.vlen)
 
  237         if (hashes_start==hashes.vlen)
 
  243         while (hashes_start!=hashes_end)
 
  249             for (
index_t i=0; i<max_idx; i++)
 
  250                 vec2[hashed_indices[i]] += value;
 
  253             if (hashes_start==hashes.vlen)
 
  263         int32_t length, int32_t num_bits, uint32_t seed)
 
  266     return hash & ((1 << 
num_bits) - 1);
 
  278     int32_t num_nnz_features = sv.
size();
 
  280     return num_nnz_features;
 
  302     return "HashedDocDotFeatures";
 
virtual const char * get_name() const =0
SGVector< ST > get_feature_vector(int32_t num)
virtual bool get_next_feature(int32_t &index, float64_t &value, void *iterator)
T sparse_dot(const SGSparseVector< T > &v)
virtual void free_feature_iterator(void *iterator)
virtual void set_text(SGVector< char > txt)
virtual int32_t get_num_vectors() const 
static index_t generate_ngram_hashes(SGVector< uint32_t > &hashes, index_t hashes_start, index_t len, SGVector< index_t > &ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
virtual EFeatureClass get_feature_class() const 
#define SG_NOTIMPLEMENTED
virtual int32_t get_dim_feature_space() const 
virtual float64_t dense_dot(int32_t vec_idx1, const float64_t *vec2, int32_t vec2_len)
virtual void * get_feature_iterator(int32_t vector_index)
virtual const char * get_name() const 
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
Features that support dot products among other operations. 
EFeatureClass
shogun feature class 
static uint32_t MurmurHash3(uint8_t *data, int32_t len, uint32_t seed)
virtual float64_t dense_dot_sgvec(int32_t vec_idx1, const SGVector< float64_t > vec2)
This class can be used to convert a document collection contained in a CStringFeatures object w...
virtual void add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t *vec2, int32_t vec2_len, bool abs_val=false)
CStringFeatures< char > * doc_collection
virtual ~CHashedDocDotFeatures()
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
A File access base class. 
virtual CFeatures * duplicate() const 
virtual float64_t dot(int32_t vec_idx1, CDotFeatures *df, int32_t vec_idx2)
virtual bool has_next()=0
static uint32_t calculate_token_hash(char *token, int32_t length, int32_t num_bits, uint32_t seed)
void set_doc_collection(CStringFeatures< char > *docs)
virtual int32_t get_num_vectors() const 
EFeatureType
shogun feature type 
CHashedDocDotFeatures(int32_t hash_bits=0, CStringFeatures< char > *docs=NULL, CTokenizer *tzer=NULL, bool normalize=true, int32_t n_grams=1, int32_t skips=0, int32_t size=0)
All classes and functions are contained in the shogun namespace. 
The class CDelimiterTokenizer is used to tokenize a SGVector into tokens using custom chars as ...
virtual index_t next_token_idx(index_t &start)=0
The class Features is the base class of all feature objects. 
virtual CTokenizer * get_copy()=0
virtual EFeatureType get_feature_type() const 
static float32_t sqrt(float32_t x)
virtual CFeatures * apply(CFeatures *features)
This class can be used to provide on-the-fly vectorization of a document collection. As in the standard Bag-of-Words representation, this class considers each document as a collection of tokens, which are then hashed into a new feature space of a specified dimension. This class is very flexible: it allows the user to specify the tokenizer used to tokenize each document, whether the results should be normalized with regard to the sqrt of the document size, and whether different tokens should be combined. The latter implements a k-skip n-grams approach, meaning that you can combine up to n tokens while skipping up to k. E.g., for the tokens ["a", "b", "c", "d"], with n_grams = 2 and skips = 2, one would get the following combinations: ["a", "ab", "ac" (skipped 1), "ad" (skipped 2), "b", "bc", "bd" (skipped 1), "c", "cd", "d"]. 
virtual int32_t get_nnz_features_for_vector(int32_t num)
static int32_t pow(bool x, int32_t n)