SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashedDocConverter.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
11 #ifndef _HASHEDDOCCONVERTER__H__
12 #define _HASHEDDOCCONVERTER__H__
13 
16 #include <shogun/lib/Tokenizer.h>
18 
19 namespace shogun
20 {
21 class CFeatures;
22 class CTokenizer;
23 class CConverter;
24 template<class T> class CSparseFeatures;
25 
/* Body of class CHashedDocConverter.
 * NOTE(review): the class head (name and base class) and every original doc
 * comment are elided from this listing; the comments below describe only what
 * the visible declarations show, and mark anything else as an assumption.
 */
38 {
39 public:
42 

/** Constructor without an explicit tokenizer argument.
 *
 * @param hash_bits integer setting; presumably the number of bits of the
 *        hashed target space (cf. member num_bits) - TODO confirm
 * @param normalize boolean flag, defaults to false
 * @param n_grams integer setting, defaults to 1
 * @param skips integer setting, defaults to 0
 */
51  CHashedDocConverter(int32_t hash_bits, bool normalize = false, int32_t n_grams = 1, int32_t skips = 0);
52 

/** Constructor that additionally receives a tokenizer.
 *
 * @param tzer pointer to a CTokenizer; ownership/ref-counting behaviour is
 *        not visible in this header - verify in the implementation
 * @param hash_bits see the constructor above
 * @param normalize boolean flag, defaults to false
 * @param n_grams integer setting, defaults to 1
 * @param skips integer setting, defaults to 0
 */
61  CHashedDocConverter(CTokenizer* tzer, int32_t hash_bits, bool normalize = false, int32_t n_grams = 1,
62  int32_t skips = 0);
63 

/** Virtual destructor. */
65  virtual ~CHashedDocConverter();
66 

/** Takes a CFeatures pointer and returns a CFeatures pointer.
 * NOTE(review): presumably the converter entry point inherited from the
 * base class; the concrete input/output feature types are not visible here.
 *
 * @param features input features object
 * @return resulting features object
 */
72  virtual CFeatures* apply(CFeatures* features);
73 
80 

/** Static helper operating on two dynamic arrays.
 * NOTE(review): presumably combines the token hashes in 'hashes' into
 * n-gram hashes stored in 'ngram_hashes' - confirm against the implementation.
 *
 * @param hashes array of uint32_t values
 * @param ngram_hashes array of index_t values
 * @param num_bits integer setting (same name as the member num_bits)
 * @param ngrams integer setting (same name as the member ngrams)
 * @param tokens_to_skip integer setting (same name as the member tokens_to_skip)
 */
91  static void generate_ngram_hashes(CDynamicArray<uint32_t>* hashes, CDynamicArray<index_t>* ngram_hashes,
92  int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip);
93 

/** @return the object's name as a C string */
95  virtual const char* get_name() const;
96 

/** Setter for the normalization flag.
 *
 * @param normalize new value of the flag
 */
101  void set_normalization(bool normalize);
102 

/** Setter taking two integers k and n.
 * NOTE(review): going by the method name, k is presumably the number of
 * skips and n the n-gram size - TODO confirm.
 *
 * @param k first setting
 * @param n second setting
 */
110  void set_k_skip_n_grams(int32_t k, int32_t n);
111 protected:
112 

/** Initialisation helper whose parameters mirror the constructor arguments
 * (here the bit count is called d).
 * NOTE(review): presumably called by both constructors - verify.
 */
114  void init(CTokenizer* tzer, int32_t d, bool normalize, int32_t n_grams, int32_t skips);
115 

/** Takes an array of hashed indices by reference and returns an int32_t.
 * NOTE(review): by its name, the return value is the number of distinct
 * entries; whether the array is modified (e.g. sorted) is not visible here.
 *
 * @param hashed_indices array of uint32_t indices
 * @return an integer count
 */
122  int32_t count_distinct_indices(CDynamicArray<uint32_t>& hashed_indices);
123 
131 

/* NOTE(review): second 'protected:' label; it appears redundant with the one
 * on listing line 111 unless an elided line changed the access level. */
132 protected:
133 

/* NOTE(review): presumably the bit count received as hash_bits / d - confirm. */
135  int32_t num_bits;
136 
139 
142 

/* NOTE(review): presumably the n-gram setting received as n_grams - confirm. */
144  int32_t ngrams;
145 

/* NOTE(review): presumably the skip setting received as skips - confirm. */
147  int32_t tokens_to_skip;
148 };
149 }
150 
151 #endif

SHOGUN Machine Learning Toolbox - Documentation