SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
HashedDocDotFeatures.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
11 #ifndef _HASHEDDOCDOTFEATURES__H__
12 #define _HASHEDDOCDOTFEATURES__H__
13 
14 #include <shogun/lib/config.h>
15 
19 #include <shogun/lib/Tokenizer.h>
20 
21 namespace shogun {
22 template<class ST> class CStringFeatures;
23 template<class ST> class SGMatrix;
24 class CDotFeatures;
25 class CHashedDocConverter;
26 class CTokenizer;
27 
39 {
40 public:
41 
52  CHashedDocDotFeatures(int32_t hash_bits=0, CStringFeatures<char>* docs=NULL,
53  CTokenizer* tzer=NULL, bool normalize=true, int32_t n_grams=1, int32_t skips=0, int32_t size=0);
54 
57 
63 
65  virtual ~CHashedDocDotFeatures();
66 
74  virtual int32_t get_dim_feature_space() const;
75 
83  virtual float64_t dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2);
84 
90  virtual float64_t dense_dot_sgvec(int32_t vec_idx1, const SGVector<float64_t> vec2);
91 
98  virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len);
99 
108  virtual void add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val=false);
109 
117  virtual int32_t get_nnz_features_for_vector(int32_t num);
118 
129  virtual void* get_feature_iterator(int32_t vector_index);
130 
142  virtual bool get_next_feature(int32_t& index, float64_t& value, void* iterator);
143 
150  virtual void free_feature_iterator(void* iterator);
151 
157 
158  virtual const char* get_name() const;
159 
164  virtual CFeatures* duplicate() const;
165 
170  virtual EFeatureType get_feature_type() const;
171 
176  virtual EFeatureClass get_feature_class() const;
177 
182  virtual int32_t get_num_vectors() const;
183 
192  static uint32_t calculate_token_hash(char* token, int32_t length,
193  int32_t num_bits, uint32_t seed);
194 
195 private:
196  void init(int32_t hash_bits, CStringFeatures<char>* docs, CTokenizer* tzer,
197  bool normalize, int32_t n_grams, int32_t skips);
198 
199 protected:
202 
204  int32_t num_bits;
205 
208 
211 
213  int32_t ngrams;
214 
216  int32_t tokens_to_skip;
217 };
218 }
219 
220 #endif
virtual bool get_next_feature(int32_t &index, float64_t &value, void *iterator)
virtual void free_feature_iterator(void *iterator)
virtual EFeatureClass get_feature_class() const
virtual int32_t get_dim_feature_space() const
virtual float64_t dense_dot(int32_t vec_idx1, const float64_t *vec2, int32_t vec2_len)
virtual void * get_feature_iterator(int32_t vector_index)
virtual const char * get_name() const
Features that support dot products among other operations.
Definition: DotFeatures.h:44
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
virtual float64_t dense_dot_sgvec(int32_t vec_idx1, const SGVector< float64_t > vec2)
virtual void add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t *vec2, int32_t vec2_len, bool abs_val=false)
CStringFeatures< char > * doc_collection
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:29
double float64_t
Definition: common.h:50
A File access base class.
Definition: File.h:34
virtual CFeatures * duplicate() const
virtual float64_t dot(int32_t vec_idx1, CDotFeatures *df, int32_t vec_idx2)
static uint32_t calculate_token_hash(char *token, int32_t length, int32_t num_bits, uint32_t seed)
void set_doc_collection(CStringFeatures< char > *docs)
virtual int32_t get_num_vectors() const
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
CHashedDocDotFeatures(int32_t hash_bits=0, CStringFeatures< char > *docs=NULL, CTokenizer *tzer=NULL, bool normalize=true, int32_t n_grams=1, int32_t skips=0, int32_t size=0)
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
The class Features is the base class of all feature objects.
Definition: Features.h:68
virtual EFeatureType get_feature_type() const
This class can be used to provide on-the-fly vectorization of a document collection. As in the standard Bag-of-Words representation, this class considers each document as a collection of tokens, which are then hashed into a new feature space of a specified dimension. This class is very flexible: it allows the user to specify the tokenizer used to tokenize each document, to specify whether the results should be normalized with regard to the square root of the document size, and to specify whether different tokens should be combined. The latter implements a k-skip n-grams approach, meaning that you can combine up to n tokens while skipping up to k. E.g., for the tokens ["a", "b", "c", "d"] with n_grams = 2 and skips = 2, one would get the following combinations: ["a", "ab", "ac" (skipped 1), "ad" (skipped 2), "b", "bc", "bd" (skipped 1), "c", "cd", "d"].
virtual int32_t get_nnz_features_for_vector(int32_t num)

SHOGUN Machine Learning Toolbox - Documentation