SHOGUN  4.1.0
 全部  命名空间 文件 函数 变量 类型定义 枚举 枚举值 友元 宏定义  
StreamingHashedDocDotFeatures.h
浏览该文件的文档.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 #ifndef _STREAMING_HASHEDDOCDOTFEATURES__H__
11 #define _STREAMING_HASHEDDOCDOTFEATURES__H__
12 
13 #include <shogun/lib/config.h>
14 
17 #include <shogun/lib/Tokenizer.h>
21 
22 namespace shogun
23 {
/* Forward declarations of shogun classes. CStreamingDotFeatures and
 * CTokenizer appear in the signatures below; CHashedDocConverter is not
 * used in any declaration visible in this listing. */
24 class CStreamingDotFeatures;
25 class CTokenizer;
26 class CHashedDocConverter;
27 
43 {
44 public:
47 
/** Constructor taking a streaming file, a labelled flag, a size, a
 * tokenizer and a bit count.
 *
 * @param file streaming file object (CStreamingFile)
 * @param is_labelled presumably whether the streamed examples carry labels - confirm in the implementation
 * @param size presumably the size handed to the input parser - TODO confirm meaning and unit
 * @param tzer tokenizer object (CTokenizer); presumably used to split documents into tokens - confirm
 * @param bits defaults to 20; presumably the number of hash bits stored in num_bits - confirm
 */
59  CStreamingHashedDocDotFeatures(CStreamingFile* file, bool is_labelled, int32_t size,
60  CTokenizer* tzer, int32_t bits=20);
61 
79  int32_t bits=20, float64_t* lab=NULL);
80 
83 
/** Virtual function taking another streaming dot-features object and
 * returning a float32_t.
 *
 * @param df the other CStreamingDotFeatures object
 * @return presumably the dot product between the current example of this
 * object and that of df - confirm in the implementation
 */
90  virtual float32_t dot(CStreamingDotFeatures* df);
91 
/** Virtual function taking a read-only float32_t array and its length.
 *
 * @param vec2 dense vector (not modified; declared const)
 * @param vec2_len presumably the number of elements in vec2 - confirm
 * @return presumably the dot product of the current example with vec2 -
 * confirm in the implementation
 */
97  virtual float32_t dense_dot(const float32_t* vec2, int32_t vec2_len);
98 
/** Virtual function taking a scalar, a writable float32_t array, its
 * length and an optional flag.
 *
 * Presumably adds alpha times the current feature vector to vec2 -
 * confirm in the implementation.
 *
 * @param alpha scalar factor
 * @param vec2 dense vector (non-const pointer, so it may be written to)
 * @param vec2_len presumably the number of elements in vec2 - confirm
 * @param abs_val defaults to false; presumably selects use of absolute
 * values - confirm
 */
106  virtual void add_to_dense_vec(float32_t alpha, float32_t* vec2,
107  int32_t vec2_len, bool abs_val=false);
108 
/** Const virtual getter returning an int32_t.
 *
 * @return presumably the dimensionality of the (hashed) feature space -
 * confirm how it relates to num_bits in the implementation
 */
116  virtual int32_t get_dim_feature_space() const;
117 
/** Const virtual getter returning a C string.
 *
 * @return presumably the name of this class - confirm in the implementation
 */
123  virtual const char* get_name() const;
124 
/** Const virtual getter returning an int32_t.
 *
 * @return presumably the number of feature vectors; what this means for a
 * streaming source is not visible here - TODO confirm
 */
130  virtual int32_t get_num_vectors() const;
131 
/** Const virtual function returning a pointer to a CFeatures object.
 *
 * @return presumably a copy of this feature object; ownership of the
 * returned pointer is not visible here - TODO confirm
 */
137  virtual CFeatures* duplicate() const;
138 
/** Virtual function with no parameters and no return value.
 *
 * Presumably configures the parser to read unlabelled examples (compare
 * set_vector_and_label_reader) - confirm in the implementation.
 */
148  virtual void set_vector_reader();
149 
/** Virtual function with no parameters and no return value.
 *
 * Presumably configures the parser to read examples together with their
 * labels (compare set_vector_reader) - confirm in the implementation.
 */
159  virtual void set_vector_and_label_reader();
160 
/** Const virtual getter for the feature type.
 *
 * @return an EFeatureType value; which one is not visible in this header
 */
166  virtual EFeatureType get_feature_type() const;
167 
/** Const virtual getter for the feature class.
 *
 * @return an EFeatureClass value; which one is not visible in this header
 */
173  virtual EFeatureClass get_feature_class() const;
174 
/** Virtual function with no parameters and no return value.
 *
 * Presumably starts the `parser` member - confirm in the implementation.
 */
179  virtual void start_parser();
180 
/** Virtual function with no parameters and no return value.
 *
 * Presumably stops the `parser` member - confirm in the implementation.
 */
184  virtual void end_parser();
185 
/** Virtual getter returning a float64_t.
 *
 * @return presumably the label of the current example; behaviour for
 * unlabelled data is not visible here - TODO confirm
 */
193  virtual float64_t get_label();
194 
/** Virtual function with no parameters, returning a bool.
 *
 * @return presumably true when a new example was fetched and false when
 * none is left - confirm in the implementation
 */
200  virtual bool get_next_example();
201 
/** Virtual function with no parameters and no return value.
 *
 * Presumably releases the example obtained by get_next_example() -
 * confirm in the implementation.
 */
207  virtual void release_example();
208 
/** Virtual getter returning an int32_t. Unlike get_dim_feature_space()
 * and get_num_vectors(), it is not declared const.
 *
 * @return presumably the number of features - TODO confirm how this
 * differs from get_dim_feature_space()
 */
214  virtual int32_t get_num_features();
215 
221 
/** Non-virtual setter taking a bool.
 *
 * @param normalize presumably enables or disables normalization of the
 * hashed document vectors - confirm in the implementation
 */
226  void set_normalization(bool normalize);
227 
/** Non-virtual setter taking two int32_t values.
 *
 * @param k presumably the number of allowed skips (compare the `skips`
 * parameter of init) - confirm
 * @param n presumably the n-gram size (compare the `n_grams` parameter
 * of init) - confirm
 */
235  void set_k_skip_n_grams(int32_t k, int32_t n);
236 
237 private:
/** Private helper. Its first five parameters have the same names and
 * types as those of the constructor above; presumably the constructors
 * call it to set up the object - confirm in the implementation.
 *
 * @param file streaming file object
 * @param is_labelled presumably whether the examples carry labels - confirm
 * @param size presumably the size handed to the input parser - confirm
 * @param tzer tokenizer object
 * @param bits presumably the number of hash bits - confirm
 * @param normalize presumably whether to normalize - confirm
 * @param n_grams presumably the n-gram size - confirm
 * @param skips presumably the number of allowed skips - confirm
 */
238  void init(CStreamingFile* file, bool is_labelled, int32_t size, CTokenizer* tzer,
239  int32_t bits, bool normalize, int32_t n_grams, int32_t skips);
240 
241 protected:
242 
/** Protected int32_t member; presumably the number of bits of the hash
 * (compare the `bits` constructor parameter) - confirm. */
244  int32_t num_bits;
245 
248 
251 
254 
/** Protected input parser member, instantiated for char data;
 * presumably driven by start_parser() / end_parser() - confirm. */
256  CInputParser<char> parser;
257 
260 };
261 }
262 
263 #endif // _STREAMING_HASHEDDOCDOTFEATURES__H__
This class implements streaming features for a document collection. Like in the standard Bag-of-Words...
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
A Streaming File access class.
Definition: StreamingFile.h:34
virtual void add_to_dense_vec(float32_t alpha, float32_t *vec2, int32_t vec2_len, bool abs_val=false)
This class can be used to convert a document collection contained in a CStringFeatures object w...
virtual float32_t dense_dot(const float32_t *vec2, int32_t vec2_len)
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:29
double float64_t
Definition: common.h:50
Streaming features that support dot products among other operations.
float float32_t
Definition: common.h:49
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
The class Features is the base class of all feature objects.
Definition: Features.h:68
virtual float32_t dot(CStreamingDotFeatures *df)

SHOGUN 机器学习工具包 - 项目文档