SHOGUN
v3.0.0
Main Page
Related Pages
Modules
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
src
shogun
converter
HashedDocConverter.h
Go to the documentation of this file.
1
/*
2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 3 of the License, or
5
* (at your option) any later version.
6
*
7
* Written (W) 2013 Evangelos Anagnostopoulos
8
* Copyright (C) 2013 Evangelos Anagnostopoulos
9
*/
10
11
#ifndef _HASHEDDOCCONVERTER__H__
12
#define _HASHEDDOCCONVERTER__H__
13
14
#include <
shogun/converter/Converter.h
>
15
#include <
shogun/features/Features.h
>
16
#include <
shogun/lib/Tokenizer.h
>
17
#include <
shogun/features/SparseFeatures.h
>
18
19
namespace
shogun
20
{
21
// Forward declarations of Shogun types; one declaration per line.
class CFeatures;
class CTokenizer;
class CConverter;
template <class T> class CSparseFeatures;
25
37
class
CHashedDocConverter
:
public
CConverter
38
{
39
public
:
41
CHashedDocConverter
();
42
51
CHashedDocConverter
(int32_t hash_bits,
bool
normalize =
false
, int32_t n_grams = 1, int32_t skips = 0);
52
61
CHashedDocConverter
(
CTokenizer
* tzer, int32_t hash_bits,
bool
normalize =
false
, int32_t n_grams = 1,
62
int32_t skips = 0);
63
65
virtual
~CHashedDocConverter
();
66
72
virtual
CFeatures
*
apply
(
CFeatures
* features);
73
79
SGSparseVector<float64_t>
apply
(
SGVector<char>
document);
80
91
static
void
generate_ngram_hashes
(
CDynamicArray<uint32_t>
* hashes,
CDynamicArray<index_t>
* ngram_hashes,
92
int32_t
num_bits
, int32_t
ngrams
, int32_t
tokens_to_skip
);
93
95
virtual
const
char
*
get_name
()
const
;
96
101
void
set_normalization
(
bool
normalize);
102
110
void
set_k_skip_n_grams
(int32_t k, int32_t n);
111
protected
:
112
114
void
init
(
CTokenizer
* tzer, int32_t d,
bool
normalize, int32_t n_grams, int32_t skips);
115
122
int32_t
count_distinct_indices
(
CDynamicArray<uint32_t>
& hashed_indices);
123
130
SGSparseVector<float64_t>
create_hashed_representation
(
CDynamicArray<uint32_t>
& hashed_indices);
131
132
protected
:
133
135
int32_t
num_bits
;
136
138
CTokenizer
*
tokenizer
;
139
141
bool
should_normalize
;
142
144
int32_t
ngrams
;
145
147
int32_t
tokens_to_skip
;
148
};
149
}
150
151
#endif
SHOGUN
Machine Learning Toolbox - Documentation