SHOGUN
v3.0.0
Main Page
Related Pages
Modules
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
src
shogun
lib
NGramTokenizer.cpp
Go to the documentation of this file.
1
/*
2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 3 of the License, or
5
* (at your option) any later version.
6
*
7
* Written (W) 2013 Evangelos Anagnostopoulos
8
* Copyright (C) 2013 Evangelos Anagnostopoulos
9
*/
10
11
#include <
shogun/lib/NGramTokenizer.h
>
12
#include <
shogun/base/Parameter.h
>
13
14
namespace
shogun
15
{
16
17
/** Constructs a tokenizer whose tokens span n characters each.
 *
 * @param ns size of the n-grams; stored in the member n
 */
CNGramTokenizer::CNGramTokenizer(int32_t ns) : CTokenizer()
{
	// Begin at index 0 of whatever text gets set later.
	last_idx = 0;
	n = ns;

	// Register n and last_idx via SG_ADD.
	init();
}
23
24
/** Copy constructor.
 *
 * Takes over the text, the n-gram size and the current scan position of
 * orig, so that the copy continues from the same index as the original.
 *
 * @param orig tokenizer to copy from
 */
CNGramTokenizer::CNGramTokenizer(const CNGramTokenizer& orig)
: CTokenizer(orig)
{
	// The qualified base-class call bypasses CNGramTokenizer::set_text, which
	// is the only place that resets last_idx; it is therefore set explicitly
	// below.
	CTokenizer::set_text(orig.text);
	n = orig.n;

	// Previously last_idx was never assigned on this path, so has_next() and
	// next_token_idx() on a copy read an indeterminate value.
	last_idx = orig.last_idx;

	// Register n and last_idx via SG_ADD.
	init();
}
31
32
void
CNGramTokenizer::init()
33
{
34
SG_ADD
(&
n
,
"n"
,
"Size of n-grams"
,
35
MS_NOT_AVAILABLE
);
36
SG_ADD
(&
last_idx
,
"last_idx"
,
"Index of last token"
,
37
MS_NOT_AVAILABLE
);
38
}
39
40
/** Sets a new text and restarts tokenization from its beginning.
 *
 * @param txt the text to tokenize; forwarded to CTokenizer::set_text
 */
void CNGramTokenizer::set_text(SGVector<char> txt)
{
	// A new text invalidates the old scan position.
	last_idx = 0;
	CTokenizer::set_text(txt);
}
45
46
const
char
*
CNGramTokenizer::get_name
()
const
47
{
48
return
"NGramTokenizer"
;
49
}
50
51
bool
CNGramTokenizer::has_next
()
52
{
53
return
last_idx
<=
text
.
size
()-
n
;
54
}
55
56
/** Yields the bounds of the next n-gram and advances by one position.
 *
 * @param start output: receives the start index of the token (the value of
 *        last_idx before it is incremented)
 * @return start + n; given the bound checked in has_next() this is at most
 *         text.size(), i.e. presumably an exclusive end index
 */
index_t CNGramTokenizer::next_token_idx(index_t& start)
{
	start = last_idx;
	// Consecutive tokens overlap: the next one begins one index later.
	++last_idx;
	return start + n;
}
61
62
/** Creates a new tokenizer with the same n-gram size.
 *
 * Only n is passed on; the text and the current position are not copied
 * (the new object is built by the int32_t constructor, which sets
 * last_idx to 0).
 *
 * @return pointer to the newly allocated tokenizer
 */
CNGramTokenizer* CNGramTokenizer::get_copy()
{
	return new CNGramTokenizer(n);
}
67
}
SHOGUN
Machine Learning Toolbox - Documentation