SHOGUN
v3.0.0
Main Page
Related Pages
Modules
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
src
shogun
lib
DelimiterTokenizer.cpp
Go to the documentation of this file.
1
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2013 Evangelos Anagnostopoulos
 * Copyright (C) 2013 Evangelos Anagnostopoulos
 */
10
11
#include <
shogun/base/Parameter.h
>
12
#include <
shogun/lib/DelimiterTokenizer.h
>
13
14
namespace
shogun
15
{
16
17
/** Construct a tokenizer with an empty delimiter table.
 *
 * The delimiter table is created with 256 entries; elsewhere in this file it
 * is indexed by the byte value of a text character.
 *
 * @param skip_delimiters whether runs of consecutive delimiters are skipped
 *        when searching for the next token
 */
CDelimiterTokenizer::CDelimiterTokenizer(bool skip_delimiters)
	: delimiters(256)
{
	skip_consecutive_delimiters = skip_delimiters;
	// tokenization starts at the beginning of the text
	last_idx = 0;

	// registers the parameters and zero-fills the delimiter table
	init();
}
23
24
/** Copy constructor.
 *
 * Copies the text, the delimiter table, the current position and the
 * skip-consecutive-delimiters flag of another tokenizer.
 *
 * init() is called BEFORE any state is copied: it zero-fills the delimiter
 * table, so calling it after the table had been taken over from orig (as was
 * done previously) wiped the copied delimiters. The table is copied element
 * by element into storage owned by this object so that the two tokenizers do
 * not depend on each other's delimiter buffer.
 *
 * @param orig tokenizer to copy from; its delimiter table is assumed to hold
 *        256 entries, as allocated by the regular constructor
 */
CDelimiterTokenizer::CDelimiterTokenizer(const CDelimiterTokenizer& orig)
	: delimiters(256)
{
	// register parameters and start from an all-false delimiter table
	init();

	CTokenizer::set_text(orig.text);

	for (index_t i=0; i<256; i++)
		delimiters[i] = orig.delimiters[i];

	// previously left uninitialized by this constructor
	last_idx = orig.last_idx;
	skip_consecutive_delimiters = orig.skip_consecutive_delimiters;
}
30
31
void
CDelimiterTokenizer::init()
32
{
33
SG_ADD
(&
last_idx
,
"last_idx"
,
"Index of last token"
,
34
MS_NOT_AVAILABLE
);
35
SG_ADD
(&
skip_consecutive_delimiters
,
"skip_consecutive_delimiters"
,
36
"Whether to skip consecutive delimiters or not"
,
MS_NOT_AVAILABLE
);
37
SGVector<bool>::fill_vector
(
delimiters
, 256, 0);
38
}
39
40
/** Set the text to tokenize and restart tokenization from its beginning.
 *
 * @param txt text handed on to CTokenizer::set_text
 */
void CDelimiterTokenizer::set_text(SGVector<char> txt)
{
	// a new text invalidates the previous position
	last_idx = 0;
	CTokenizer::set_text(txt);
}
45
46
const
char
*
CDelimiterTokenizer::get_name
()
const
47
{
48
return
"DelimiterTokenizer"
;
49
}
50
51
bool
CDelimiterTokenizer::has_next
()
52
{
53
if
(
skip_consecutive_delimiters
)
54
{
55
for
(
index_t
i=
last_idx
; i<
text
.
size
(); i++)
56
{
57
if
(!
delimiters
[(uint8_t)
text
[i]])
58
return
true
;
59
}
60
return
false
;
61
}
62
else
63
return
last_idx
<
text
.
size
();
64
}
65
66
void
CDelimiterTokenizer::init_for_whitespace
()
67
{
68
clear_delimiters
();
69
delimiters
[
' '
] = 1;
70
delimiters
[
'\t'
] = 1;
71
}
72
73
void
CDelimiterTokenizer::clear_delimiters
()
74
{
75
memset(
delimiters
, 0,
sizeof
(
delimiters
));
76
}
77
78
/** Locate the next token in the text.
 *
 * On return the token occupies the index range [start, returned value).
 * The internal position is moved one past the returned index, i.e. past
 * the delimiter that terminated the token.
 *
 * If consecutive delimiters are skipped, leading delimiters are stepped
 * over first. Otherwise a delimiter at the current position produces an
 * empty token (start equals the returned index).
 *
 * All accesses to the text are bounds-checked. When no token is left
 * (e.g. the method is called without a preceding successful has_next()),
 * an empty token at the end of the text is reported instead of reading
 * past the end of the buffer.
 *
 * @param start output: index of the first character of the token
 * @return index one past the last character of the token
 */
index_t CDelimiterTokenizer::next_token_idx(index_t& start)
{
	const index_t text_len = text.size();
	start = last_idx;

	if (skip_consecutive_delimiters)
	{
		// step over leading delimiters without running off the text
		while (start < text_len && delimiters[(uint8_t) text[start]])
			start++;
	}

	if (start >= text_len)
	{
		// nothing left to tokenize: report an empty token at the end
		start = text_len;
		last_idx = text_len;
		return text_len;
	}

	if (!delimiters[(uint8_t) text[start]])
	{
		// extend the token up to the next delimiter or the end of the text
		for (last_idx = start+1; last_idx < text_len; last_idx++)
		{
			if (delimiters[(uint8_t) text[last_idx]])
				break;
		}
	}

	// return the token end and move past the terminating delimiter
	return last_idx++;
}
99
100
/** Create a new tokenizer with the same configuration as this one.
 *
 * The delimiter table and the skip-consecutive-delimiters flag are
 * duplicated; the text and the current position are not.
 *
 * The table is copied entry by entry into the new object's own storage
 * rather than assigned as a whole vector, so that later changes to the
 * delimiters of one tokenizer cannot show up in the other.
 *
 * @return newly allocated tokenizer (created with new)
 */
CDelimiterTokenizer* CDelimiterTokenizer::get_copy()
{
	CDelimiterTokenizer* t = new CDelimiterTokenizer();

	// the constructor allocates a 256-entry table for t
	for (index_t i=0; i<256; i++)
		t->delimiters[i] = delimiters[i];

	t->skip_consecutive_delimiters = skip_consecutive_delimiters;
	return t;
}
107
108
void
CDelimiterTokenizer::set_skip_delimiters
(
bool
skip_delimiters)
109
{
110
skip_consecutive_delimiters
= skip_delimiters;
111
}
112
113
bool
CDelimiterTokenizer::get_skip_delimiters
()
const
114
{
115
return
skip_consecutive_delimiters
;
116
}
117
}
SHOGUN
Machine Learning Toolbox - Documentation