SHOGUN
v2.0.0
Main Page
Related Pages
Modules
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
src
shogun
features
StringFileFeatures.cpp
Go to the documentation of this file.
1
#include <
shogun/features/StringFileFeatures.h
>
2
3
namespace
shogun
4
{
5
6
/** Default constructor.
 *
 * Default-constructs the CStringFeatures base and leaves the object
 * without a backing file (file is NULL).
 */
template <class ST>
CStringFileFeatures<ST>::CStringFileFeatures()
: CStringFeatures<ST>(), file(NULL)
{
}
9
10
/** Constructor that opens a file and indexes the lines it contains.
 *
 * @param fname name of the file, handed to CMemoryMappedFile
 * @param alpha alphabet type, forwarded to the CStringFeatures base
 */
template <class ST>
CStringFileFeatures<ST>::CStringFileFeatures(const char* fname, EAlphabet alpha)
: CStringFeatures<ST>(alpha), file(NULL)
{
	file=new CMemoryMappedFile<ST>(fname);
	/* take our own reference on the freshly created object; it is released
	 * by SG_UNREF(file) in the destructor (cleanup() pairs new with SG_REF
	 * for the alphabet in the same way) */
	SG_REF(file);

	fetch_meta_info_from_file();
}
16
17
/** Destructor.
 *
 * Releases the reference held on the backing file and then resets the
 * feature bookkeeping through cleanup().
 */
template <class ST>
CStringFileFeatures<ST>::~CStringFileFeatures()
{
	SG_UNREF(file);
	CStringFileFeatures<ST>::cleanup();
}
22
23
/** Scan the file contents for the next newline-terminated line.
 *
 * Every symbol that is not a newline is checked against the alphabet; on
 * an invalid symbol cleanup() is called and an error is reported.
 *
 * @param len set to the number of symbols in the returned line (the
 *            newline is not counted); set to 0 if no line is returned
 * @param offs in: index at which scanning starts; out: index just behind
 *             the consumed newline, or file_length if no line is returned
 * @param line_nr incremented by one for every line that is returned
 * @param file_length exclusive upper bound for the scan index
 * @return pointer to the first symbol of the line inside the buffer
 *         obtained from file->get_map(), or NULL if no newline was found
 *         in [offs, file_length)
 *
 * Note: trailing data that is not terminated by a newline is not returned
 * as a line.
 */
template <class ST>
ST* CStringFileFeatures<ST>::get_line(uint64_t& len, uint64_t& offs, int32_t& line_nr, uint64_t file_length)
{
	ST* s=file->get_map();

	for (uint64_t i=offs; i<file_length; i++)
	{
		ST c=s[i];

		if (c=='\n')
		{
			ST* line=&s[offs];
			len=i-offs;
			offs=i+1;
			line_nr++;
			return line;
		}

		/* the alphabet is consulted with the symbol narrowed to 8 bit */
		uint8_t symbol=(uint8_t) c;

		if (!CStringFeatures<ST>::alphabet->is_valid(symbol))
		{
			CStringFileFeatures<ST>::cleanup();
			/* report the narrowed symbol rather than the raw ST value:
			 * this class is also instantiated for 64 bit integer and
			 * floating point types, which do not match a %c conversion */
			CStringFeatures<ST>::SG_ERROR("Invalid character (%c) in line %d\n", symbol, line_nr);
		}
	}

	/* no (further) newline found */
	len=0;
	offs=file_length;
	return NULL;
}
52
53
/** Reset the per-file state of the base class.
 *
 * Sets num_vectors to 0, frees the features array and the symbol mask
 * table and replaces the alphabet by a newly created one.
 */
template <class ST>
void CStringFileFeatures<ST>::cleanup()
{
	CStringFeatures<ST>::num_vectors=0;

	SG_FREE(CStringFeatures<ST>::features);
	SG_FREE(CStringFeatures<ST>::symbol_mask_table);
	CStringFeatures<ST>::features=NULL;
	CStringFeatures<ST>::symbol_mask_table=NULL;

	/* Do not empty the histogram of the current alphabet in place: build a
	 * new alphabet object from get_alphabet() of the current one and only
	 * drop our reference to the old object, so that it stays untouched for
	 * anybody else who is using it. */
	CAlphabet* fresh_alphabet=new CAlphabet(CStringFeatures<ST>::alphabet->get_alphabet());
	SG_UNREF(CStringFeatures<ST>::alphabet);
	CStringFeatures<ST>::alphabet=fresh_alphabet;
	SG_REF(CStringFeatures<ST>::alphabet);
}
70
71
/** Not supported by file based string features; always reports an error.
 *
 * @param num index of the feature vector (unused)
 */
template <class ST>
void CStringFileFeatures<ST>::cleanup_feature_vector(int32_t num)
{
	/* adjacent string literals are joined verbatim, so the blank between
	 * "not" and "supported" has to be part of one of the literals */
	CStringFeatures<ST>::SG_ERROR("Cleaning single feature vector not "
			"supported by StringFileFeatures\n");
}
76
77
/** Index the lines of the backing file.
 *
 * Resets the current state through cleanup() and then calls get_line()
 * until it returns NULL. For every line the start pointer and the length
 * are stored in the features array (the pointers returned by get_line()
 * are stored as they are, no string data is copied), and
 * max_string_length is updated. Finally the alphabet is checked; if a
 * check fails the state is reset again, otherwise the features array is
 * shrunk to num_vectors entries.
 *
 * @param granularity initial number of entries of the features array and
 *                    number of entries by which it grows when it is full;
 *                    has to be >= 1
 */
template <class ST>
void CStringFileFeatures<ST>::fetch_meta_info_from_file(int32_t granularity)
{
	/* check all preconditions before any state is thrown away: a default
	 * constructed object has no file, and cleanup() itself dereferences
	 * the alphabet */
	ASSERT(file);
	ASSERT(granularity>=1);
	ASSERT(CStringFeatures<ST>::alphabet);

	CStringFileFeatures<ST>::cleanup();
	uint64_t file_size=file->get_size();

	int64_t buffer_size=granularity;
	CStringFeatures<ST>::features=SG_MALLOC(SGString<ST>, buffer_size);

	uint64_t offs=0;
	uint64_t len=0;
	CStringFeatures<ST>::max_string_length=0;
	CStringFeatures<ST>::num_vectors=0;

	while (true)
	{
		/* get_line() increments num_vectors for every line it returns, so
		 * afterwards the current line lives at index num_vectors-1 */
		ST* line=get_line(len, offs, CStringFeatures<ST>::num_vectors, file_size);

		if (!line)
			break;

		if (CStringFeatures<ST>::num_vectors > buffer_size)
		{
			buffer_size+=granularity;
			CStringFeatures<ST>::features=SG_REALLOC(SGString<ST>, CStringFeatures<ST>::features, buffer_size);
		}

		CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].string=line;
		CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].slen=len;
		CStringFeatures<ST>::max_string_length=CMath::max(CStringFeatures<ST>::max_string_length, (int32_t) len);
	}

	CStringFeatures<ST>::SG_INFO("number of strings:%d\n", CStringFeatures<ST>::num_vectors);
	CStringFeatures<ST>::SG_INFO("maximum string length:%d\n", CStringFeatures<ST>::max_string_length);
	CStringFeatures<ST>::SG_INFO("max_value_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_max_value_in_histogram());
	CStringFeatures<ST>::SG_INFO("num_symbols_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_num_symbols_in_histogram());

	if (!CStringFeatures<ST>::alphabet->check_alphabet_size() || !CStringFeatures<ST>::alphabet->check_alphabet())
	{
		/* cleanup() has freed the features array and set num_vectors to 0;
		 * keep max_string_length consistent with that and do not
		 * reallocate the freed array to zero entries */
		CStringFileFeatures<ST>::cleanup();
		CStringFeatures<ST>::max_string_length=0;
		return;
	}

	/* give back the unused tail of the features array */
	CStringFeatures<ST>::features=SG_REALLOC(SGString<ST>, CStringFeatures<ST>::features, CStringFeatures<ST>::num_vectors);
}
122
123
template
class
CStringFileFeatures<bool>
;
124
template
class
CStringFileFeatures<char>
;
125
template
class
CStringFileFeatures<int8_t>
;
126
template
class
CStringFileFeatures<uint8_t>
;
127
template
class
CStringFileFeatures<int16_t>
;
128
template
class
CStringFileFeatures<uint16_t>
;
129
template
class
CStringFileFeatures<int32_t>
;
130
template
class
CStringFileFeatures<uint32_t>
;
131
template
class
CStringFileFeatures<int64_t>
;
132
template
class
CStringFileFeatures<uint64_t>
;
133
template
class
CStringFileFeatures<float32_t>
;
134
template
class
CStringFileFeatures<float64_t>
;
135
template
class
CStringFileFeatures<floatmax_t>
;
136
}
SHOGUN
Machine Learning Toolbox - Documentation