SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
LibSVMFile.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evgeniy Andreev (gsomix)
8  */
9 
10 #include <shogun/io/LibSVMFile.h>
11 
12 #include <shogun/lib/SGVector.h>
14 #include <shogun/base/DynArray.h>
15 
16 using namespace shogun;
17 
19 {
20  init();
21 }
22 
23 CLibSVMFile::CLibSVMFile(FILE* f, const char* name) :
24  CFile(f, name)
25 {
26  init();
27  init_with_defaults();
28 }
29 
30 CLibSVMFile::CLibSVMFile(const char* fname, char rw, const char* name) :
31  CFile(fname, rw, name)
32 {
33  init();
34  init_with_defaults();
35 }
36 
38 {
39  SG_UNREF(m_whitespace_tokenizer);
40  SG_UNREF(m_delimiter_tokenizer);
41  SG_UNREF(m_line_tokenizer);
42  SG_UNREF(m_parser);
43  SG_UNREF(m_line_reader);
44 }
45 
46 void CLibSVMFile::init()
47 {
48  m_delimiter=0;
49 
50  m_whitespace_tokenizer=NULL;
51  m_delimiter_tokenizer=NULL;
52  m_line_tokenizer=NULL;
53  m_parser=NULL;
54  m_line_reader=NULL;
55 }
56 
57 void CLibSVMFile::init_with_defaults()
58 {
59  m_delimiter=':';
60 
61  m_whitespace_tokenizer=new CDelimiterTokenizer(true);
62  m_whitespace_tokenizer->delimiters[' ']=1;
63  SG_REF(m_whitespace_tokenizer);
64 
65  m_delimiter_tokenizer=new CDelimiterTokenizer(true);
66  m_delimiter_tokenizer->delimiters[m_delimiter]=1;
67  SG_REF(m_delimiter_tokenizer);
68 
69  m_line_tokenizer=new CDelimiterTokenizer(true);
70  m_line_tokenizer->delimiters['\n']=1;
71  SG_REF(m_line_tokenizer);
72 
73  m_parser=new CParser();
74  m_line_reader=new CLineReader(file, m_line_tokenizer);
75 }
76 
77 #define GET_SPARSE_MATRIX(read_func, sg_type) \
78 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
79 { \
80  float64_t* labels=NULL; \
81  get_sparse_matrix(matrix, num_feat, num_vec, labels, false); \
82 }
83 
84 GET_SPARSE_MATRIX(read_bool, bool)
85 GET_SPARSE_MATRIX(read_char, int8_t)
86 GET_SPARSE_MATRIX(read_byte, uint8_t)
87 GET_SPARSE_MATRIX(read_char, char)
88 GET_SPARSE_MATRIX(read_int, int32_t)
89 GET_SPARSE_MATRIX(read_uint, uint32_t)
90 GET_SPARSE_MATRIX(read_short_real, float32_t)
91 GET_SPARSE_MATRIX(read_real, float64_t)
92 GET_SPARSE_MATRIX(read_long_real, floatmax_t)
93 GET_SPARSE_MATRIX(read_short, int16_t)
94 GET_SPARSE_MATRIX(read_word, uint16_t)
95 GET_SPARSE_MATRIX(read_long, int64_t)
96 GET_SPARSE_MATRIX(read_ulong, uint64_t)
97 #undef GET_SPARSE_MATRIX
98 
99 #define GET_LABELED_SPARSE_MATRIX(read_func, sg_type) \
100 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec, \
101  float64_t*& labels, bool load_labels) \
102 { \
103  num_feat=0; \
104  \
105  SG_INFO("counting line numbers in file %s\n", filename) \
106  num_vec=get_num_lines(); \
107  \
108  int32_t current_line_ind=0; \
109  SGVector<char> line; \
110  \
111  int32_t num_entries=0; \
112  DynArray<SGVector<char> > entries; \
113  \
114  matrix=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \
115  if (load_labels) \
116  labels=SG_MALLOC(float64_t, num_vec); \
117  \
118  SG_SET_LOCALE_C; \
119  \
120  while (m_line_reader->has_next()) \
121  { \
122  num_entries=0; \
123  entries.reset(SGVector<char>(false)); \
124  line=m_line_reader->read_line(); \
125  \
126  m_parser->set_tokenizer(m_whitespace_tokenizer); \
127  m_parser->set_text(line); \
128  \
129  if (load_labels && m_parser->has_next()) \
130  labels[current_line_ind]=m_parser->read_real(); \
131  \
132  while (m_parser->has_next()) \
133  { \
134  entries.push_back(m_parser->read_string()); \
135  num_entries++; \
136  } \
137  \
138  matrix[current_line_ind]=SGSparseVector<sg_type>(num_entries); \
139  for (int32_t i=0; i<num_entries; i++) \
140  { \
141  m_parser->set_tokenizer(m_delimiter_tokenizer); \
142  m_parser->set_text(entries[i]); \
143  \
144  int32_t feat_index=0; \
145  if (m_parser->has_next()) \
146  feat_index=m_parser->read_int(); \
147  \
148  sg_type entry=0; \
149  if (m_parser->has_next()) \
150  entry=m_parser->read_func(); \
151  \
152  if (feat_index>num_feat) \
153  num_feat=feat_index; \
154  \
155  matrix[current_line_ind].features[i].feat_index=feat_index-1; \
156  matrix[current_line_ind].features[i].entry=entry; \
157  } \
158  \
159  current_line_ind++; \
160  SG_PROGRESS(current_line_ind, 0, num_vec, 1, "LOADING:\t") \
161  } \
162  \
163  SG_RESET_LOCALE; \
164  \
165  SG_INFO("file successfully read\n") \
166 }
167 
168 GET_LABELED_SPARSE_MATRIX(read_bool, bool)
169 GET_LABELED_SPARSE_MATRIX(read_char, int8_t)
170 GET_LABELED_SPARSE_MATRIX(read_byte, uint8_t)
171 GET_LABELED_SPARSE_MATRIX(read_char, char)
172 GET_LABELED_SPARSE_MATRIX(read_int, int32_t)
173 GET_LABELED_SPARSE_MATRIX(read_uint, uint32_t)
174 GET_LABELED_SPARSE_MATRIX(read_short_real, float32_t)
176 GET_LABELED_SPARSE_MATRIX(read_long_real, floatmax_t)
177 GET_LABELED_SPARSE_MATRIX(read_short, int16_t)
178 GET_LABELED_SPARSE_MATRIX(read_word, uint16_t)
179 GET_LABELED_SPARSE_MATRIX(read_long, int64_t)
180 GET_LABELED_SPARSE_MATRIX(read_ulong, uint64_t)
181 #undef GET_LABELED_SPARSE_MATRIX
182 
183 #define SET_SPARSE_MATRIX(format, sg_type) \
184 void CLibSVMFile::set_sparse_matrix( \
185  const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
186 { \
187  set_sparse_matrix(matrix, num_feat, num_vec, NULL); \
188 }
189 
190 SET_SPARSE_MATRIX(SCNi32, bool)
191 SET_SPARSE_MATRIX(SCNi8, int8_t)
192 SET_SPARSE_MATRIX(SCNu8, uint8_t)
193 SET_SPARSE_MATRIX(SCNu8, char)
194 SET_SPARSE_MATRIX(SCNi32, int32_t)
195 SET_SPARSE_MATRIX(SCNu32, uint32_t)
196 SET_SPARSE_MATRIX(SCNi64, int64_t)
197 SET_SPARSE_MATRIX(SCNu64, uint64_t)
201 SET_SPARSE_MATRIX(SCNi16, int16_t)
202 SET_SPARSE_MATRIX(SCNu16, uint16_t)
203 #undef SET_SPARSE_MATRIX
204 
205 #define SET_LABELED_SPARSE_MATRIX(format, sg_type) \
206 void CLibSVMFile::set_sparse_matrix( \
207  const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec, \
208  const float64_t* labels) \
209 { \
210  SG_SET_LOCALE_C; \
211  \
212  for (int32_t i=0; i<num_vec; i++) \
213  { \
214  if (labels!=NULL) \
215  fprintf(file, "%lg ", labels[i]); \
216  \
217  for (int32_t j=0; j<matrix[i].num_feat_entries; j++) \
218  { \
219  fprintf(file, "%d%c%" format " ", \
220  matrix[i].features[j].feat_index+1, \
221  m_delimiter, \
222  matrix[i].features[j].entry); \
223  } \
224  fprintf(file, "\n"); \
225  } \
226  \
227  SG_RESET_LOCALE; \
228 }
229 
230 SET_LABELED_SPARSE_MATRIX(SCNi32, bool)
231 SET_LABELED_SPARSE_MATRIX(SCNi8, int8_t)
232 SET_LABELED_SPARSE_MATRIX(SCNu8, uint8_t)
233 SET_LABELED_SPARSE_MATRIX(SCNu8, char)
234 SET_LABELED_SPARSE_MATRIX(SCNi32, int32_t)
235 SET_LABELED_SPARSE_MATRIX(SCNu32, uint32_t)
236 SET_LABELED_SPARSE_MATRIX(SCNi64, int64_t)
237 SET_LABELED_SPARSE_MATRIX(SCNu64, uint64_t)
241 SET_LABELED_SPARSE_MATRIX(SCNi16, int16_t)
242 SET_LABELED_SPARSE_MATRIX(SCNu16, uint16_t)
243 #undef SET_LABELED_SPARSE_MATRIX
244 
245 int32_t CLibSVMFile::get_num_lines()
246 {
247  int32_t num_lines=0;
248  while (m_line_reader->has_next())
249  {
250  m_line_reader->skip_line();
251  num_lines++;
252  }
253  m_line_reader->reset();
254 
255  return num_lines;
256 }

SHOGUN Machine Learning Toolbox - Documentation