SHOGUN  4.1.0
 全部  命名空间 文件 函数 变量 类型定义 枚举 枚举值 友元 宏定义  
CSVFile.cpp
浏览该文件的文档.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evgeniy Andreev (gsomix)
8  */
9 
10 #include <shogun/io/CSVFile.h>
11 
12 #include <shogun/io/SGIO.h>
13 #include <shogun/lib/SGVector.h>
14 #include <shogun/io/LineReader.h>
15 #include <shogun/io/Parser.h>
17 
18 using namespace shogun;
19 
/* NOTE(review): the signature line (listing line 20) is missing from this
 * extract; presumably this is the body of the default constructor
 * CCSVFile::CCSVFile() -- confirm against the original file.
 * Unlike the other constructors it only calls init(), so the tokenizer,
 * line tokenizer, parser and line reader members are left NULL. */
21 {
22  init();
23 }
24 
25 CCSVFile::CCSVFile(FILE* f, const char* name) :
26  CFile(f, name)
27 {
28  init();
29  init_with_defaults();
30 }
31 
32 #ifdef HAVE_FDOPEN
33 CCSVFile::CCSVFile(int fd, const char* mode, const char* name) :
34  CFile(fd, mode, name)
35 {
36  init();
37  init_with_defaults();
38 }
39 #endif
40 
41 CCSVFile::CCSVFile(const char* fname, char rw, const char* name) :
42  CFile(fname, rw, name)
43 {
44  init();
45  init_with_defaults();
46 }
47 
/* NOTE(review): the signature line (listing line 48) is missing from this
 * extract; the cross-reference index of this page places ~CCSVFile() at
 * CSVFile.cpp:48, so this is presumably the destructor body.
 * Drops this object's reference on each of the four helper objects. */
49 {
50  SG_UNREF(m_tokenizer);
51  SG_UNREF(m_line_tokenizer);
52  SG_UNREF(m_parser);
53  SG_UNREF(m_line_reader);
54 }
55 
56 void CCSVFile::set_transpose(bool value)
57 {
58  is_data_transposed=value;
59 }
60 
61 void CCSVFile::set_delimiter(char delimiter)
62 {
63  m_tokenizer->delimiters[m_delimiter]=0;
64 
65  m_delimiter=delimiter;
66  m_tokenizer->delimiters[m_delimiter]=1;
67 
68  m_tokenizer->delimiters[' ']=1;
69 }
70 
71 void CCSVFile::set_lines_to_skip(int32_t num_lines)
72 {
73  m_num_to_skip=num_lines;
74 }
75 
76 int32_t CCSVFile::get_stats(int32_t& num_tokens)
77 {
78  int32_t num_lines=0;
79  num_tokens=-1;
80 
81  while (m_line_reader->has_next())
82  {
83  if (num_tokens==-1)
84  {
85  SGVector<char> line=m_line_reader->read_line();
86  m_tokenizer->set_text(line);
87 
88  num_tokens=0;
89  while (m_tokenizer->has_next())
90  {
91  index_t temp_start=0;
92  m_tokenizer->next_token_idx(temp_start);
93  num_tokens++;
94  }
95  }
96  else
97  m_line_reader->skip_line();
98  num_lines++;
99  }
100  m_line_reader->reset();
101 
102  return num_lines;
103 }
104 
105 void CCSVFile::init()
106 {
107  is_data_transposed=false;
108  m_delimiter=0;
109  m_num_to_skip=0;
110 
111  m_tokenizer=NULL;
112  m_line_tokenizer=NULL;
113  m_parser=NULL;
114  m_line_reader=NULL;
115 }
116 
117 void CCSVFile::init_with_defaults()
118 {
119  is_data_transposed=false;
120  m_delimiter=',';
121 
122  m_tokenizer=new CDelimiterTokenizer(true);
123  m_tokenizer->delimiters[m_delimiter]=1;
124  m_tokenizer->delimiters[' ']=1;
125  SG_REF(m_tokenizer);
126 
127  m_line_tokenizer=new CDelimiterTokenizer(true);
128  m_line_tokenizer->delimiters['\n']=1;
129  SG_REF(m_line_tokenizer);
130 
131  m_parser=new CParser();
132  m_parser->set_tokenizer(m_tokenizer);
133 
134  m_line_reader=new CLineReader(file, m_line_tokenizer);
135 }
136 
137 void CCSVFile::skip_lines(int32_t num_lines)
138 {
139  for (int32_t i=0; i<num_lines; i++)
140  m_line_reader->skip_line();
141 }
142 
143 #define GET_VECTOR(read_func, sg_type) \
144 void CCSVFile::get_vector(sg_type*& vector, int32_t& len) \
145 { \
146  if (!m_line_reader->has_next()) \
147  return; \
148  \
149  int32_t num_feat=0; \
150  int32_t num_vec=0; \
151  get_matrix(vector, num_feat, num_vec); \
152  \
153  if (num_feat==1) \
154  { \
155  len=num_vec; \
156  return; \
157  } \
158  \
159  if (num_vec==1) \
160  { \
161  len=num_feat; \
162  return; \
163  } \
164  \
165  len=0; \
166 }
167 
168 GET_VECTOR(read_char, int8_t)
169 GET_VECTOR(read_byte, uint8_t)
170 GET_VECTOR(read_char, char)
171 GET_VECTOR(read_int, int32_t)
172 GET_VECTOR(read_uint, uint32_t)
173 GET_VECTOR(read_short_real, float32_t)
174 GET_VECTOR(read_real, float64_t)
175 GET_VECTOR(read_long_real, floatmax_t)
176 GET_VECTOR(read_short, int16_t)
177 GET_VECTOR(read_word, uint16_t)
178 GET_VECTOR(read_long, int64_t)
179 GET_VECTOR(read_ulong, uint64_t)
180 #undef GET_VECTOR
181 
182 #define GET_MATRIX(read_func, sg_type) \
183 void CCSVFile::get_matrix(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
184 { \
185  int32_t num_lines=0; \
186  int32_t num_tokens=-1; \
187  int32_t current_line_idx=0; \
188  SGVector<char> line; \
189  \
190  skip_lines(m_num_to_skip); \
191  num_lines=get_stats(num_tokens); \
192  \
193  SG_SET_LOCALE_C; \
194  \
195  matrix=SG_MALLOC(sg_type, num_lines*num_tokens); \
196  skip_lines(m_num_to_skip); \
197  while (m_line_reader->has_next()) \
198  { \
199  line=m_line_reader->read_line(); \
200  m_parser->set_text(line); \
201  \
202  for (int32_t i=0; i<num_tokens; i++) \
203  { \
204  if (!m_parser->has_next()) \
205  return; \
206  \
207  if (!is_data_transposed) \
208  matrix[i+current_line_idx*num_tokens]=m_parser->read_func(); \
209  else \
210  matrix[current_line_idx+i*num_tokens]=m_parser->read_func(); \
211  } \
212  current_line_idx++; \
213  } \
214  \
215  SG_RESET_LOCALE; \
216  \
217  if (!is_data_transposed) \
218  { \
219  num_feat=num_tokens; \
220  num_vec=num_lines; \
221  } \
222  else \
223  { \
224  num_feat=num_lines; \
225  num_vec=num_tokens; \
226  } \
227 }
228 
229 GET_MATRIX(read_char, int8_t)
230 GET_MATRIX(read_byte, uint8_t)
231 GET_MATRIX(read_char, char)
232 GET_MATRIX(read_int, int32_t)
233 GET_MATRIX(read_uint, uint32_t)
234 GET_MATRIX(read_short_real, float32_t)
235 GET_MATRIX(read_real, float64_t)
236 GET_MATRIX(read_long_real, floatmax_t)
237 GET_MATRIX(read_short, int16_t)
238 GET_MATRIX(read_word, uint16_t)
239 GET_MATRIX(read_long, int64_t)
240 GET_MATRIX(read_ulong, uint64_t)
241 #undef GET_MATRIX
242 
/**
 * Generates CCSVFile::get_ndarray() for one element type.
 * Reading n-dimensional arrays is not supported by this class: the generated
 * body consists solely of SG_NOTIMPLEMENTED and never touches its arguments.
 * The read_func argument is not used by this macro.
 */
243 #define GET_NDARRAY(read_func, sg_type) \
244 void CCSVFile::get_ndarray(sg_type*& array, int32_t*& dims, int32_t& num_dims) \
245 { \
246  SG_NOTIMPLEMENTED \
247 }
248 
249 GET_NDARRAY(read_byte, uint8_t)
250 GET_NDARRAY(read_char, char)
251 GET_NDARRAY(read_int, int32_t)
252 GET_NDARRAY(read_short_real, float32_t)
253 GET_NDARRAY(read_real, float64_t)
254 GET_NDARRAY(read_short, int16_t)
255 GET_NDARRAY(read_word, uint16_t)
256 #undef GET_NDARRAY
257 
/**
 * Generates CCSVFile::get_sparse_matrix() for one element type.
 * Reading sparse matrices is not supported by this class: the generated body
 * consists solely of SG_NOTIMPLEMENTED and never touches its arguments.
 * The read_func argument is not used by this macro.
 */
258 #define GET_SPARSE_MATRIX(read_func, sg_type) \
259 void CCSVFile::get_sparse_matrix( \
260  SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
261 { \
262  SG_NOTIMPLEMENTED \
263 }
264 
265 GET_SPARSE_MATRIX(read_char, bool)
266 GET_SPARSE_MATRIX(read_char, int8_t)
267 GET_SPARSE_MATRIX(read_byte, uint8_t)
268 GET_SPARSE_MATRIX(read_char, char)
269 GET_SPARSE_MATRIX(read_int, int32_t)
270 GET_SPARSE_MATRIX(read_uint, uint32_t)
271 GET_SPARSE_MATRIX(read_short_real, float32_t)
272 GET_SPARSE_MATRIX(read_real, float64_t)
273 GET_SPARSE_MATRIX(read_long_real, floatmax_t)
274 GET_SPARSE_MATRIX(read_short, int16_t)
275 GET_SPARSE_MATRIX(read_word, uint16_t)
276 GET_SPARSE_MATRIX(read_long, int64_t)
277 GET_SPARSE_MATRIX(read_ulong, uint64_t)
278 #undef GET_SPARSE_MATRIX
279 
280 #define SET_VECTOR(format, sg_type) \
281 void CCSVFile::set_vector(const sg_type* vector, int32_t len) \
282 { \
283  SG_SET_LOCALE_C; \
284  \
285  if (!is_data_transposed) \
286  { \
287  for (int32_t i=0; i<len; i++) \
288  fprintf(file, "%" format "\n", vector[i]); \
289  } \
290  else \
291  { \
292  int32_t i; \
293  for (i=0; i<len-1; i++) \
294  fprintf(file, "%" format "%c", vector[i], m_delimiter); \
295  fprintf(file, "%" format "\n", vector[i]); \
296  } \
297  \
298  SG_RESET_LOCALE; \
299 }
300 
301 SET_VECTOR(SCNi8, int8_t)
302 SET_VECTOR(SCNu8, uint8_t)
303 SET_VECTOR(SCNu8, char)
304 SET_VECTOR(SCNi32, int32_t)
305 SET_VECTOR(SCNu32, uint32_t)
306 SET_VECTOR(SCNi64, int64_t)
307 SET_VECTOR(SCNu64, uint64_t)
308 SET_VECTOR(".16g", float32_t)
309 SET_VECTOR(".16g", float64_t)
310 SET_VECTOR(".16Lg", floatmax_t)
311 SET_VECTOR(SCNi16, int16_t)
312 SET_VECTOR(SCNu16, uint16_t)
313 #undef SET_VECTOR
314 
315 #define SET_MATRIX(format, sg_type) \
316 void CCSVFile::set_matrix(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
317 { \
318  SG_SET_LOCALE_C; \
319  \
320  if (!is_data_transposed) \
321  { \
322  for (int32_t i=0; i<num_vec; i++) \
323  { \
324  int32_t j; \
325  for (j=0; j<num_feat-1; j++) \
326  fprintf(file, "%" format "%c", matrix[j+i*num_feat], m_delimiter); \
327  fprintf(file, "%" format "\n", matrix[j+i*num_feat]); \
328  } \
329  } \
330  else \
331  { \
332  for (int32_t i=0; i<num_feat; i++) \
333  { \
334  int32_t j; \
335  for (j=0; j<num_vec-1; j++) \
336  fprintf(file, "%" format "%c", matrix[i+j*num_vec], m_delimiter); \
337  fprintf(file, "%" format "\n", matrix[i+j*num_vec]); \
338  } \
339  } \
340  \
341  SG_RESET_LOCALE; \
342 }
343 
344 SET_MATRIX(SCNi8, int8_t)
345 SET_MATRIX(SCNu8, uint8_t)
346 SET_MATRIX(SCNu8, char)
347 SET_MATRIX(SCNi32, int32_t)
348 SET_MATRIX(SCNu32, uint32_t)
349 SET_MATRIX(SCNi64, int64_t)
350 SET_MATRIX(SCNu64, uint64_t)
351 SET_MATRIX(".16g", float32_t)
352 SET_MATRIX(".16g", float64_t)
353 SET_MATRIX(".16Lg", floatmax_t)
354 SET_MATRIX(SCNi16, int16_t)
355 SET_MATRIX(SCNu16, uint16_t)
356 #undef SET_MATRIX
357 
/**
 * Generates CCSVFile::set_sparse_matrix() for one element type.
 * Writing sparse matrices is not supported by this class: the generated body
 * consists solely of SG_NOTIMPLEMENTED and never touches its arguments.
 * The format argument is not used by this macro.
 * NOTE(review): the listing numbering jumps from 372 to 376 below, so some
 * instantiations of the original file may be missing from this extract.
 */
358 #define SET_SPARSE_MATRIX(format, sg_type) \
359 void CCSVFile::set_sparse_matrix( \
360  const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
361 { \
362  SG_NOTIMPLEMENTED \
363 }
364 
365 SET_SPARSE_MATRIX(SCNi8, bool)
366 SET_SPARSE_MATRIX(SCNi8, int8_t)
367 SET_SPARSE_MATRIX(SCNu8, uint8_t)
368 SET_SPARSE_MATRIX(SCNu8, char)
369 SET_SPARSE_MATRIX(SCNi32, int32_t)
370 SET_SPARSE_MATRIX(SCNu32, uint32_t)
371 SET_SPARSE_MATRIX(SCNi64, int64_t)
372 SET_SPARSE_MATRIX(SCNu64, uint64_t)
376 SET_SPARSE_MATRIX(SCNi16, int16_t)
377 SET_SPARSE_MATRIX(SCNu16, uint16_t)
378 #undef SET_SPARSE_MATRIX
379 
/* NOTE(review): the line that opens this definition (listing line 380) is
 * missing from this extract; presumably it reads
 * "void CCSVFile::get_string_list(" -- confirm against the original file.
 *
 * Reads the file line by line into a newly allocated array of SGString<char>,
 * one string per line, after skipping m_num_to_skip lines.
 *
 * strings        receives the SG_MALLOC'ed array; every stored entry owns a
 *                separately SG_MALLOC'ed character buffer
 * num_str        receives the number of lines actually stored
 * max_string_len receives the length of the longest stored line
 */
381  SGString<char>*& strings, int32_t& num_str,
382  int32_t& max_string_len)
383 {
384  SGVector<char> line;
385  int32_t current_line_idx=0;
386  int32_t num_tokens=0;
387 
388  max_string_len=0;
/* get_stats() runs before skip_lines(), so this count -- and the array size
 * derived from it -- also includes the lines that are skipped below */
389  num_str=get_stats(num_tokens);
390  strings=SG_MALLOC(SGString<char>, num_str);
391 
392  skip_lines(m_num_to_skip);
393  while (m_line_reader->has_next())
394  {
395  line=m_line_reader->read_line();
/* the buffer holds exactly vlen characters; no terminating '\0' is added */
396  strings[current_line_idx].slen=line.vlen;
397  strings[current_line_idx].string=SG_MALLOC(char, line.vlen);
398  for (int32_t i=0; i<line.vlen; i++)
399  strings[current_line_idx].string[i]=line[i];
400 
401  if (line.vlen>max_string_len)
402  max_string_len=line.vlen;
403 
404  current_line_idx++;
405  }
406 
/* report only the lines that were really read */
407  num_str=current_line_idx;
408 }
409 
/**
 * Generates CCSVFile::get_string_list() for one non-char element type.
 * These overloads are not supported by this class: the generated body
 * consists solely of SG_NOTIMPLEMENTED and never touches its arguments.
 * NOTE(review): the listing numbering jumps from 423 to 427 below, so some
 * instantiations of the original file may be missing from this extract.
 */
410 #define GET_STRING_LIST(sg_type) \
411 void CCSVFile::get_string_list( \
412  SGString<sg_type>*& strings, int32_t& num_str, \
413  int32_t& max_string_len) \
414 { \
415  SG_NOTIMPLEMENTED \
416 }
417 
418 GET_STRING_LIST(int8_t)
419 GET_STRING_LIST(uint8_t)
420 GET_STRING_LIST(int32_t)
421 GET_STRING_LIST(uint32_t)
422 GET_STRING_LIST(int64_t)
423 GET_STRING_LIST(uint64_t)
427 GET_STRING_LIST(int16_t)
428 GET_STRING_LIST(uint16_t)
429 #undef GET_STRING_LIST
430 
/* NOTE(review): the line that opens this definition (listing line 431) is
 * missing from this extract; presumably it reads
 * "void CCSVFile::set_string_list(" -- confirm against the original file.
 *
 * Writes each of the num_str strings to the file on its own line: the slen
 * characters of a string are emitted one by one, followed by a newline.
 * No delimiter handling or escaping is applied.
 */
432  const SGString<char>* strings, int32_t num_str)
433 {
434  for (int32_t i=0; i<num_str; i++)
435  {
436  for (int32_t j=0; j<strings[i].slen; j++)
437  fprintf(file, "%c", strings[i].string[j]);
438  fprintf(file, "\n");
439  }
440 }
441 
/**
 * Generates CCSVFile::set_string_list() for one non-char element type.
 * These overloads are not supported by this class: the generated body
 * consists solely of SG_NOTIMPLEMENTED and never touches its arguments.
 * NOTE(review): the listing numbering jumps from 454 to 458 below, so some
 * instantiations of the original file may be missing from this extract.
 */
442 #define SET_STRING_LIST(sg_type) \
443 void CCSVFile::set_string_list( \
444  const SGString<sg_type>* strings, int32_t num_str) \
445 { \
446  SG_NOTIMPLEMENTED \
447 }
448 
449 SET_STRING_LIST(int8_t)
450 SET_STRING_LIST(uint8_t)
451 SET_STRING_LIST(int32_t)
452 SET_STRING_LIST(uint32_t)
453 SET_STRING_LIST(int64_t)
454 SET_STRING_LIST(uint64_t)
458 SET_STRING_LIST(int16_t)
459 SET_STRING_LIST(uint16_t)
460 #undef SET_STRING_LIST
virtual bool has_next()
Definition: LineReader.cpp:59
virtual ~CCSVFile()
Definition: CSVFile.cpp:48
void set_delimiter(char delimiter)
Definition: CSVFile.cpp:61
int32_t index_t
Definition: common.h:62
#define GET_MATRIX(read_func, sg_type)
Definition: CSVFile.cpp:182
void set_transpose(bool value)
Definition: CSVFile.cpp:56
#define GET_NDARRAY(read_func, sg_type)
Definition: CSVFile.cpp:243
virtual index_t next_token_idx(index_t &start)
FILE * file
Definition: File.h:505
virtual void set_string_list(const SGString< uint8_t > *strings, int32_t num_str)
#define GET_SPARSE_MATRIX(read_func, sg_type)
Definition: CSVFile.cpp:258
#define SET_STRING_LIST(sg_type)
Definition: CSVFile.cpp:442
Class for buffered reading from an ASCII file.
Definition: LineReader.h:24
void set_lines_to_skip(int32_t num_lines)
Definition: CSVFile.cpp:71
#define SG_REF(x)
Definition: SGObject.h:51
int32_t get_stats(int32_t &num_tokens)
Definition: CSVFile.cpp:76
virtual SGVector< char > read_line()
Definition: LineReader.cpp:89
#define GET_STRING_LIST(sg_type)
Definition: CSVFile.cpp:410
#define SET_MATRIX(format, sg_type)
Definition: CSVFile.cpp:315
index_t vlen
Definition: SGVector.h:494
Class for reading from a string.
Definition: Parser.h:23
double float64_t
Definition: common.h:50
long double floatmax_t
Definition: common.h:51
A File access base class.
Definition: File.h:34
virtual void skip_line()
Definition: LineReader.cpp:79
#define SET_VECTOR(format, sg_type)
Definition: CSVFile.cpp:280
#define GET_VECTOR(read_func, sg_type)
Definition: CSVFile.cpp:143
#define SET_SPARSE_MATRIX(format, sg_type)
Definition: CSVFile.cpp:358
void set_tokenizer(CTokenizer *tokenizer)
Definition: Parser.cpp:146
float float32_t
Definition: common.h:49
#define SG_UNREF(x)
Definition: SGObject.h:52
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
The class CDelimiterTokenizer is used to tokenize a SGVector into tokens using custom chars as ...
virtual void set_text(SGVector< char > txt)
index_t slen
Definition: SGString.h:79
virtual void get_string_list(SGString< uint8_t > *&strings, int32_t &num_str, int32_t &max_string_len)

SHOGUN 机器学习工具包 - 项目文档