SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CSVFile.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evgeniy Andreev (gsomix)
8  */
9 
10 #include <shogun/io/CSVFile.h>
11 
12 #include <shogun/io/SGIO.h>
13 #include <shogun/lib/SGVector.h>
14 #include <shogun/io/LineReader.h>
15 #include <shogun/io/Parser.h>
17 
18 using namespace shogun;
19 
21 {
22  init();
23 }
24 
25 CCSVFile::CCSVFile(FILE* f, const char* name) :
26  CFile(f, name)
27 {
28  init();
29  init_with_defaults();
30 }
31 
32 CCSVFile::CCSVFile(int fd, const char* mode, const char* name) :
33  CFile(fd, mode, name)
34 {
35  init();
36  init_with_defaults();
37 }
38 
39 CCSVFile::CCSVFile(const char* fname, char rw, const char* name) :
40  CFile(fname, rw, name)
41 {
42  init();
43  init_with_defaults();
44 }
45 
47 {
48  SG_UNREF(m_tokenizer);
49  SG_UNREF(m_line_tokenizer);
50  SG_UNREF(m_parser);
51  SG_UNREF(m_line_reader);
52 }
53 
54 void CCSVFile::set_transpose(bool value)
55 {
56  is_data_transposed=value;
57 }
58 
59 void CCSVFile::set_delimiter(char delimiter)
60 {
61  m_tokenizer->delimiters[m_delimiter]=0;
62 
63  m_delimiter=delimiter;
64  m_tokenizer->delimiters[m_delimiter]=1;
65 
66  m_tokenizer->delimiters[' ']=1;
67 }
68 
69 void CCSVFile::set_lines_to_skip(int32_t num_lines)
70 {
71  m_num_to_skip=num_lines;
72 }
73 
74 int32_t CCSVFile::get_stats(int32_t& num_tokens)
75 {
76  int32_t num_lines=0;
77  num_tokens=-1;
78 
79  while (m_line_reader->has_next())
80  {
81  if (num_tokens==-1)
82  {
83  SGVector<char> line=m_line_reader->read_line();
84  m_tokenizer->set_text(line);
85 
86  num_tokens=0;
87  while (m_tokenizer->has_next())
88  {
89  index_t temp_start=0;
90  m_tokenizer->next_token_idx(temp_start);
91  num_tokens++;
92  }
93  }
94  else
95  m_line_reader->skip_line();
96  num_lines++;
97  }
98  m_line_reader->reset();
99 
100  return num_lines;
101 }
102 
103 void CCSVFile::init()
104 {
105  is_data_transposed=false;
106  m_delimiter=0;
107  m_num_to_skip=0;
108 
109  m_tokenizer=NULL;
110  m_line_tokenizer=NULL;
111  m_parser=NULL;
112  m_line_reader=NULL;
113 }
114 
115 void CCSVFile::init_with_defaults()
116 {
117  is_data_transposed=false;
118  m_delimiter=',';
119 
120  m_tokenizer=new CDelimiterTokenizer(true);
121  m_tokenizer->delimiters[m_delimiter]=1;
122  m_tokenizer->delimiters[' ']=1;
123  SG_REF(m_tokenizer);
124 
125  m_line_tokenizer=new CDelimiterTokenizer(true);
126  m_line_tokenizer->delimiters['\n']=1;
127  SG_REF(m_line_tokenizer);
128 
129  m_parser=new CParser();
130  m_parser->set_tokenizer(m_tokenizer);
131 
132  m_line_reader=new CLineReader(file, m_line_tokenizer);
133 }
134 
135 void CCSVFile::skip_lines(int32_t num_lines)
136 {
137  for (int32_t i=0; i<num_lines; i++)
138  m_line_reader->skip_line();
139 }
140 
141 #define GET_VECTOR(read_func, sg_type) \
142 void CCSVFile::get_vector(sg_type*& vector, int32_t& len) \
143 { \
144  if (!m_line_reader->has_next()) \
145  return; \
146  \
147  int32_t num_feat=0; \
148  int32_t num_vec=0; \
149  get_matrix(vector, num_feat, num_vec); \
150  \
151  if (num_feat==1) \
152  { \
153  len=num_vec; \
154  return; \
155  } \
156  \
157  if (num_vec==1) \
158  { \
159  len=num_feat; \
160  return; \
161  } \
162  \
163  len=0; \
164 }
165 
166 GET_VECTOR(read_char, int8_t)
167 GET_VECTOR(read_byte, uint8_t)
168 GET_VECTOR(read_char, char)
169 GET_VECTOR(read_int, int32_t)
170 GET_VECTOR(read_uint, uint32_t)
171 GET_VECTOR(read_short_real, float32_t)
172 GET_VECTOR(read_real, float64_t)
173 GET_VECTOR(read_long_real, floatmax_t)
174 GET_VECTOR(read_short, int16_t)
175 GET_VECTOR(read_word, uint16_t)
176 GET_VECTOR(read_long, int64_t)
177 GET_VECTOR(read_ulong, uint64_t)
178 #undef GET_VECTOR
179 
180 #define GET_MATRIX(read_func, sg_type) \
181 void CCSVFile::get_matrix(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
182 { \
183  int32_t num_lines=0; \
184  int32_t num_tokens=-1; \
185  int32_t current_line_idx=0; \
186  SGVector<char> line; \
187  \
188  skip_lines(m_num_to_skip); \
189  num_lines=get_stats(num_tokens); \
190  \
191  SG_SET_LOCALE_C; \
192  \
193  matrix=SG_MALLOC(sg_type, num_lines*num_tokens); \
194  skip_lines(m_num_to_skip); \
195  while (m_line_reader->has_next()) \
196  { \
197  line=m_line_reader->read_line(); \
198  m_parser->set_text(line); \
199  \
200  for (int32_t i=0; i<num_tokens; i++) \
201  { \
202  if (!m_parser->has_next()) \
203  return; \
204  \
205  if (!is_data_transposed) \
206  matrix[i+current_line_idx*num_tokens]=m_parser->read_func(); \
207  else \
208  matrix[current_line_idx+i*num_tokens]=m_parser->read_func(); \
209  } \
210  current_line_idx++; \
211  } \
212  \
213  SG_RESET_LOCALE; \
214  \
215  if (!is_data_transposed) \
216  { \
217  num_feat=num_tokens; \
218  num_vec=num_lines; \
219  } \
220  else \
221  { \
222  num_feat=num_lines; \
223  num_vec=num_tokens; \
224  } \
225 }
226 
227 GET_MATRIX(read_char, int8_t)
228 GET_MATRIX(read_byte, uint8_t)
229 GET_MATRIX(read_char, char)
230 GET_MATRIX(read_int, int32_t)
231 GET_MATRIX(read_uint, uint32_t)
232 GET_MATRIX(read_short_real, float32_t)
233 GET_MATRIX(read_real, float64_t)
234 GET_MATRIX(read_long_real, floatmax_t)
235 GET_MATRIX(read_short, int16_t)
236 GET_MATRIX(read_word, uint16_t)
237 GET_MATRIX(read_long, int64_t)
238 GET_MATRIX(read_ulong, uint64_t)
239 #undef GET_MATRIX
240 
241 #define GET_NDARRAY(read_func, sg_type) \
242 void CCSVFile::get_ndarray(sg_type*& array, int32_t*& dims, int32_t& num_dims) \
243 { \
244  SG_NOTIMPLEMENTED \
245 }
246 
247 GET_NDARRAY(read_byte, uint8_t)
248 GET_NDARRAY(read_char, char)
249 GET_NDARRAY(read_int, int32_t)
250 GET_NDARRAY(read_short_real, float32_t)
251 GET_NDARRAY(read_real, float64_t)
252 GET_NDARRAY(read_short, int16_t)
253 GET_NDARRAY(read_word, uint16_t)
254 #undef GET_NDARRAY
255 
256 #define GET_SPARSE_MATRIX(read_func, sg_type) \
257 void CCSVFile::get_sparse_matrix( \
258  SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
259 { \
260  SG_NOTIMPLEMENTED \
261 }
262 
263 GET_SPARSE_MATRIX(read_char, bool)
264 GET_SPARSE_MATRIX(read_char, int8_t)
265 GET_SPARSE_MATRIX(read_byte, uint8_t)
266 GET_SPARSE_MATRIX(read_char, char)
267 GET_SPARSE_MATRIX(read_int, int32_t)
268 GET_SPARSE_MATRIX(read_uint, uint32_t)
269 GET_SPARSE_MATRIX(read_short_real, float32_t)
270 GET_SPARSE_MATRIX(read_real, float64_t)
271 GET_SPARSE_MATRIX(read_long_real, floatmax_t)
272 GET_SPARSE_MATRIX(read_short, int16_t)
273 GET_SPARSE_MATRIX(read_word, uint16_t)
274 GET_SPARSE_MATRIX(read_long, int64_t)
275 GET_SPARSE_MATRIX(read_ulong, uint64_t)
276 #undef GET_SPARSE_MATRIX
277 
278 #define SET_VECTOR(format, sg_type) \
279 void CCSVFile::set_vector(const sg_type* vector, int32_t len) \
280 { \
281  SG_SET_LOCALE_C; \
282  \
283  if (!is_data_transposed) \
284  { \
285  for (int32_t i=0; i<len; i++) \
286  fprintf(file, "%" format "\n", vector[i]); \
287  } \
288  else \
289  { \
290  int32_t i; \
291  for (i=0; i<len-1; i++) \
292  fprintf(file, "%" format "%c", vector[i], m_delimiter); \
293  fprintf(file, "%" format "\n", vector[i]); \
294  } \
295  \
296  SG_RESET_LOCALE; \
297 }
298 
299 SET_VECTOR(SCNi8, int8_t)
300 SET_VECTOR(SCNu8, uint8_t)
301 SET_VECTOR(SCNu8, char)
302 SET_VECTOR(SCNi32, int32_t)
303 SET_VECTOR(SCNu32, uint32_t)
304 SET_VECTOR(SCNi64, int64_t)
305 SET_VECTOR(SCNu64, uint64_t)
306 SET_VECTOR(".16g", float32_t)
307 SET_VECTOR(".16g", float64_t)
308 SET_VECTOR(".16Lg", floatmax_t)
309 SET_VECTOR(SCNi16, int16_t)
310 SET_VECTOR(SCNu16, uint16_t)
311 #undef SET_VECTOR
312 
313 #define SET_MATRIX(format, sg_type) \
314 void CCSVFile::set_matrix(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
315 { \
316  SG_SET_LOCALE_C; \
317  \
318  if (!is_data_transposed) \
319  { \
320  for (int32_t i=0; i<num_vec; i++) \
321  { \
322  int32_t j; \
323  for (j=0; j<num_feat-1; j++) \
324  fprintf(file, "%" format "%c", matrix[j+i*num_feat], m_delimiter); \
325  fprintf(file, "%" format "\n", matrix[j+i*num_feat]); \
326  } \
327  } \
328  else \
329  { \
330  for (int32_t i=0; i<num_feat; i++) \
331  { \
332  int32_t j; \
333  for (j=0; j<num_vec-1; j++) \
334  fprintf(file, "%" format "%c", matrix[i+j*num_vec], m_delimiter); \
335  fprintf(file, "%" format "\n", matrix[i+j*num_vec]); \
336  } \
337  } \
338  \
339  SG_RESET_LOCALE; \
340 }
341 
342 SET_MATRIX(SCNi8, int8_t)
343 SET_MATRIX(SCNu8, uint8_t)
344 SET_MATRIX(SCNu8, char)
345 SET_MATRIX(SCNi32, int32_t)
346 SET_MATRIX(SCNu32, uint32_t)
347 SET_MATRIX(SCNi64, int64_t)
348 SET_MATRIX(SCNu64, uint64_t)
349 SET_MATRIX(".16g", float32_t)
350 SET_MATRIX(".16g", float64_t)
351 SET_MATRIX(".16Lg", floatmax_t)
352 SET_MATRIX(SCNi16, int16_t)
353 SET_MATRIX(SCNu16, uint16_t)
354 #undef SET_MATRIX
355 
356 #define SET_SPARSE_MATRIX(format, sg_type) \
357 void CCSVFile::set_sparse_matrix( \
358  const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
359 { \
360  SG_NOTIMPLEMENTED \
361 }
362 
363 SET_SPARSE_MATRIX(SCNi8, bool)
364 SET_SPARSE_MATRIX(SCNi8, int8_t)
365 SET_SPARSE_MATRIX(SCNu8, uint8_t)
366 SET_SPARSE_MATRIX(SCNu8, char)
367 SET_SPARSE_MATRIX(SCNi32, int32_t)
368 SET_SPARSE_MATRIX(SCNu32, uint32_t)
369 SET_SPARSE_MATRIX(SCNi64, int64_t)
370 SET_SPARSE_MATRIX(SCNu64, uint64_t)
374 SET_SPARSE_MATRIX(SCNi16, int16_t)
375 SET_SPARSE_MATRIX(SCNu16, uint16_t)
376 #undef SET_SPARSE_MATRIX
377 
379  SGString<char>*& strings, int32_t& num_str,
380  int32_t& max_string_len)
381 {
382  SGVector<char> line;
383  int32_t current_line_idx=0;
384  int32_t num_tokens=0;
385 
386  max_string_len=0;
387  num_str=get_stats(num_tokens);
388  strings=SG_MALLOC(SGString<char>, num_str);
389 
390  skip_lines(m_num_to_skip);
391  while (m_line_reader->has_next())
392  {
393  line=m_line_reader->read_line();
394  strings[current_line_idx].slen=line.vlen;
395  strings[current_line_idx].string=SG_MALLOC(char, line.vlen);
396  for (int32_t i=0; i<line.vlen; i++)
397  strings[current_line_idx].string[i]=line[i];
398 
399  if (line.vlen>max_string_len)
400  max_string_len=line.vlen;
401 
402  current_line_idx++;
403  }
404 
405  num_str=current_line_idx;
406 }
407 
408 #define GET_STRING_LIST(sg_type) \
409 void CCSVFile::get_string_list( \
410  SGString<sg_type>*& strings, int32_t& num_str, \
411  int32_t& max_string_len) \
412 { \
413  SG_NOTIMPLEMENTED \
414 }
415 
416 GET_STRING_LIST(int8_t)
417 GET_STRING_LIST(uint8_t)
418 GET_STRING_LIST(int32_t)
419 GET_STRING_LIST(uint32_t)
420 GET_STRING_LIST(int64_t)
421 GET_STRING_LIST(uint64_t)
425 GET_STRING_LIST(int16_t)
426 GET_STRING_LIST(uint16_t)
427 #undef GET_STRING_LIST
428 
430  const SGString<char>* strings, int32_t num_str)
431 {
432  for (int32_t i=0; i<num_str; i++)
433  {
434  for (int32_t j=0; j<strings[i].slen; j++)
435  fprintf(file, "%c", strings[i].string[j]);
436  fprintf(file, "\n");
437  }
438 }
439 
440 #define SET_STRING_LIST(sg_type) \
441 void CCSVFile::set_string_list( \
442  const SGString<sg_type>* strings, int32_t num_str) \
443 { \
444  SG_NOTIMPLEMENTED \
445 }
446 
447 SET_STRING_LIST(int8_t)
448 SET_STRING_LIST(uint8_t)
449 SET_STRING_LIST(int32_t)
450 SET_STRING_LIST(uint32_t)
451 SET_STRING_LIST(int64_t)
452 SET_STRING_LIST(uint64_t)
456 SET_STRING_LIST(int16_t)
457 SET_STRING_LIST(uint16_t)
458 #undef SET_STRING_LIST

SHOGUN Machine Learning Toolbox - Documentation