SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CSVFile.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evgeniy Andreev (gsomix)
8  */
9 
10 #include <shogun/io/CSVFile.h>
11 
12 #include <shogun/lib/SGVector.h>
13 #include <shogun/lib/SGMatrix.h>
14 
15 using namespace shogun;
16 
18 {
19  init();
20 }
21 
22 CCSVFile::CCSVFile(FILE* f, const char* name) :
23  CFile(f, name)
24 {
25  init();
26  init_with_defaults();
27 }
28 
29 CCSVFile::CCSVFile(int fd, const char* mode, const char* name) :
30  CFile(fd, mode, name)
31 {
32  init();
33  init_with_defaults();
34 }
35 
36 CCSVFile::CCSVFile(const char* fname, char rw, const char* name) :
37  CFile(fname, rw, name)
38 {
39  init();
40  init_with_defaults();
41 }
42 
44 {
45  SG_UNREF(m_tokenizer);
46  SG_UNREF(m_line_tokenizer);
47  SG_UNREF(m_parser);
48  SG_UNREF(m_line_reader);
49 }
50 
51 void CCSVFile::set_transpose(bool value)
52 {
53  is_data_transposed=value;
54 }
55 
56 void CCSVFile::set_delimiter(char delimiter)
57 {
58  m_tokenizer->delimiters[m_delimiter]=0;
59 
60  m_delimiter=delimiter;
61  m_tokenizer->delimiters[m_delimiter]=1;
62 
63  m_tokenizer->delimiters[' ']=1;
64 }
65 
66 void CCSVFile::set_lines_to_skip(int32_t num_lines)
67 {
68  m_num_to_skip=num_lines;
69 }
70 
71 int32_t CCSVFile::get_stats(int32_t& num_tokens)
72 {
73  int32_t num_lines=0;
74  num_tokens=-1;
75 
76  while (m_line_reader->has_next())
77  {
78  if (num_tokens==-1)
79  {
80  SGVector<char> line=m_line_reader->read_line();
81  m_tokenizer->set_text(line);
82 
83  num_tokens=0;
84  while (m_tokenizer->has_next())
85  {
86  index_t temp_start=0;
87  m_tokenizer->next_token_idx(temp_start);
88  num_tokens++;
89  }
90  }
91  else
92  m_line_reader->skip_line();
93  num_lines++;
94  }
95  m_line_reader->reset();
96 
97  return num_lines;
98 }
99 
100 void CCSVFile::init()
101 {
102  is_data_transposed=false;
103  m_delimiter=0;
104  m_num_to_skip=0;
105 
106  m_tokenizer=NULL;
107  m_line_tokenizer=NULL;
108  m_parser=NULL;
109  m_line_reader=NULL;
110 }
111 
112 void CCSVFile::init_with_defaults()
113 {
114  is_data_transposed=false;
115  m_delimiter=',';
116 
117  m_tokenizer=new CDelimiterTokenizer(true);
118  m_tokenizer->delimiters[m_delimiter]=1;
119  m_tokenizer->delimiters[' ']=1;
120  SG_REF(m_tokenizer);
121 
122  m_line_tokenizer=new CDelimiterTokenizer(true);
123  m_line_tokenizer->delimiters['\n']=1;
124  SG_REF(m_line_tokenizer);
125 
126  m_parser=new CParser();
127  m_parser->set_tokenizer(m_tokenizer);
128 
129  m_line_reader=new CLineReader(file, m_line_tokenizer);
130 }
131 
132 void CCSVFile::skip_lines(int32_t num_lines)
133 {
134  for (int32_t i=0; i<num_lines; i++)
135  m_line_reader->skip_line();
136 }
137 
138 #define GET_VECTOR(read_func, sg_type) \
139 void CCSVFile::get_vector(sg_type*& vector, int32_t& len) \
140 { \
141  if (!m_line_reader->has_next()) \
142  return; \
143  \
144  int32_t num_feat=0; \
145  int32_t num_vec=0; \
146  get_matrix(vector, num_feat, num_vec); \
147  \
148  if (num_feat==1) \
149  { \
150  len=num_vec; \
151  return; \
152  } \
153  \
154  if (num_vec==1) \
155  { \
156  len=num_feat; \
157  return; \
158  } \
159  \
160  len=0; \
161 }
162 
163 GET_VECTOR(read_char, int8_t)
164 GET_VECTOR(read_byte, uint8_t)
165 GET_VECTOR(read_char, char)
166 GET_VECTOR(read_int, int32_t)
167 GET_VECTOR(read_uint, uint32_t)
168 GET_VECTOR(read_short_real, float32_t)
169 GET_VECTOR(read_real, float64_t)
170 GET_VECTOR(read_long_real, floatmax_t)
171 GET_VECTOR(read_short, int16_t)
172 GET_VECTOR(read_word, uint16_t)
173 GET_VECTOR(read_long, int64_t)
174 GET_VECTOR(read_ulong, uint64_t)
175 #undef GET_VECTOR
176 
177 #define GET_MATRIX(read_func, sg_type) \
178 void CCSVFile::get_matrix(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
179 { \
180  int32_t num_lines=0; \
181  int32_t num_tokens=-1; \
182  int32_t current_line_idx=0; \
183  SGVector<char> line; \
184  \
185  skip_lines(m_num_to_skip); \
186  num_lines=get_stats(num_tokens); \
187  \
188  SG_SET_LOCALE_C; \
189  \
190  matrix=SG_MALLOC(sg_type, num_lines*num_tokens); \
191  skip_lines(m_num_to_skip); \
192  while (m_line_reader->has_next()) \
193  { \
194  line=m_line_reader->read_line(); \
195  m_parser->set_text(line); \
196  \
197  for (int32_t i=0; i<num_tokens; i++) \
198  { \
199  if (!m_parser->has_next()) \
200  return; \
201  \
202  if (!is_data_transposed) \
203  matrix[i+current_line_idx*num_tokens]=m_parser->read_func(); \
204  else \
205  matrix[current_line_idx+i*num_tokens]=m_parser->read_func(); \
206  } \
207  current_line_idx++; \
208  } \
209  \
210  SG_RESET_LOCALE; \
211  \
212  if (!is_data_transposed) \
213  { \
214  num_feat=num_tokens; \
215  num_vec=num_lines; \
216  } \
217  else \
218  { \
219  num_feat=num_lines; \
220  num_vec=num_tokens; \
221  } \
222 }
223 
224 GET_MATRIX(read_char, int8_t)
225 GET_MATRIX(read_byte, uint8_t)
226 GET_MATRIX(read_char, char)
227 GET_MATRIX(read_int, int32_t)
228 GET_MATRIX(read_uint, uint32_t)
229 GET_MATRIX(read_short_real, float32_t)
230 GET_MATRIX(read_real, float64_t)
231 GET_MATRIX(read_long_real, floatmax_t)
232 GET_MATRIX(read_short, int16_t)
233 GET_MATRIX(read_word, uint16_t)
234 GET_MATRIX(read_long, int64_t)
235 GET_MATRIX(read_ulong, uint64_t)
236 #undef GET_MATRIX
237 
238 #define GET_NDARRAY(read_func, sg_type) \
239 void CCSVFile::get_ndarray(sg_type*& array, int32_t*& dims, int32_t& num_dims) \
240 { \
241  SG_NOTIMPLEMENTED \
242 }
243 
244 GET_NDARRAY(read_byte, uint8_t)
245 GET_NDARRAY(read_char, char)
246 GET_NDARRAY(read_int, int32_t)
247 GET_NDARRAY(read_short_real, float32_t)
248 GET_NDARRAY(read_real, float64_t)
249 GET_NDARRAY(read_short, int16_t)
250 GET_NDARRAY(read_word, uint16_t)
251 #undef GET_NDARRAY
252 
253 #define GET_SPARSE_MATRIX(read_func, sg_type) \
254 void CCSVFile::get_sparse_matrix( \
255  SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
256 { \
257  SG_NOTIMPLEMENTED \
258 }
259 
260 GET_SPARSE_MATRIX(read_char, bool)
261 GET_SPARSE_MATRIX(read_char, int8_t)
262 GET_SPARSE_MATRIX(read_byte, uint8_t)
263 GET_SPARSE_MATRIX(read_char, char)
264 GET_SPARSE_MATRIX(read_int, int32_t)
265 GET_SPARSE_MATRIX(read_uint, uint32_t)
266 GET_SPARSE_MATRIX(read_short_real, float32_t)
267 GET_SPARSE_MATRIX(read_real, float64_t)
268 GET_SPARSE_MATRIX(read_long_real, floatmax_t)
269 GET_SPARSE_MATRIX(read_short, int16_t)
270 GET_SPARSE_MATRIX(read_word, uint16_t)
271 GET_SPARSE_MATRIX(read_long, int64_t)
272 GET_SPARSE_MATRIX(read_ulong, uint64_t)
273 #undef GET_SPARSE_MATRIX
274 
275 #define SET_VECTOR(format, sg_type) \
276 void CCSVFile::set_vector(const sg_type* vector, int32_t len) \
277 { \
278  SG_SET_LOCALE_C; \
279  \
280  if (!is_data_transposed) \
281  { \
282  for (int32_t i=0; i<len; i++) \
283  fprintf(file, "%" format "\n", vector[i]); \
284  } \
285  else \
286  { \
287  int32_t i; \
288  for (i=0; i<len-1; i++) \
289  fprintf(file, "%" format "%c", vector[i], m_delimiter); \
290  fprintf(file, "%" format "\n", vector[i]); \
291  } \
292  \
293  SG_RESET_LOCALE; \
294 }
295 
296 SET_VECTOR(SCNi8, int8_t)
297 SET_VECTOR(SCNu8, uint8_t)
298 SET_VECTOR(SCNu8, char)
299 SET_VECTOR(SCNi32, int32_t)
300 SET_VECTOR(SCNu32, uint32_t)
301 SET_VECTOR(SCNi64, int64_t)
302 SET_VECTOR(SCNu64, uint64_t)
303 SET_VECTOR(".16g", float32_t)
304 SET_VECTOR(".16g", float64_t)
305 SET_VECTOR(".16Lg", floatmax_t)
306 SET_VECTOR(SCNi16, int16_t)
307 SET_VECTOR(SCNu16, uint16_t)
308 #undef SET_VECTOR
309 
310 #define SET_MATRIX(format, sg_type) \
311 void CCSVFile::set_matrix(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
312 { \
313  SG_SET_LOCALE_C; \
314  \
315  if (!is_data_transposed) \
316  { \
317  for (int32_t i=0; i<num_vec; i++) \
318  { \
319  int32_t j; \
320  for (j=0; j<num_feat-1; j++) \
321  fprintf(file, "%" format "%c", matrix[j+i*num_feat], m_delimiter); \
322  fprintf(file, "%" format "\n", matrix[j+i*num_feat]); \
323  } \
324  } \
325  else \
326  { \
327  for (int32_t i=0; i<num_feat; i++) \
328  { \
329  int32_t j; \
330  for (j=0; j<num_vec-1; j++) \
331  fprintf(file, "%" format "%c", matrix[i+j*num_vec], m_delimiter); \
332  fprintf(file, "%" format "\n", matrix[i+j*num_vec]); \
333  } \
334  } \
335  \
336  SG_RESET_LOCALE; \
337 }
338 
339 SET_MATRIX(SCNi8, int8_t)
340 SET_MATRIX(SCNu8, uint8_t)
341 SET_MATRIX(SCNu8, char)
342 SET_MATRIX(SCNi32, int32_t)
343 SET_MATRIX(SCNu32, uint32_t)
344 SET_MATRIX(SCNi64, int64_t)
345 SET_MATRIX(SCNu64, uint64_t)
346 SET_MATRIX(".16g", float32_t)
347 SET_MATRIX(".16g", float64_t)
348 SET_MATRIX(".16Lg", floatmax_t)
349 SET_MATRIX(SCNi16, int16_t)
350 SET_MATRIX(SCNu16, uint16_t)
351 #undef SET_MATRIX
352 
353 #define SET_SPARSE_MATRIX(format, sg_type) \
354 void CCSVFile::set_sparse_matrix( \
355  const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
356 { \
357  SG_NOTIMPLEMENTED \
358 }
359 
360 SET_SPARSE_MATRIX(SCNi8, bool)
361 SET_SPARSE_MATRIX(SCNi8, int8_t)
362 SET_SPARSE_MATRIX(SCNu8, uint8_t)
363 SET_SPARSE_MATRIX(SCNu8, char)
364 SET_SPARSE_MATRIX(SCNi32, int32_t)
365 SET_SPARSE_MATRIX(SCNu32, uint32_t)
366 SET_SPARSE_MATRIX(SCNi64, int64_t)
367 SET_SPARSE_MATRIX(SCNu64, uint64_t)
371 SET_SPARSE_MATRIX(SCNi16, int16_t)
372 SET_SPARSE_MATRIX(SCNu16, uint16_t)
373 #undef SET_SPARSE_MATRIX
374 
376  SGString<char>*& strings, int32_t& num_str,
377  int32_t& max_string_len)
378 {
379  SGVector<char> line;
380  int32_t current_line_idx=0;
381  int32_t num_tokens=0;
382 
383  max_string_len=0;
384  num_str=get_stats(num_tokens);
385  strings=SG_MALLOC(SGString<char>, num_str);
386 
387  skip_lines(m_num_to_skip);
388  while (m_line_reader->has_next())
389  {
390  line=m_line_reader->read_line();
391  strings[current_line_idx].slen=line.vlen;
392  strings[current_line_idx].string=SG_MALLOC(char, line.vlen);
393  for (int32_t i=0; i<line.vlen; i++)
394  strings[current_line_idx].string[i]=line[i];
395 
396  if (line.vlen>max_string_len)
397  max_string_len=line.vlen;
398 
399  current_line_idx++;
400  }
401 
402  num_str=current_line_idx;
403 }
404 
405 #define GET_STRING_LIST(sg_type) \
406 void CCSVFile::get_string_list( \
407  SGString<sg_type>*& strings, int32_t& num_str, \
408  int32_t& max_string_len) \
409 { \
410  SG_NOTIMPLEMENTED \
411 }
412 
413 GET_STRING_LIST(int8_t)
414 GET_STRING_LIST(uint8_t)
415 GET_STRING_LIST(int32_t)
416 GET_STRING_LIST(uint32_t)
417 GET_STRING_LIST(int64_t)
418 GET_STRING_LIST(uint64_t)
422 GET_STRING_LIST(int16_t)
423 GET_STRING_LIST(uint16_t)
424 #undef GET_STRING_LIST
425 
427  const SGString<char>* strings, int32_t num_str)
428 {
429  for (int32_t i=0; i<num_str; i++)
430  {
431  for (int32_t j=0; j<strings[i].slen; j++)
432  fprintf(file, "%c", strings[i].string[j]);
433  fprintf(file, "\n");
434  }
435 }
436 
437 #define SET_STRING_LIST(sg_type) \
438 void CCSVFile::set_string_list( \
439  const SGString<sg_type>* strings, int32_t num_str) \
440 { \
441  SG_NOTIMPLEMENTED \
442 }
443 
444 SET_STRING_LIST(int8_t)
445 SET_STRING_LIST(uint8_t)
446 SET_STRING_LIST(int32_t)
447 SET_STRING_LIST(uint32_t)
448 SET_STRING_LIST(int64_t)
449 SET_STRING_LIST(uint64_t)
453 SET_STRING_LIST(int16_t)
454 SET_STRING_LIST(uint16_t)
455 #undef SET_STRING_LIST
456 
458 {
459  ret.erase();
460  char *last = s.start;
461  for (; s.start != s.end; s.start++)
462  {
463  if (*s.start == delim)
464  {
465  if (s.start != last)
466  {
467  substring temp = {last,s.start};
468  ret.push(temp);
469  }
470  last = s.start+1;
471  }
472  }
473  if (s.start != last)
474  {
475  substring final = {last, s.start};
476  ret.push(final);
477  }
478 }

SHOGUN Machine Learning Toolbox - Documentation