SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StreamingAsciiFile.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Shashwat Lal Das
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
10 
13 
14 #include <ctype.h>
15 
16 using namespace shogun;
17 
19  : CStreamingFile()
20 {
21  SG_UNSTABLE("CStreamingAsciiFile::CStreamingAsciiFile()", "\n")
22  m_delimiter = ' ';
23 }
24 
25 CStreamingAsciiFile::CStreamingAsciiFile(const char* fname, char rw)
26  : CStreamingFile(fname, rw)
27 {
28  m_delimiter = ' ';
29 }
30 
32 {
33 }
34 
35 /* Methods for reading dense vectors from an ascii file */
36 
37 #define GET_VECTOR(fname, conv, sg_type) \
38 void CStreamingAsciiFile::get_vector(sg_type*& vector, int32_t& num_feat) \
39 { \
40  char* buffer = NULL; \
41  ssize_t bytes_read; \
42  int32_t old_len = num_feat; \
43  \
44  SG_SET_LOCALE_C; \
45  bytes_read = buf->read_line(buffer); \
46  \
47  if (bytes_read<=0) \
48  { \
49  vector=NULL; \
50  num_feat=-1; \
51  SG_RESET_LOCALE; \
52  return; \
53  } \
54  \
55  /* determine num_feat, populate dynamic array */ \
56  int32_t nf=0; \
57  num_feat=0; \
58  \
59  char* ptr_item=NULL; \
60  char* ptr_data=buffer; \
61  DynArray<char*>* items=new DynArray<char*>(); \
62  \
63  while (*ptr_data) \
64  { \
65  if ((*ptr_data=='\n') || \
66  (ptr_data - buffer >= bytes_read)) \
67  { \
68  if (ptr_item) \
69  nf++; \
70  \
71  append_item(items, ptr_data, ptr_item); \
72  num_feat=nf; \
73  \
74  nf=0; \
75  ptr_item=NULL; \
76  break; \
77  } \
78  else if (!isblank(*ptr_data) && !ptr_item) \
79  { \
80  ptr_item=ptr_data; \
81  } \
82  else if (isblank(*ptr_data) && ptr_item) \
83  { \
84  append_item(items, ptr_data, ptr_item); \
85  ptr_item=NULL; \
86  nf++; \
87  } \
88  \
89  ptr_data++; \
90  } \
91  \
92  SG_DEBUG("num_feat %d\n", num_feat) \
93  \
94  /* now copy data into vector */ \
95  if (old_len < num_feat) \
96  vector=SG_REALLOC(sg_type, vector, old_len, num_feat); \
97  \
98  for (int32_t i=0; i<num_feat; i++) \
99  { \
100  char* item=items->get_element(i); \
101  vector[i]=conv(item); \
102  SG_FREE(item); \
103  } \
104  delete items; \
105  SG_RESET_LOCALE; \
106 }
107 
108 GET_VECTOR(get_bool_vector, str_to_bool, bool)
109 GET_VECTOR(get_byte_vector, atoi, uint8_t)
110 GET_VECTOR(get_char_vector, atoi, char)
111 GET_VECTOR(get_int_vector, atoi, int32_t)
112 GET_VECTOR(get_short_vector, atoi, int16_t)
113 GET_VECTOR(get_word_vector, atoi, uint16_t)
114 GET_VECTOR(get_int8_vector, atoi, int8_t)
115 GET_VECTOR(get_uint_vector, atoi, uint32_t)
116 GET_VECTOR(get_long_vector, atoi, int64_t)
117 GET_VECTOR(get_ulong_vector, atoi, uint64_t)
118 GET_VECTOR(get_longreal_vector, atoi, floatmax_t)
119 #undef GET_VECTOR
120 
/*
 * GET_FLOAT_VECTOR(sg_type)
 *
 * Generates CStreamingAsciiFile::get_vector(sg_type*&, int32_t&) for
 * floating point element types. One line is read through buf->read_line(),
 * split with CCSVFile::tokenize() on m_delimiter into the member `words`,
 * and every resulting substring is converted with
 * SGIO::float_of_substring().
 *
 * Contract of the generated function:
 *   vector  grown with SG_REALLOC when the value of len on entry is smaller
 *           than the number of tokens; left untouched when no line is read
 *   len     in: current capacity of vector; out: number of tokens, or -1
 *           when read_line() does not return a positive count
 */
#define GET_FLOAT_VECTOR(sg_type) \
void CStreamingAsciiFile::get_vector(sg_type*& vector, int32_t& len) \
{ \
	char* line=NULL; \
	SG_SET_LOCALE_C; \
	const int32_t num_chars=buf->read_line(line); \
	const int32_t old_len=len; \
	\
	/* a non-positive count must not be used to build the substring */ \
	/* below (its end would not lie behind its start) */ \
	if (num_chars<=0) \
	{ \
		len=-1; \
		SG_RESET_LOCALE; \
		return; \
	} \
	\
	substring example_string={line, line+num_chars}; \
	\
	CCSVFile::tokenize(m_delimiter, example_string, words); \
	\
	len=words.index(); \
	\
	if (len>old_len) \
		vector=SG_REALLOC(sg_type, vector, old_len, len); \
	\
	int32_t j=0; \
	for (substring* i=&words[0]; i!=words.end; i++) \
		vector[j++]=SGIO::float_of_substring(*i); \
	\
	SG_RESET_LOCALE; \
}
153 
156 #undef GET_FLOAT_VECTOR
157 
158 /* Methods for reading a dense vector and a label from an ascii file */
159 
160 #define GET_VECTOR_AND_LABEL(fname, conv, sg_type) \
161  void CStreamingAsciiFile::get_vector_and_label(sg_type*& vector, int32_t& num_feat, float64_t& label) \
162  { \
163  char* buffer = NULL; \
164  ssize_t bytes_read; \
165  int32_t old_len = num_feat; \
166  SG_SET_LOCALE_C; \
167  \
168  bytes_read = buf->read_line(buffer); \
169  \
170  if (bytes_read<=0) \
171  { \
172  vector=NULL; \
173  num_feat=-1; \
174  SG_RESET_LOCALE; \
175  return; \
176  } \
177  \
178  /* determine num_feat, populate dynamic array */ \
179  int32_t nf=0; \
180  num_feat=0; \
181  \
182  char* ptr_item=NULL; \
183  char* ptr_data=buffer; \
184  DynArray<char*>* items=new DynArray<char*>(); \
185  \
186  while (*ptr_data) \
187  { \
188  if ((*ptr_data=='\n') || \
189  (ptr_data - buffer >= bytes_read)) \
190  { \
191  if (ptr_item) \
192  nf++; \
193  \
194  append_item(items, ptr_data, ptr_item); \
195  num_feat=nf; \
196  \
197  nf=0; \
198  ptr_item=NULL; \
199  break; \
200  } \
201  else if (!isblank(*ptr_data) && !ptr_item) \
202  { \
203  ptr_item=ptr_data; \
204  } \
205  else if (isblank(*ptr_data) && ptr_item) \
206  { \
207  append_item(items, ptr_data, ptr_item); \
208  ptr_item=NULL; \
209  nf++; \
210  } \
211  \
212  ptr_data++; \
213  } \
214  \
215  SG_DEBUG("num_feat %d\n", num_feat) \
216  /* The first element is the label */ \
217  label=atof(items->get_element(0)); \
218  /* now copy rest of the data into vector */ \
219  if (old_len < num_feat - 1) \
220  vector=SG_REALLOC(sg_type, vector, old_len, num_feat-1); \
221  \
222  for (int32_t i=1; i<num_feat; i++) \
223  { \
224  char* item=items->get_element(i); \
225  vector[i-1]=conv(item); \
226  SG_FREE(item); \
227  } \
228  delete items; \
229  num_feat--; \
230  SG_RESET_LOCALE; \
231  }
232 
233 GET_VECTOR_AND_LABEL(get_bool_vector_and_label, str_to_bool, bool)
234 GET_VECTOR_AND_LABEL(get_byte_vector_and_label, atoi, uint8_t)
235 GET_VECTOR_AND_LABEL(get_char_vector_and_label, atoi, char)
236 GET_VECTOR_AND_LABEL(get_int_vector_and_label, atoi, int32_t)
237 GET_VECTOR_AND_LABEL(get_short_vector_and_label, atoi, int16_t)
238 GET_VECTOR_AND_LABEL(get_word_vector_and_label, atoi, uint16_t)
239 GET_VECTOR_AND_LABEL(get_int8_vector_and_label, atoi, int8_t)
240 GET_VECTOR_AND_LABEL(get_uint_vector_and_label, atoi, uint32_t)
241 GET_VECTOR_AND_LABEL(get_long_vector_and_label, atoi, int64_t)
242 GET_VECTOR_AND_LABEL(get_ulong_vector_and_label, atoi, uint64_t)
243 GET_VECTOR_AND_LABEL(get_longreal_vector_and_label, atoi, floatmax_t)
244 #undef GET_VECTOR_AND_LABEL
245 
/*
 * GET_FLOAT_VECTOR_AND_LABEL(sg_type)
 *
 * Generates CStreamingAsciiFile::get_vector_and_label(sg_type*&, int32_t&,
 * float64_t&) for floating point element types. One line is read through
 * buf->read_line() and split with CCSVFile::tokenize() on m_delimiter into
 * the member `words`. The first substring becomes the label, all following
 * ones are converted with SGIO::float_of_substring() into vector.
 *
 * Contract of the generated function:
 *   vector  grown with SG_REALLOC when the value of len on entry is smaller
 *           than the number of value tokens; left untouched otherwise
 *   len     in: current capacity of vector; out: number of value tokens
 *           (label excluded), or -1 when no line is read or the line
 *           yields no token
 *   label   written only when the line yields at least one token
 */
#define GET_FLOAT_VECTOR_AND_LABEL(sg_type) \
void CStreamingAsciiFile::get_vector_and_label(sg_type*& vector, int32_t& len, float64_t& label) \
{ \
	char* line=NULL; \
	SG_SET_LOCALE_C; \
	const int32_t num_chars=buf->read_line(line); \
	const int32_t old_len=len; \
	\
	/* a non-positive count must not be used to build the substring */ \
	/* below (its end would not lie behind its start) */ \
	if (num_chars<=0) \
	{ \
		len=-1; \
		SG_RESET_LOCALE; \
		return; \
	} \
	\
	substring example_string={line, line+num_chars}; \
	\
	CCSVFile::tokenize(m_delimiter, example_string, words); \
	\
	if (words.index()==0) \
	{ \
		/* without any token there is no label, and a walk that */ \
		/* starts at &words[1] would never meet words.end */ \
		len=-1; \
		SG_RESET_LOCALE; \
		return; \
	} \
	\
	label=SGIO::float_of_substring(words[0]); \
	\
	len=words.index()-1; \
	\
	if (len>old_len) \
		vector=SG_REALLOC(sg_type, vector, old_len, len); \
	\
	int32_t j=0; \
	for (substring* i=&words[1]; i!=words.end; i++) \
		vector[j++]=SGIO::float_of_substring(*i); \
	\
	SG_RESET_LOCALE; \
}
280 
283 #undef GET_FLOAT_VECTOR_AND_LABEL
284 
285 /* Methods for reading a string vector from an ascii file (see StringFeatures) */
286 
287 #define GET_STRING(fname, conv, sg_type) \
288 void CStreamingAsciiFile::get_string(sg_type*& vector, int32_t& len) \
289 { \
290  char* buffer = NULL; \
291  ssize_t bytes_read; \
292  \
293  SG_SET_LOCALE_C; \
294  bytes_read = buf->read_line(buffer); \
295  \
296  if (bytes_read<=1) \
297  { \
298  vector=NULL; \
299  len=-1; \
300  SG_RESET_LOCALE; \
301  return; \
302  } \
303  \
304  SG_DEBUG("Line read from the file:\n%s\n", buffer) \
305  /* Remove the terminating \n */ \
306  if (buffer[bytes_read-1]=='\n') \
307  { \
308  len=bytes_read-1; \
309  buffer[bytes_read-1]='\0'; \
310  } \
311  else \
312  len=bytes_read; \
313  vector=(sg_type *) buffer; \
314  SG_RESET_LOCALE; \
315 }
316 
317 GET_STRING(get_bool_string, str_to_bool, bool)
318 GET_STRING(get_byte_string, atoi, uint8_t)
319 GET_STRING(get_char_string, atoi, char)
320 GET_STRING(get_int_string, atoi, int32_t)
321 GET_STRING(get_shortreal_string, atof, float32_t)
322 GET_STRING(get_real_string, atof, float64_t)
323 GET_STRING(get_short_string, atoi, int16_t)
324 GET_STRING(get_word_string, atoi, uint16_t)
325 GET_STRING(get_int8_string, atoi, int8_t)
326 GET_STRING(get_uint_string, atoi, uint32_t)
327 GET_STRING(get_long_string, atoi, int64_t)
328 GET_STRING(get_ulong_string, atoi, uint64_t)
329 GET_STRING(get_longreal_string, atoi, floatmax_t)
330 #undef GET_STRING
331 
332 /* Methods for reading a string vector and a label from an ascii file */
333 
334 #define GET_STRING_AND_LABEL(fname, conv, sg_type) \
335 void CStreamingAsciiFile::get_string_and_label(sg_type*& vector, int32_t& len, float64_t& label) \
336 { \
337  char* buffer = NULL; \
338  ssize_t bytes_read; \
339  \
340  SG_SET_LOCALE_C; \
341  bytes_read = buf->read_line(buffer); \
342  \
343  if (bytes_read<=1) \
344  { \
345  vector=NULL; \
346  len=-1; \
347  SG_RESET_LOCALE; \
348  return; \
349  } \
350  \
351  int32_t str_start_pos=-1; \
352  \
353  for (int32_t i=0; i<bytes_read; i++) \
354  { \
355  if (buffer[i] == ' ') \
356  { \
357  buffer[i]='\0'; \
358  label=atoi(buffer); \
359  buffer[i]=' '; \
360  str_start_pos=i+1; \
361  break; \
362  } \
363  } \
364  /* If no label found, set vector=NULL and length=-1 */ \
365  if (str_start_pos == -1) \
366  { \
367  vector=NULL; \
368  len=-1; \
369  return; \
370  } \
371  /* Remove terminating \n */ \
372  if (buffer[bytes_read-1]=='\n') \
373  { \
374  buffer[bytes_read-1]='\0'; \
375  len=bytes_read-str_start_pos-1; \
376  } \
377  else \
378  len=bytes_read-str_start_pos; \
379  \
380  vector=(sg_type*) &buffer[str_start_pos]; \
381  SG_RESET_LOCALE; \
382 }
383 
384 GET_STRING_AND_LABEL(get_bool_string_and_label, str_to_bool, bool)
385 GET_STRING_AND_LABEL(get_byte_string_and_label, atoi, uint8_t)
386 GET_STRING_AND_LABEL(get_char_string_and_label, atoi, char)
387 GET_STRING_AND_LABEL(get_int_string_and_label, atoi, int32_t)
388 GET_STRING_AND_LABEL(get_shortreal_string_and_label, atof, float32_t)
389 GET_STRING_AND_LABEL(get_real_string_and_label, atof, float64_t)
390 GET_STRING_AND_LABEL(get_short_string_and_label, atoi, int16_t)
391 GET_STRING_AND_LABEL(get_word_string_and_label, atoi, uint16_t)
392 GET_STRING_AND_LABEL(get_int8_string_and_label, atoi, int8_t)
393 GET_STRING_AND_LABEL(get_uint_string_and_label, atoi, uint32_t)
394 GET_STRING_AND_LABEL(get_long_string_and_label, atoi, int64_t)
395 GET_STRING_AND_LABEL(get_ulong_string_and_label, atoi, uint64_t)
396 GET_STRING_AND_LABEL(get_longreal_string_and_label, atoi, floatmax_t)
397 #undef GET_STRING_AND_LABEL
398 
399 /* Methods for reading a sparse vector from an ascii file */
400 
401 #define GET_SPARSE_VECTOR(fname, conv, sg_type) \
402 void CStreamingAsciiFile::get_sparse_vector(SGSparseVectorEntry<sg_type>*& vector, int32_t& len) \
403 { \
404  char* buffer = NULL; \
405  ssize_t bytes_read; \
406  SG_SET_LOCALE_C; \
407  \
408  bytes_read = buf->read_line(buffer); \
409  \
410  if (bytes_read<=1) \
411  { \
412  vector=NULL; \
413  len=-1; \
414  SG_RESET_LOCALE; \
415  return; \
416  } \
417  \
418  /* Remove terminating \n */ \
419  int32_t num_chars; \
420  if (buffer[bytes_read-1]=='\n') \
421  { \
422  num_chars=bytes_read-1; \
423  buffer[num_chars]='\0'; \
424  } \
425  else \
426  num_chars=bytes_read; \
427  \
428  int32_t num_dims=0; \
429  for (int32_t i=0; i<num_chars; i++) \
430  { \
431  if (buffer[i]==':') \
432  { \
433  num_dims++; \
434  } \
435  } \
436  \
437  int32_t index_start_pos=-1; \
438  int32_t feature_start_pos; \
439  int32_t current_feat=0; \
440  if (len < num_dims) \
441  vector=SG_REALLOC(SGSparseVectorEntry<sg_type>, vector, len, num_dims); \
442  for (int32_t i=0; i<num_chars; i++) \
443  { \
444  if (buffer[i]==':') \
445  { \
446  buffer[i]='\0'; \
447  vector[current_feat].feat_index=(int32_t) atoi(buffer+index_start_pos)-1; \
448  /* Unset index_start_pos */ \
449  index_start_pos=-1; \
450  \
451  feature_start_pos=i+1; \
452  while ((buffer[i]!=' ') && (i<num_chars)) \
453  { \
454  i++; \
455  } \
456  \
457  buffer[i]='\0'; \
458  vector[current_feat].entry=(sg_type) conv(buffer+feature_start_pos); \
459  \
460  current_feat++; \
461  } \
462  else if (buffer[i]==' ') \
463  i++; \
464  else \
465  { \
466  /* Set index_start_pos if not set already */ \
467  /* if already set, it means the index is */ \
468  /* more than one digit long. */ \
469  if (index_start_pos == -1) \
470  index_start_pos=i; \
471  } \
472  } \
473  \
474  len=current_feat; \
475  SG_RESET_LOCALE; \
476 }
477 
478 GET_SPARSE_VECTOR(get_bool_sparse_vector, str_to_bool, bool)
479 GET_SPARSE_VECTOR(get_byte_sparse_vector, atoi, uint8_t)
480 GET_SPARSE_VECTOR(get_char_sparse_vector, atoi, char)
481 GET_SPARSE_VECTOR(get_int_sparse_vector, atoi, int32_t)
482 GET_SPARSE_VECTOR(get_shortreal_sparse_vector, atof, float32_t)
483 GET_SPARSE_VECTOR(get_real_sparse_vector, atof, float64_t)
484 GET_SPARSE_VECTOR(get_short_sparse_vector, atoi, int16_t)
485 GET_SPARSE_VECTOR(get_word_sparse_vector, atoi, uint16_t)
486 GET_SPARSE_VECTOR(get_int8_sparse_vector, atoi, int8_t)
487 GET_SPARSE_VECTOR(get_uint_sparse_vector, atoi, uint32_t)
488 GET_SPARSE_VECTOR(get_long_sparse_vector, atoi, int64_t)
489 GET_SPARSE_VECTOR(get_ulong_sparse_vector, atoi, uint64_t)
490 GET_SPARSE_VECTOR(get_longreal_sparse_vector, atoi, floatmax_t)
491 #undef GET_SPARSE_VECTOR
492 
493 /* Methods for reading a sparse vector and a label from an ascii file */
494 
495 #define GET_SPARSE_VECTOR_AND_LABEL(fname, conv, sg_type) \
496 void CStreamingAsciiFile::get_sparse_vector_and_label(SGSparseVectorEntry<sg_type>*& vector, int32_t& len, float64_t& label) \
497 { \
498  char* buffer = NULL; \
499  ssize_t bytes_read; \
500  SG_SET_LOCALE_C; \
501  \
502  bytes_read = buf->read_line(buffer); \
503  \
504  if (bytes_read<=1) \
505  { \
506  vector=NULL; \
507  len=-1; \
508  SG_RESET_LOCALE; \
509  return; \
510  } \
511  \
512  /* Remove terminating \n */ \
513  int32_t num_chars; \
514  if (buffer[bytes_read-1]=='\n') \
515  { \
516  num_chars=bytes_read-1; \
517  buffer[num_chars]='\0'; \
518  } \
519  else \
520  num_chars=bytes_read; \
521  \
522  int32_t num_dims=0; \
523  for (int32_t i=0; i<num_chars; i++) \
524  { \
525  if (buffer[i]==':') \
526  { \
527  num_dims++; \
528  } \
529  } \
530  \
531  int32_t index_start_pos=-1; \
532  int32_t feature_start_pos; \
533  int32_t current_feat=0; \
534  int32_t label_pos=-1; \
535  if (len < num_dims) \
536  vector=SG_REALLOC(SGSparseVectorEntry<sg_type>, vector, len, num_dims); \
537  \
538  for (int32_t i=1; i<num_chars; i++) \
539  { \
540  if (buffer[i]==':') \
541  { \
542  break; \
543  } \
544  if ( (buffer[i]==' ') && (buffer[i-1]!=' ') ) \
545  { \
546  buffer[i]='\0'; \
547  label_pos=i; \
548  label=atof(buffer); \
549  break; \
550  } \
551  } \
552  \
553  if (label_pos==-1) \
554  SG_ERROR("No label found!\n") \
555  \
556  buffer+=label_pos+1; \
557  num_chars-=label_pos+1; \
558  for (int32_t i=0; i<num_chars; i++) \
559  { \
560  if (buffer[i]==':') \
561  { \
562  buffer[i]='\0'; \
563  vector[current_feat].feat_index=(int32_t) atoi(buffer+index_start_pos)-1; \
564  /* Unset index_start_pos */ \
565  index_start_pos=-1; \
566  \
567  feature_start_pos=i+1; \
568  while ((buffer[i]!=' ') && (i<num_chars)) \
569  { \
570  i++; \
571  } \
572  \
573  buffer[i]='\0'; \
574  vector[current_feat].entry=(sg_type) conv(buffer+feature_start_pos); \
575  \
576  current_feat++; \
577  } \
578  else if (buffer[i]==' ') \
579  i++; \
580  else \
581  { \
582  /* Set index_start_pos if not set already */ \
583  /* if already set, it means the index is */ \
584  /* more than one digit long. */ \
585  if (index_start_pos == -1) \
586  index_start_pos=i; \
587  } \
588  } \
589  \
590  len=current_feat; \
591  SG_RESET_LOCALE; \
592 }
593 
594 GET_SPARSE_VECTOR_AND_LABEL(get_bool_sparse_vector_and_label, str_to_bool, bool)
595 GET_SPARSE_VECTOR_AND_LABEL(get_byte_sparse_vector_and_label, atoi, uint8_t)
596 GET_SPARSE_VECTOR_AND_LABEL(get_char_sparse_vector_and_label, atoi, char)
597 GET_SPARSE_VECTOR_AND_LABEL(get_int_sparse_vector_and_label, atoi, int32_t)
598 GET_SPARSE_VECTOR_AND_LABEL(get_shortreal_sparse_vector_and_label, atof, float32_t)
599 GET_SPARSE_VECTOR_AND_LABEL(get_real_sparse_vector_and_label, atof, float64_t)
600 GET_SPARSE_VECTOR_AND_LABEL(get_short_sparse_vector_and_label, atoi, int16_t)
601 GET_SPARSE_VECTOR_AND_LABEL(get_word_sparse_vector_and_label, atoi, uint16_t)
602 GET_SPARSE_VECTOR_AND_LABEL(get_int8_sparse_vector_and_label, atoi, int8_t)
603 GET_SPARSE_VECTOR_AND_LABEL(get_uint_sparse_vector_and_label, atoi, uint32_t)
604 GET_SPARSE_VECTOR_AND_LABEL(get_long_sparse_vector_and_label, atoi, int64_t)
605 GET_SPARSE_VECTOR_AND_LABEL(get_ulong_sparse_vector_and_label, atoi, uint64_t)
606 GET_SPARSE_VECTOR_AND_LABEL(get_longreal_sparse_vector_and_label, atoi, floatmax_t)
607 #undef GET_SPARSE_VECTOR_AND_LABEL
608 
609 template <class T>
610 void CStreamingAsciiFile::append_item(
611  DynArray<T>* items, char* ptr_data, char* ptr_item)
612 {
613  size_t len=(ptr_data-ptr_item)/sizeof(char);
614  char* item=SG_MALLOC(char, len+1);
615  memset(item, 0, sizeof(char)*(len+1));
616  item=strncpy(item, ptr_item, len);
617 
618  SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item)
619  items->append_element(item);
620 }
621 
623 {
624  m_delimiter = delimiter;
625 }

SHOGUN Machine Learning Toolbox - Documentation