SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StreamingAsciiFile.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Shashwat Lal Das
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
10 
13 
14 #include <ctype.h>
15 
16 using namespace shogun;
17 
19  : CStreamingFile()
20 {
21  SG_UNSTABLE("CStreamingAsciiFile::CStreamingAsciiFile()", "\n");
22 }
23 
24 CStreamingAsciiFile::CStreamingAsciiFile(const char* fname, char rw)
25  : CStreamingFile(fname, rw)
26 {
27 }
28 
30 {
31 }
32 
33 /* Methods for reading dense vectors from an ascii file */
34 
35 #define GET_VECTOR(fname, conv, sg_type) \
36 void CStreamingAsciiFile::get_vector(sg_type*& vector, int32_t& num_feat) \
37 { \
38  char* buffer = NULL; \
39  ssize_t bytes_read; \
40  int32_t old_len = num_feat; \
41  \
42  bytes_read = buf->read_line(buffer); \
43  \
44  if (bytes_read<=0) \
45  { \
46  vector=NULL; \
47  num_feat=-1; \
48  return; \
49  } \
50  \
51  /* determine num_feat, populate dynamic array */ \
52  int32_t nf=0; \
53  num_feat=0; \
54  \
55  char* ptr_item=NULL; \
56  char* ptr_data=buffer; \
57  DynArray<char*>* items=new DynArray<char*>(); \
58  \
59  while (*ptr_data) \
60  { \
61  if ((*ptr_data=='\n') || \
62  (ptr_data - buffer >= bytes_read)) \
63  { \
64  if (ptr_item) \
65  nf++; \
66  \
67  append_item(items, ptr_data, ptr_item); \
68  num_feat=nf; \
69  \
70  nf=0; \
71  ptr_item=NULL; \
72  break; \
73  } \
74  else if (!isblank(*ptr_data) && !ptr_item) \
75  { \
76  ptr_item=ptr_data; \
77  } \
78  else if (isblank(*ptr_data) && ptr_item) \
79  { \
80  append_item(items, ptr_data, ptr_item); \
81  ptr_item=NULL; \
82  nf++; \
83  } \
84  \
85  ptr_data++; \
86  } \
87  \
88  SG_DEBUG("num_feat %d\n", num_feat); \
89  \
90  /* now copy data into vector */ \
91  if (old_len < num_feat) \
92  vector=SG_REALLOC(sg_type, vector, num_feat); \
93  \
94  for (int32_t i=0; i<num_feat; i++) \
95  { \
96  char* item=items->get_element(i); \
97  vector[i]=conv(item); \
98  SG_FREE(item); \
99  } \
100  delete items; \
101 }
102 
103 GET_VECTOR(get_bool_vector, str_to_bool, bool)
104 GET_VECTOR(get_byte_vector, atoi, uint8_t)
105 GET_VECTOR(get_char_vector, atoi, char)
106 GET_VECTOR(get_int_vector, atoi, int32_t)
107 GET_VECTOR(get_short_vector, atoi, int16_t)
108 GET_VECTOR(get_word_vector, atoi, uint16_t)
109 GET_VECTOR(get_int8_vector, atoi, int8_t)
110 GET_VECTOR(get_uint_vector, atoi, uint32_t)
111 GET_VECTOR(get_long_vector, atoi, int64_t)
112 GET_VECTOR(get_ulong_vector, atoi, uint64_t)
113 GET_VECTOR(get_longreal_vector, atoi, floatmax_t)
114 #undef GET_VECTOR
115 
/** Generates CStreamingAsciiFile::get_vector() for a floating point type.
 *
 * The generated method reads one line through buf->read_line(), splits
 * it at ' ' with CAsciiFile::tokenize() into the member words and
 * converts every token with float_of_substring().
 *
 * On return len holds the number of tokens; vector is only reallocated
 * when the incoming len is smaller than that number. When read_line()
 * yields nothing, len is set to -1 and vector is left untouched.
 *
 * A read result below zero is now treated like an empty read, matching
 * the other readers in this file, instead of being used as a length.
 *
 * @param sg_type element type of the vector
 */
#define GET_FLOAT_VECTOR(sg_type) \
	void CStreamingAsciiFile::get_vector(sg_type*& vector, int32_t& len)\
	{ \
		char *line=NULL; \
		const int32_t num_chars = buf->read_line(line); \
		const int32_t old_len = len; \
		\
		/* nothing read (or a negative result): signal via len */ \
		if (num_chars <= 0) \
		{ \
			len = -1; \
			return; \
		} \
		\
		substring example_string = {line, line + num_chars}; \
		\
		CAsciiFile::tokenize(' ', example_string, words); \
		\
		len = words.index(); \
		\
		/* grow the output only if the caller's buffer is too small */ \
		if (len > old_len) \
			vector = SG_REALLOC(sg_type, vector, len); \
		\
		int32_t j=0; \
		for (substring* i = &words[0]; i != words.end; i++) \
			vector[j++] = float_of_substring(*i); \
	}
145 
148 #undef GET_FLOAT_VECTOR
149 
150 /* Methods for reading a dense vector and a label from an ascii file */
151 
152 #define GET_VECTOR_AND_LABEL(fname, conv, sg_type) \
153  void CStreamingAsciiFile::get_vector_and_label(sg_type*& vector, int32_t& num_feat, float64_t& label) \
154  { \
155  char* buffer = NULL; \
156  ssize_t bytes_read; \
157  int32_t old_len = num_feat; \
158  \
159  bytes_read = buf->read_line(buffer); \
160  \
161  if (bytes_read<=0) \
162  { \
163  vector=NULL; \
164  num_feat=-1; \
165  return; \
166  } \
167  \
168  /* determine num_feat, populate dynamic array */ \
169  int32_t nf=0; \
170  num_feat=0; \
171  \
172  char* ptr_item=NULL; \
173  char* ptr_data=buffer; \
174  DynArray<char*>* items=new DynArray<char*>(); \
175  \
176  while (*ptr_data) \
177  { \
178  if ((*ptr_data=='\n') || \
179  (ptr_data - buffer >= bytes_read)) \
180  { \
181  if (ptr_item) \
182  nf++; \
183  \
184  append_item(items, ptr_data, ptr_item); \
185  num_feat=nf; \
186  \
187  nf=0; \
188  ptr_item=NULL; \
189  break; \
190  } \
191  else if (!isblank(*ptr_data) && !ptr_item) \
192  { \
193  ptr_item=ptr_data; \
194  } \
195  else if (isblank(*ptr_data) && ptr_item) \
196  { \
197  append_item(items, ptr_data, ptr_item); \
198  ptr_item=NULL; \
199  nf++; \
200  } \
201  \
202  ptr_data++; \
203  } \
204  \
205  SG_DEBUG("num_feat %d\n", num_feat); \
206  /* The first element is the label */ \
207  label=atof(items->get_element(0)); \
208  /* now copy rest of the data into vector */ \
209  if (old_len < num_feat - 1) \
210  vector=SG_REALLOC(sg_type, vector, num_feat-1); \
211  \
212  for (int32_t i=1; i<num_feat; i++) \
213  { \
214  char* item=items->get_element(i); \
215  vector[i-1]=conv(item); \
216  SG_FREE(item); \
217  } \
218  delete items; \
219  num_feat--; \
220  }
221 
222 GET_VECTOR_AND_LABEL(get_bool_vector_and_label, str_to_bool, bool)
223 GET_VECTOR_AND_LABEL(get_byte_vector_and_label, atoi, uint8_t)
224 GET_VECTOR_AND_LABEL(get_char_vector_and_label, atoi, char)
225 GET_VECTOR_AND_LABEL(get_int_vector_and_label, atoi, int32_t)
226 GET_VECTOR_AND_LABEL(get_short_vector_and_label, atoi, int16_t)
227 GET_VECTOR_AND_LABEL(get_word_vector_and_label, atoi, uint16_t)
228 GET_VECTOR_AND_LABEL(get_int8_vector_and_label, atoi, int8_t)
229 GET_VECTOR_AND_LABEL(get_uint_vector_and_label, atoi, uint32_t)
230 GET_VECTOR_AND_LABEL(get_long_vector_and_label, atoi, int64_t)
231 GET_VECTOR_AND_LABEL(get_ulong_vector_and_label, atoi, uint64_t)
232 GET_VECTOR_AND_LABEL(get_longreal_vector_and_label, atoi, floatmax_t)
233 #undef GET_VECTOR_AND_LABEL
234 
/** Generates CStreamingAsciiFile::get_vector_and_label() for a floating
 * point type.
 *
 * The generated method reads one line through buf->read_line(), splits
 * it at ' ' with CAsciiFile::tokenize() into the member words, stores
 * the first token in label and converts the remaining tokens with
 * float_of_substring() into vector.
 *
 * On return len holds the number of tokens after the label; vector is
 * only reallocated when the incoming len is smaller than that number.
 * When nothing could be read, or the line produced no token at all,
 * len is set to -1 and neither vector nor label is touched.
 *
 * Differences to the previous implementation:
 *  - a read result below zero is treated like an empty read
 *  - a line without tokens no longer reads words[0] and no longer
 *    starts the copy loop one element past words.end
 *
 * @param sg_type element type of the vector
 */
#define GET_FLOAT_VECTOR_AND_LABEL(sg_type) \
	void CStreamingAsciiFile::get_vector_and_label(sg_type*& vector, int32_t& len, float64_t& label) \
	{ \
		char *line=NULL; \
		const int32_t num_chars = buf->read_line(line); \
		const int32_t old_len = len; \
		\
		/* nothing read (or a negative result): signal via len */ \
		if (num_chars <= 0) \
		{ \
			len = -1; \
			return; \
		} \
		\
		substring example_string = {line, line + num_chars}; \
		\
		CAsciiFile::tokenize(' ', example_string, words); \
		\
		/* without any token there is no label to read */ \
		if (words.index() == 0) \
		{ \
			len = -1; \
			return; \
		} \
		\
		label = float_of_substring(words[0]); \
		\
		/* everything after the label is a feature value */ \
		len = words.index() - 1; \
		\
		if (len > old_len) \
			vector = SG_REALLOC(sg_type, vector, len); \
		\
		int32_t j=0; \
		for (substring* i = &words[1]; i != words.end; i++) \
			vector[j++] = float_of_substring(*i); \
	}
266 
269 #undef GET_FLOAT_VECTOR_AND_LABEL
270 
271 /* Methods for reading a string vector from an ascii file (see StringFeatures) */
272 
273 #define GET_STRING(fname, conv, sg_type) \
274 void CStreamingAsciiFile::get_string(sg_type*& vector, int32_t& len) \
275 { \
276  char* buffer = NULL; \
277  ssize_t bytes_read; \
278  \
279  bytes_read = buf->read_line(buffer); \
280  \
281  if (bytes_read<=1) \
282  { \
283  vector=NULL; \
284  len=-1; \
285  return; \
286  } \
287  \
288  SG_DEBUG("Line read from the file:\n%s\n", buffer); \
289  /* Remove the terminating \n */ \
290  if (buffer[bytes_read-1]=='\n') \
291  { \
292  len=bytes_read-1; \
293  buffer[bytes_read-1]='\0'; \
294  } \
295  else \
296  len=bytes_read; \
297  vector=(sg_type *) buffer; \
298 }
299 
300 GET_STRING(get_bool_string, str_to_bool, bool)
301 GET_STRING(get_byte_string, atoi, uint8_t)
302 GET_STRING(get_char_string, atoi, char)
303 GET_STRING(get_int_string, atoi, int32_t)
304 GET_STRING(get_shortreal_string, atof, float32_t)
305 GET_STRING(get_real_string, atof, float64_t)
306 GET_STRING(get_short_string, atoi, int16_t)
307 GET_STRING(get_word_string, atoi, uint16_t)
308 GET_STRING(get_int8_string, atoi, int8_t)
309 GET_STRING(get_uint_string, atoi, uint32_t)
310 GET_STRING(get_long_string, atoi, int64_t)
311 GET_STRING(get_ulong_string, atoi, uint64_t)
312 GET_STRING(get_longreal_string, atoi, floatmax_t)
313 #undef GET_STRING
314 
315 /* Methods for reading a string vector and a label from an ascii file */
316 
317 #define GET_STRING_AND_LABEL(fname, conv, sg_type) \
318 void CStreamingAsciiFile::get_string_and_label(sg_type*& vector, int32_t& len, float64_t& label) \
319 { \
320  char* buffer = NULL; \
321  ssize_t bytes_read; \
322  \
323  bytes_read = buf->read_line(buffer); \
324  \
325  if (bytes_read<=1) \
326  { \
327  vector=NULL; \
328  len=-1; \
329  return; \
330  } \
331  \
332  int32_t str_start_pos=-1; \
333  \
334  for (int32_t i=0; i<bytes_read; i++) \
335  { \
336  if (buffer[i] == ' ') \
337  { \
338  buffer[i]='\0'; \
339  label=atoi(buffer); \
340  buffer[i]=' '; \
341  str_start_pos=i+1; \
342  break; \
343  } \
344  } \
345  /* If no label found, set vector=NULL and length=-1 */ \
346  if (str_start_pos == -1) \
347  { \
348  vector=NULL; \
349  len=-1; \
350  return; \
351  } \
352  /* Remove terminating \n */ \
353  if (buffer[bytes_read-1]=='\n') \
354  { \
355  buffer[bytes_read-1]='\0'; \
356  len=bytes_read-str_start_pos-1; \
357  } \
358  else \
359  len=bytes_read-str_start_pos; \
360  \
361  vector=(sg_type*) &buffer[str_start_pos]; \
362 }
363 
364 GET_STRING_AND_LABEL(get_bool_string_and_label, str_to_bool, bool)
365 GET_STRING_AND_LABEL(get_byte_string_and_label, atoi, uint8_t)
366 GET_STRING_AND_LABEL(get_char_string_and_label, atoi, char)
367 GET_STRING_AND_LABEL(get_int_string_and_label, atoi, int32_t)
368 GET_STRING_AND_LABEL(get_shortreal_string_and_label, atof, float32_t)
369 GET_STRING_AND_LABEL(get_real_string_and_label, atof, float64_t)
370 GET_STRING_AND_LABEL(get_short_string_and_label, atoi, int16_t)
371 GET_STRING_AND_LABEL(get_word_string_and_label, atoi, uint16_t)
372 GET_STRING_AND_LABEL(get_int8_string_and_label, atoi, int8_t)
373 GET_STRING_AND_LABEL(get_uint_string_and_label, atoi, uint32_t)
374 GET_STRING_AND_LABEL(get_long_string_and_label, atoi, int64_t)
375 GET_STRING_AND_LABEL(get_ulong_string_and_label, atoi, uint64_t)
376 GET_STRING_AND_LABEL(get_longreal_string_and_label, atoi, floatmax_t)
377 #undef GET_STRING_AND_LABEL
378 
379 /* Methods for reading a sparse vector from an ascii file */
380 
381 #define GET_SPARSE_VECTOR(fname, conv, sg_type) \
382 void CStreamingAsciiFile::get_sparse_vector(SGSparseVectorEntry<sg_type>*& vector, int32_t& len) \
383 { \
384  char* buffer = NULL; \
385  ssize_t bytes_read; \
386  \
387  bytes_read = buf->read_line(buffer); \
388  \
389  if (bytes_read<=1) \
390  { \
391  vector=NULL; \
392  len=-1; \
393  return; \
394  } \
395  \
396  /* Remove terminating \n */ \
397  int32_t num_chars; \
398  if (buffer[bytes_read-1]=='\n') \
399  { \
400  num_chars=bytes_read-1; \
401  buffer[num_chars]='\0'; \
402  } \
403  else \
404  num_chars=bytes_read; \
405  \
406  int32_t num_dims=0; \
407  for (int32_t i=0; i<num_chars; i++) \
408  { \
409  if (buffer[i]==':') \
410  { \
411  num_dims++; \
412  } \
413  } \
414  \
415  int32_t index_start_pos=-1; \
416  int32_t feature_start_pos; \
417  int32_t current_feat=0; \
418  vector=SG_MALLOC(SGSparseVectorEntry<sg_type>, num_dims); \
419  for (int32_t i=0; i<num_chars; i++) \
420  { \
421  if (buffer[i]==':') \
422  { \
423  buffer[i]='\0'; \
424  vector[current_feat].feat_index=(int32_t) atoi(buffer+index_start_pos)-1; \
425  /* Unset index_start_pos */ \
426  index_start_pos=-1; \
427  \
428  feature_start_pos=i+1; \
429  while ((buffer[i]!=' ') && (i<num_chars)) \
430  { \
431  i++; \
432  } \
433  \
434  buffer[i]='\0'; \
435  vector[current_feat].entry=(sg_type) conv(buffer+feature_start_pos); \
436  \
437  current_feat++; \
438  } \
439  else if (buffer[i]==' ') \
440  i++; \
441  else \
442  { \
443  /* Set index_start_pos if not set already */ \
444  /* if already set, it means the index is */ \
445  /* more than one digit long. */ \
446  if (index_start_pos == -1) \
447  index_start_pos=i; \
448  } \
449  } \
450  \
451  len=current_feat; \
452 }
453 
454 GET_SPARSE_VECTOR(get_bool_sparse_vector, str_to_bool, bool)
455 GET_SPARSE_VECTOR(get_byte_sparse_vector, atoi, uint8_t)
456 GET_SPARSE_VECTOR(get_char_sparse_vector, atoi, char)
457 GET_SPARSE_VECTOR(get_int_sparse_vector, atoi, int32_t)
458 GET_SPARSE_VECTOR(get_shortreal_sparse_vector, atof, float32_t)
459 GET_SPARSE_VECTOR(get_real_sparse_vector, atof, float64_t)
460 GET_SPARSE_VECTOR(get_short_sparse_vector, atoi, int16_t)
461 GET_SPARSE_VECTOR(get_word_sparse_vector, atoi, uint16_t)
462 GET_SPARSE_VECTOR(get_int8_sparse_vector, atoi, int8_t)
463 GET_SPARSE_VECTOR(get_uint_sparse_vector, atoi, uint32_t)
464 GET_SPARSE_VECTOR(get_long_sparse_vector, atoi, int64_t)
465 GET_SPARSE_VECTOR(get_ulong_sparse_vector, atoi, uint64_t)
466 GET_SPARSE_VECTOR(get_longreal_sparse_vector, atoi, floatmax_t)
467 #undef GET_SPARSE_VECTOR
468 
469 /* Methods for reading a sparse vector and a label from an ascii file */
470 
471 #define GET_SPARSE_VECTOR_AND_LABEL(fname, conv, sg_type) \
472 void CStreamingAsciiFile::get_sparse_vector_and_label(SGSparseVectorEntry<sg_type>*& vector, int32_t& len, float64_t& label) \
473 { \
474  char* buffer = NULL; \
475  ssize_t bytes_read; \
476  \
477  bytes_read = buf->read_line(buffer); \
478  \
479  if (bytes_read<=1) \
480  { \
481  vector=NULL; \
482  len=-1; \
483  return; \
484  } \
485  \
486  /* Remove terminating \n */ \
487  int32_t num_chars; \
488  if (buffer[bytes_read-1]=='\n') \
489  { \
490  num_chars=bytes_read-1; \
491  buffer[num_chars]='\0'; \
492  } \
493  else \
494  num_chars=bytes_read; \
495  \
496  int32_t num_dims=0; \
497  for (int32_t i=0; i<num_chars; i++) \
498  { \
499  if (buffer[i]==':') \
500  { \
501  num_dims++; \
502  } \
503  } \
504  \
505  int32_t index_start_pos=-1; \
506  int32_t feature_start_pos; \
507  int32_t current_feat=0; \
508  int32_t label_pos=-1; \
509  vector=SG_MALLOC(SGSparseVectorEntry<sg_type>, num_dims); \
510  \
511  for (int32_t i=1; i<num_chars; i++) \
512  { \
513  if (buffer[i]==':') \
514  { \
515  break; \
516  } \
517  if ( (buffer[i]==' ') && (buffer[i-1]!=' ') ) \
518  { \
519  buffer[i]='\0'; \
520  label_pos=i; \
521  label=atof(buffer); \
522  break; \
523  } \
524  } \
525  \
526  if (label_pos==-1) \
527  SG_ERROR("No label found!\n"); \
528  \
529  buffer+=label_pos+1; \
530  num_chars-=label_pos+1; \
531  for (int32_t i=0; i<num_chars; i++) \
532  { \
533  if (buffer[i]==':') \
534  { \
535  buffer[i]='\0'; \
536  vector[current_feat].feat_index=(int32_t) atoi(buffer+index_start_pos)-1; \
537  /* Unset index_start_pos */ \
538  index_start_pos=-1; \
539  \
540  feature_start_pos=i+1; \
541  while ((buffer[i]!=' ') && (i<num_chars)) \
542  { \
543  i++; \
544  } \
545  \
546  buffer[i]='\0'; \
547  vector[current_feat].entry=(sg_type) conv(buffer+feature_start_pos); \
548  \
549  current_feat++; \
550  } \
551  else if (buffer[i]==' ') \
552  i++; \
553  else \
554  { \
555  /* Set index_start_pos if not set already */ \
556  /* if already set, it means the index is */ \
557  /* more than one digit long. */ \
558  if (index_start_pos == -1) \
559  index_start_pos=i; \
560  } \
561  } \
562  \
563  len=current_feat; \
564 }
565 
566 GET_SPARSE_VECTOR_AND_LABEL(get_bool_sparse_vector_and_label, str_to_bool, bool)
567 GET_SPARSE_VECTOR_AND_LABEL(get_byte_sparse_vector_and_label, atoi, uint8_t)
568 GET_SPARSE_VECTOR_AND_LABEL(get_char_sparse_vector_and_label, atoi, char)
569 GET_SPARSE_VECTOR_AND_LABEL(get_int_sparse_vector_and_label, atoi, int32_t)
570 GET_SPARSE_VECTOR_AND_LABEL(get_shortreal_sparse_vector_and_label, atof, float32_t)
571 GET_SPARSE_VECTOR_AND_LABEL(get_real_sparse_vector_and_label, atof, float64_t)
572 GET_SPARSE_VECTOR_AND_LABEL(get_short_sparse_vector_and_label, atoi, int16_t)
573 GET_SPARSE_VECTOR_AND_LABEL(get_word_sparse_vector_and_label, atoi, uint16_t)
574 GET_SPARSE_VECTOR_AND_LABEL(get_int8_sparse_vector_and_label, atoi, int8_t)
575 GET_SPARSE_VECTOR_AND_LABEL(get_uint_sparse_vector_and_label, atoi, uint32_t)
576 GET_SPARSE_VECTOR_AND_LABEL(get_long_sparse_vector_and_label, atoi, int64_t)
577 GET_SPARSE_VECTOR_AND_LABEL(get_ulong_sparse_vector_and_label, atoi, uint64_t)
578 GET_SPARSE_VECTOR_AND_LABEL(get_longreal_sparse_vector_and_label, atoi, floatmax_t)
579 #undef GET_SPARSE_VECTOR_AND_LABEL
580 
581 template <class T>
582 void CStreamingAsciiFile::append_item(
583  DynArray<T>* items, char* ptr_data, char* ptr_item)
584 {
585  size_t len=(ptr_data-ptr_item)/sizeof(char);
586  char* item=SG_MALLOC(char, len+1);
587  memset(item, 0, sizeof(char)*(len+1));
588  item=strncpy(item, ptr_item, len);
589 
590  SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
591  items->append_element(item);
592 }

SHOGUN Machine Learning Toolbox - Documentation