SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
AsciiFile.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Parts of this code are copyright (c) 2009 Yahoo! Inc.
8  * All rights reserved. The copyrights embodied in the content of
9  * this file are licensed under the BSD (revised) open source license.
10  *
11  * Written (W) 2010 Soeren Sonnenburg
12  * Copyright (C) 2010 Berlin Institute of Technology
13  */
14 
16 #include <shogun/io/File.h>
17 #include <shogun/io/AsciiFile.h>
19 #include <ctype.h>
20 #include <stdio.h>
21 
22 using namespace shogun;
23 
25 {
26  SG_UNSTABLE("CAsciiFile::CAsciiFile()", "\n");
27 }
28 
29 CAsciiFile::CAsciiFile(FILE* f, const char* name) : CFile(f, name)
30 {
31 }
32 
33 CAsciiFile::CAsciiFile(const char* fname, char rw, const char* name) : CFile(fname, rw, name)
34 {
35 }
36 
38 {
39 }
40 
41 #define GET_VECTOR(fname, mfname, sg_type) \
42 void CAsciiFile::fname(sg_type*& vec, int32_t& len) \
43 { \
44  vec=NULL; \
45  len=0; \
46  int32_t num_feat=0; \
47  int32_t num_vec=0; \
48  mfname(vec, num_feat, num_vec); \
49  if ((num_feat==1) || (num_vec==1)) \
50  { \
51  if (num_feat==1) \
52  len=num_vec; \
53  else \
54  len=num_feat; \
55  } \
56  else \
57  { \
58  SG_FREE(vec); \
59  vec=NULL; \
60  len=0; \
61  SG_ERROR("Could not read vector from" \
62  " file %s (shape %dx%d found but " \
63  "vector expected).\n", filename, \
64  num_vec, num_feat); \
65  } \
66 }
67 
68 GET_VECTOR(get_vector, get_matrix, uint8_t)
69 GET_VECTOR(get_vector, get_matrix, char)
70 GET_VECTOR(get_vector, get_matrix, int32_t)
71 GET_VECTOR(get_vector, get_matrix, float32_t)
72 GET_VECTOR(get_vector, get_matrix, float64_t)
73 GET_VECTOR(get_vector, get_matrix, int16_t)
74 GET_VECTOR(get_vector, get_matrix, uint16_t)
75 #undef GET_VECTOR
76 
77 #define GET_MATRIX(fname, conv, sg_type) \
78 void CAsciiFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
79 { \
80  struct stat stats; \
81  if (stat(filename, &stats)!=0) \
82  SG_ERROR("Could not get file statistics.\n"); \
83  \
84  char* data=SG_MALLOC(char, stats.st_size+1); \
85  memset(data, 0, sizeof(char)*(stats.st_size+1)); \
86  size_t nread=fread(data, sizeof(char), stats.st_size, file); \
87  if (nread<=0) \
88  SG_ERROR("Could not read data from %s.\n", filename); \
89  \
90  SG_DEBUG("data read from file:\n%s\n", data); \
91  \
92  /* determine num_feat and num_vec, populate dynamic array */ \
93  int32_t nf=0; \
94  num_feat=0; \
95  num_vec=0; \
96  char* ptr_item=NULL; \
97  char* ptr_data=data; \
98  DynArray<char*>* items=new DynArray<char*>(); \
99  \
100  while (*ptr_data) \
101  { \
102  if (*ptr_data=='\n') \
103  { \
104  if (ptr_item) \
105  nf++; \
106  \
107  if (num_feat!=0 && nf!=num_feat) \
108  SG_ERROR("Number of features mismatches (%d != %d) in vector" \
109  " %d in file %s.\n", num_feat, nf, num_vec, filename); \
110  \
111  append_item(items, ptr_data, ptr_item); \
112  num_feat=nf; \
113  num_vec++; \
114  nf=0; \
115  ptr_item=NULL; \
116  } \
117  else if (!isblank(*ptr_data) && !ptr_item) \
118  { \
119  ptr_item=ptr_data; \
120  } \
121  else if (isblank(*ptr_data) && ptr_item) \
122  { \
123  append_item(items, ptr_data, ptr_item); \
124  ptr_item=NULL; \
125  nf++; \
126  } \
127  \
128  ptr_data++; \
129  } \
130  \
131  SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec); \
132  SG_FREE(data); \
133  \
134  /* now copy data into matrix */ \
135  matrix=SG_MALLOC(sg_type, num_vec*num_feat); \
136  for (int32_t i=0; i<num_vec; i++) \
137  { \
138  for (int32_t j=0; j<num_feat; j++) \
139  { \
140  char* item=items->get_element(i*num_feat+j); \
141  matrix[i*num_feat+j]=conv(item); \
142  SG_FREE(item); \
143  } \
144  } \
145  delete items; \
146 }
147 
148 GET_MATRIX(get_matrix, atoi, uint8_t)
149 GET_MATRIX(get_int8_matrix, atoi, int8_t)
150 GET_MATRIX(get_matrix, atoi, char)
151 GET_MATRIX(get_matrix, atoi, int32_t)
152 GET_MATRIX(get_uint_matrix, atoi, uint32_t)
153 GET_MATRIX(get_long_matrix, atoll, int64_t)
154 GET_MATRIX(get_ulong_matrix, atoll, uint64_t)
155 GET_MATRIX(get_matrix, atof, float32_t)
156 GET_MATRIX(get_matrix, atof, float64_t)
157 GET_MATRIX(get_longreal_matrix, atof, floatmax_t)
158 GET_MATRIX(get_matrix, atoi, int16_t)
159 GET_MATRIX(get_matrix, atoi, uint16_t)
160 #undef GET_MATRIX
161 
162 #define GET_NDARRAY(fname, conv, sg_type) \
163 void CAsciiFile::fname(sg_type*& array, int32_t *& dims, int32_t & num_dims) \
164 { \
165  struct stat stats; \
166  if (stat(filename, &stats)!=0) \
167  SG_ERROR("Could not get file statistics.\n"); \
168  \
169  char* data=SG_MALLOC(char, stats.st_size+1); \
170  memset(data, 0, sizeof(char)*(stats.st_size+1)); \
171  size_t nread=fread(data, sizeof(char), stats.st_size, file); \
172  if (nread<=0) \
173  SG_ERROR("Could not read data from %s.\n", filename); \
174  \
175  SG_DEBUG("data read from file:\n%s\n", data); \
176  \
177  /* determine size of array */ \
178  int32_t length=0; \
179  int32_t counter=0; \
180  size_t total=0; \
181  num_dims = -1; \
182  char* ptr_item=NULL; \
183  char* ptr_data=data; \
184  DynArray<char*>* items=new DynArray<char*>(); \
185  \
186  /* read line with sizes of array*/ \
187  while(*ptr_data != '\n') \
188  { \
189  if(isblank(*ptr_data) && ptr_item) \
190  { \
191  append_item(items, ptr_data, ptr_item); \
192  num_dims++; \
193  ptr_item = NULL; \
194  } \
195  else if(!isblank(*ptr_data) && !ptr_item) \
196  ptr_item = ptr_data; \
197  \
198  ptr_data++; \
199  } \
200  ptr_item = NULL; \
201  ptr_data++; \
202  \
203  /* read array data*/ \
204  while(*ptr_data) \
205  { \
206  if (*ptr_data=='\n') \
207  { \
208  if (ptr_item) \
209  counter++; \
210  \
211  if (length!=0 && counter!=length) \
212  SG_ERROR("Invalid number of data (%d != %d) in line" \
213  " %d in file %s.\n", length, counter, total, filename); \
214  \
215  append_item(items, ptr_data, ptr_item); \
216  length=counter; \
217  total++; \
218  counter=0; \
219  ptr_item=NULL; \
220  } \
221  else if (!isblank(*ptr_data) && !ptr_item) \
222  { \
223  ptr_item=ptr_data; \
224  } \
225  else if (isblank(*ptr_data) && ptr_item) \
226  { \
227  append_item(items, ptr_data, ptr_item); \
228  ptr_item=NULL; \
229  counter++; \
230  } \
231  \
232  ptr_data++; \
233  } \
234  \
235  SG_DEBUG("num of data in line: %d, num of lines %d\n", counter, total); \
236  SG_FREE(data); \
237  \
238  /* determining sizes of dimensions*/ \
239  char * item; \
240  item=items->get_element(0); \
241  if(atoi(item) != num_dims) \
242  SG_ERROR("Invalid number of dimensions!\n"); \
243  SG_FREE(item); \
244  dims = SG_MALLOC(int32_t, num_dims); \
245  for(int32_t i =0;i < num_dims;i++) \
246  { \
247  item = items->get_element(i+1); \
248  dims[i] = atoi(item); \
249  SG_FREE(item); \
250  } \
251  if (dims[num_dims-1] != length) \
252  SG_ERROR("Invalid number of lines in file!\n"); \
253  \
254  /* converting array data */ \
255  total *= length; \
256  array=SG_MALLOC(sg_type, total); \
257  for (size_t i=0; i<total; i++) \
258  { \
259  item=items->get_element(i+(num_dims+1)); \
260  array[i]=conv(item); \
261  SG_FREE(item); \
262  } \
263  delete items; \
264 }
265 
266 GET_NDARRAY(get_ndarray, atoi, uint8_t)
267 GET_NDARRAY(get_int8_ndarray, atoi, int8_t)
268 GET_NDARRAY(get_ndarray, atoi, char)
269 GET_NDARRAY(get_ndarray, atoi, int32_t)
270 GET_NDARRAY(get_uint_ndarray, atoi, uint32_t)
271 GET_NDARRAY(get_long_ndarray, atoll, int64_t)
272 GET_NDARRAY(get_ulong_ndarray, atoll, uint64_t)
273 GET_NDARRAY(get_ndarray, atof, float32_t)
274 GET_NDARRAY(get_ndarray, atof, float64_t)
275 GET_NDARRAY(get_longreal_ndarray, atof, floatmax_t)
276 GET_NDARRAY(get_ndarray, atoi, int16_t)
277 GET_NDARRAY(get_ndarray, atoi, uint16_t)
278 #undef GET_NDARRAY
279 
/* GET_SPARSEMATRIX(fname, conv, sg_type)
 *
 * Generates CAsciiFile::fname(matrix, num_feat, num_vec), which reads a
 * sparse matrix from a text file with one vector per line. Each line may
 * start with a label followed by a space; the remainder consists of
 * "index:value" pairs separated by spaces. Nothing is read when file is NULL.
 *
 * The file is traversed twice:
 *   1. Counting pass: the file is read in 1 MB blocks. Every '\n', and the
 *      last byte of a block shorter than the block size, ends a line and
 *      increments num_vec. required_blocksize is raised to the longest line
 *      plus one.
 *   2. Loading pass: the buffer is reallocated with required_blocksize+1
 *      bytes and matrix is allocated with num_vec placement-constructed
 *      SGSparseVector objects. When a block ends in the middle of a line, the
 *      partial line is moved to the front of the buffer and the rest of the
 *      buffer is refilled from the file before parsing continues.
 *
 * Per line, the number of ':' characters gives the number of entries (dims);
 * SG_ERROR is raised when a line contains none. Everything up to the first
 * space is skipped as label, unless a ':' is met first, in which case the
 * line is treated as having no label (j is set to -1 so that the following
 * j++ wraps back to 0). The text in front of each ':' is converted with
 * atoi and stored minus one as feat_index; the text behind it is converted
 * with conv and stored as entry. num_feat ends up as the largest index found
 * in the file.
 *
 * NOTE(review): for a last line without trailing '\n' the line length is
 * computed as i-old_sz, which excludes the final byte, and that byte is then
 * overwritten with '\0' - the last value looks like it loses its final
 * character. Confirm with a file that lacks the trailing newline.
 * NOTE(review): in the counting pass a final unterminated line is only
 * counted when the last block is shorter than the block size - confirm the
 * behaviour for files whose size is an exact multiple of 1 MB.
 * NOTE(review): len is a size_t but is passed for %d and %.*s in the error
 * message.
 */
280 #define GET_SPARSEMATRIX(fname, conv, sg_type) \
281 void CAsciiFile::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
282 { \
283  size_t blocksize=1024*1024; \
284  size_t required_blocksize=blocksize; \
285  uint8_t* dummy=SG_MALLOC(uint8_t, blocksize); \
286  \
287  if (file) \
288  { \
289  num_vec=0; \
290  num_feat=0; \
291  \
292  SG_INFO("counting line numbers in file %s\n", filename); \
293  size_t sz=blocksize; \
294  size_t block_offs=0; \
295  size_t old_block_offs=0; \
296  fseek(file, 0, SEEK_END); \
297  size_t fsize=ftell(file); \
298  rewind(file); \
299  \
300  while (sz == blocksize) \
301  { \
302  sz=fread(dummy, sizeof(uint8_t), blocksize, file); \
303  for (size_t i=0; i<sz; i++) \
304  { \
305  block_offs++; \
306  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) \
307  { \
308  num_vec++; \
309  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); \
310  old_block_offs=block_offs; \
311  } \
312  } \
313  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); \
314  } \
315  \
316  SG_INFO("found %d feature vectors\n", num_vec); \
317  SG_FREE(dummy); \
318  blocksize=required_blocksize; \
319  dummy = SG_MALLOC(uint8_t, blocksize+1); /*allow setting of '\0' at EOL*/ \
320  matrix=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \
321  for (int i=0; i<num_vec; i++) \
322  new (&matrix[i]) SGSparseVector<sg_type>(); \
323  rewind(file); \
324  sz=blocksize; \
325  int32_t lines=0; \
326  while (sz == blocksize) \
327  { \
328  sz=fread(dummy, sizeof(uint8_t), blocksize, file); \
329  \
330  size_t old_sz=0; \
331  for (size_t i=0; i<sz; i++) \
332  { \
333  if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) \
334  { \
335  size_t len=i-old_sz+1; \
336  uint8_t* data=&dummy[old_sz]; \
337  \
338  for (size_t j=0; j<len; j++) \
339  dummy[j]=data[j]; \
340  \
341  sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file); \
342  i=0; \
343  old_sz=0; \
344  sz+=len; \
345  } \
346  \
347  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) \
348  { \
349  \
350  size_t len=i-old_sz; \
351  uint8_t* data=&dummy[old_sz]; \
352  \
353  int32_t dims=0; \
354  for (size_t j=0; j<len; j++) \
355  { \
356  if (data[j]==':') \
357  dims++; \
358  } \
359  \
360  if (dims<=0) \
361  { \
362  SG_ERROR("Error in line %d - number of" \
363  " dimensions is %d line is %d characters" \
364  " long\n line_content:'%.*s'\n", lines, \
365  dims, len, len, (const char*) data); \
366  } \
367  \
368  SGSparseVectorEntry<sg_type>* feat=SG_MALLOC(SGSparseVectorEntry<sg_type>, dims); \
369  \
370  /* skip label part */ \
371  size_t j=0; \
372  for (; j<len; j++) \
373  { \
374  if (data[j]==':') \
375  { \
376  j=-1; /* file without label*/ \
377  break; \
378  } \
379  \
380  if (data[j]==' ') \
381  { \
382  data[j]='\0'; \
383  \
384  /* skip label part */ \
385  break; \
386  } \
387  } \
388  \
389  int32_t d=0; \
390  j++; \
391  uint8_t* start=&data[j]; \
392  for (; j<len; j++) \
393  { \
394  if (data[j]==':') \
395  { \
396  data[j]='\0'; \
397  \
398  feat[d].feat_index=(int32_t) atoi((const char*) start)-1; \
399  num_feat=CMath::max(num_feat, feat[d].feat_index+1); \
400  \
401  j++; \
402  start=&data[j]; \
403  for (; j<len; j++) \
404  { \
405  if (data[j]==' ' || data[j]=='\n') \
406  { \
407  data[j]='\0'; \
408  feat[d].entry=(sg_type) conv((const char*) start); \
409  d++; \
410  break; \
411  } \
412  } \
413  \
414  if (j==len) \
415  { \
416  data[j]='\0'; \
417  feat[dims-1].entry=(sg_type) conv((const char*) start); \
418  } \
419  \
420  j++; \
421  start=&data[j]; \
422  } \
423  } \
424  \
425  matrix[lines].num_feat_entries=dims; \
426  matrix[lines].features=feat; \
427  \
428  old_sz=i+1; \
429  lines++; \
430  SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t"); \
431  } \
432  } \
433  } \
434  \
435  SG_INFO("file successfully read\n"); \
436  } \
437  \
438  SG_FREE(dummy); \
439 }
440 
441 GET_SPARSEMATRIX(get_sparse_matrix, atoi, bool)
442 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint8_t)
443 GET_SPARSEMATRIX(get_int8_sparsematrix, atoi, int8_t)
444 GET_SPARSEMATRIX(get_sparse_matrix, atoi, char)
445 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int32_t)
446 GET_SPARSEMATRIX(get_uint_sparsematrix, atoi, uint32_t)
447 GET_SPARSEMATRIX(get_long_sparsematrix, atoll, int64_t)
448 GET_SPARSEMATRIX(get_ulong_sparsematrix, atoll, uint64_t)
449 GET_SPARSEMATRIX(get_sparse_matrix, atof, float32_t)
450 GET_SPARSEMATRIX(get_sparse_matrix, atof, float64_t)
451 GET_SPARSEMATRIX(get_longreal_sparsematrix, atof, floatmax_t)
452 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int16_t)
453 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint16_t)
454 #undef GET_SPARSEMATRIX
455 
456 
457 void CAsciiFile::get_string_list(SGString<uint8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
458 {
459  size_t blocksize=1024*1024;
460  size_t required_blocksize=0;
461  uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
462  uint8_t* overflow=NULL;
463  int32_t overflow_len=0;
464 
465  if (file)
466  {
467  num_str=0;
468  max_string_len=0;
469 
470  SG_INFO("counting line numbers in file %s\n", filename);
471  size_t sz=blocksize;
472  size_t block_offs=0;
473  size_t old_block_offs=0;
474  fseek(file, 0, SEEK_END);
475  size_t fsize=ftell(file);
476  rewind(file);
477 
478  while (sz == blocksize)
479  {
480  sz=fread(dummy, sizeof(uint8_t), blocksize, file);
481  for (size_t i=0; i<sz; i++)
482  {
483  block_offs++;
484  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
485  {
486  num_str++;
487  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
488  old_block_offs=block_offs;
489  }
490  }
491  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
492  }
493 
494  SG_INFO("found %d strings\n", num_str);
495  SG_DEBUG("block_size=%d\n", required_blocksize);
496  SG_FREE(dummy);
497  blocksize=required_blocksize;
498  dummy=SG_MALLOC(uint8_t, blocksize);
499  overflow=SG_MALLOC(uint8_t, blocksize);
500  strings=SG_MALLOC(SGString<uint8_t>, num_str);
501 
502  rewind(file);
503  sz=blocksize;
504  int32_t lines=0;
505  size_t old_sz=0;
506  while (sz == blocksize)
507  {
508  sz=fread(dummy, sizeof(uint8_t), blocksize, file);
509 
510  old_sz=0;
511  for (size_t i=0; i<sz; i++)
512  {
513  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
514  {
515  int32_t len=i-old_sz;
516  max_string_len=CMath::max(max_string_len, len+overflow_len);
517 
518  strings[lines].slen=len+overflow_len;
519  strings[lines].string=SG_MALLOC(uint8_t, len+overflow_len);
520 
521  for (int32_t j=0; j<overflow_len; j++)
522  strings[lines].string[j]=overflow[j];
523  for (int32_t j=0; j<len; j++)
524  strings[lines].string[j+overflow_len]=dummy[old_sz+j];
525 
526  // clear overflow
527  overflow_len=0;
528 
529  //CMath::display_vector(strings[lines].string, len);
530  old_sz=i+1;
531  lines++;
532  SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
533  }
534  }
535 
536  for (size_t i=old_sz; i<sz; i++)
537  overflow[i-old_sz]=dummy[i];
538 
539  overflow_len=sz-old_sz;
540  }
541  SG_INFO("file successfully read\n");
542  SG_INFO("max_string_length=%d\n", max_string_len);
543  SG_INFO("num_strings=%d\n", num_str);
544  }
545 
546  SG_FREE(dummy);
547  SG_FREE(overflow);
548 }
549 
550 void CAsciiFile::get_int8_string_list(SGString<int8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
551 {
552  size_t blocksize=1024*1024;
553  size_t required_blocksize=0;
554  int8_t* dummy=SG_MALLOC(int8_t, blocksize);
555  int8_t* overflow=NULL;
556  int32_t overflow_len=0;
557 
558  if (file)
559  {
560  num_str=0;
561  max_string_len=0;
562 
563  SG_INFO("counting line numbers in file %s\n", filename);
564  size_t sz=blocksize;
565  size_t block_offs=0;
566  size_t old_block_offs=0;
567  fseek(file, 0, SEEK_END);
568  size_t fsize=ftell(file);
569  rewind(file);
570 
571  while (sz == blocksize)
572  {
573  sz=fread(dummy, sizeof(int8_t), blocksize, file);
574  for (size_t i=0; i<sz; i++)
575  {
576  block_offs++;
577  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
578  {
579  num_str++;
580  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
581  old_block_offs=block_offs;
582  }
583  }
584  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
585  }
586 
587  SG_INFO("found %d strings\n", num_str);
588  SG_DEBUG("block_size=%d\n", required_blocksize);
589  SG_FREE(dummy);
590  blocksize=required_blocksize;
591  dummy=SG_MALLOC(int8_t, blocksize);
592  overflow=SG_MALLOC(int8_t, blocksize);
593  strings=SG_MALLOC(SGString<int8_t>, num_str);
594 
595  rewind(file);
596  sz=blocksize;
597  int32_t lines=0;
598  size_t old_sz=0;
599  while (sz == blocksize)
600  {
601  sz=fread(dummy, sizeof(int8_t), blocksize, file);
602 
603  old_sz=0;
604  for (size_t i=0; i<sz; i++)
605  {
606  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
607  {
608  int32_t len=i-old_sz;
609  max_string_len=CMath::max(max_string_len, len+overflow_len);
610 
611  strings[lines].slen=len+overflow_len;
612  strings[lines].string=SG_MALLOC(int8_t, len+overflow_len);
613 
614  for (int32_t j=0; j<overflow_len; j++)
615  strings[lines].string[j]=overflow[j];
616  for (int32_t j=0; j<len; j++)
617  strings[lines].string[j+overflow_len]=dummy[old_sz+j];
618 
619  // clear overflow
620  overflow_len=0;
621 
622  //CMath::display_vector(strings[lines].string, len);
623  old_sz=i+1;
624  lines++;
625  SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
626  }
627  }
628 
629  for (size_t i=old_sz; i<sz; i++)
630  overflow[i-old_sz]=dummy[i];
631 
632  overflow_len=sz-old_sz;
633  }
634  SG_INFO("file successfully read\n");
635  SG_INFO("max_string_length=%d\n", max_string_len);
636  SG_INFO("num_strings=%d\n", num_str);
637  }
638 
639  SG_FREE(dummy);
640  SG_FREE(overflow);
641 }
642 
643 void CAsciiFile::get_string_list(SGString<char>*& strings, int32_t& num_str, int32_t& max_string_len)
644 {
645  size_t blocksize=1024*1024;
646  size_t required_blocksize=0;
647  char* dummy=SG_MALLOC(char, blocksize);
648  char* overflow=NULL;
649  int32_t overflow_len=0;
650 
651  if (file)
652  {
653  num_str=0;
654  max_string_len=0;
655 
656  SG_INFO("counting line numbers in file %s\n", filename);
657  size_t sz=blocksize;
658  size_t block_offs=0;
659  size_t old_block_offs=0;
660  fseek(file, 0, SEEK_END);
661  size_t fsize=ftell(file);
662  rewind(file);
663 
664  while (sz == blocksize)
665  {
666  sz=fread(dummy, sizeof(char), blocksize, file);
667  for (size_t i=0; i<sz; i++)
668  {
669  block_offs++;
670  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
671  {
672  num_str++;
673  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
674  old_block_offs=block_offs;
675  }
676  }
677  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
678  }
679 
680  SG_INFO("found %d strings\n", num_str);
681  SG_DEBUG("block_size=%d\n", required_blocksize);
682  SG_FREE(dummy);
683  blocksize=required_blocksize;
684  dummy=SG_MALLOC(char, blocksize);
685  overflow=SG_MALLOC(char, blocksize);
686  strings=SG_MALLOC(SGString<char>, num_str);
687 
688  rewind(file);
689  sz=blocksize;
690  int32_t lines=0;
691  size_t old_sz=0;
692  while (sz == blocksize)
693  {
694  sz=fread(dummy, sizeof(char), blocksize, file);
695 
696  old_sz=0;
697  for (size_t i=0; i<sz; i++)
698  {
699  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
700  {
701  int32_t len=i-old_sz;
702  max_string_len=CMath::max(max_string_len, len+overflow_len);
703 
704  strings[lines].slen=len+overflow_len;
705  strings[lines].string=SG_MALLOC(char, len+overflow_len);
706 
707  for (int32_t j=0; j<overflow_len; j++)
708  strings[lines].string[j]=overflow[j];
709  for (int32_t j=0; j<len; j++)
710  strings[lines].string[j+overflow_len]=dummy[old_sz+j];
711 
712  // clear overflow
713  overflow_len=0;
714 
715  //CMath::display_vector(strings[lines].string, len);
716  old_sz=i+1;
717  lines++;
718  SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
719  }
720  }
721 
722  for (size_t i=old_sz; i<sz; i++)
723  overflow[i-old_sz]=dummy[i];
724 
725  overflow_len=sz-old_sz;
726  }
727  SG_INFO("file successfully read\n");
728  SG_INFO("max_string_length=%d\n", max_string_len);
729  SG_INFO("num_strings=%d\n", num_str);
730  }
731 
732  SG_FREE(dummy);
733  SG_FREE(overflow);
734 }
735 
736 void CAsciiFile::get_string_list(SGString<int32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
737 {
738  strings=NULL;
739  num_str=0;
740  max_string_len=0;
741 }
742 
743 void CAsciiFile::get_uint_string_list(SGString<uint32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
744 {
745  strings=NULL;
746  num_str=0;
747  max_string_len=0;
748 }
749 
750 void CAsciiFile::get_string_list(SGString<int16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
751 {
752  strings=NULL;
753  num_str=0;
754  max_string_len=0;
755 }
756 
757 void CAsciiFile::get_string_list(SGString<uint16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
758 {
759  strings=NULL;
760  num_str=0;
761  max_string_len=0;
762 }
763 
764 void CAsciiFile::get_long_string_list(SGString<int64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
765 {
766  strings=NULL;
767  num_str=0;
768  max_string_len=0;
769 }
770 
771 void CAsciiFile::get_ulong_string_list(SGString<uint64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
772 {
773  strings=NULL;
774  num_str=0;
775  max_string_len=0;
776 }
777 
778 void CAsciiFile::get_string_list(SGString<float32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
779 {
780  strings=NULL;
781  num_str=0;
782  max_string_len=0;
783 }
784 
785 void CAsciiFile::get_string_list(SGString<float64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
786 {
787  strings=NULL;
788  num_str=0;
789  max_string_len=0;
790 }
791 
792 void CAsciiFile::get_longreal_string_list(SGString<floatmax_t>*& strings, int32_t& num_str, int32_t& max_string_len)
793 {
794  strings=NULL;
795  num_str=0;
796  max_string_len=0;
797 }
798 
799 
802 #define SET_VECTOR(fname, mfname, sg_type) \
803 void CAsciiFile::fname(const sg_type* vec, int32_t len) \
804 { \
805  mfname(vec, len, 1); \
806 }
807 SET_VECTOR(set_vector, set_matrix, uint8_t)
808 SET_VECTOR(set_vector, set_matrix, char)
809 SET_VECTOR(set_vector, set_matrix, int32_t)
810 SET_VECTOR(set_vector, set_matrix, float32_t)
811 SET_VECTOR(set_vector, set_matrix, float64_t)
812 SET_VECTOR(set_vector, set_matrix, int16_t)
813 SET_VECTOR(set_vector, set_matrix, uint16_t)
814 #undef SET_VECTOR
815 
816 #define SET_MATRIX(fname, sg_type, fprt_type, type_str) \
817 void CAsciiFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
818 { \
819  if (!(file && matrix)) \
820  SG_ERROR("File or matrix invalid.\n"); \
821  \
822  for (int32_t i=0; i<num_vec; i++) \
823  { \
824  for (int32_t j=0; j<num_feat; j++) \
825  { \
826  sg_type v=matrix[num_feat*i+j]; \
827  if (j==num_feat-1) \
828  fprintf(file, type_str "\n", (fprt_type) v); \
829  else \
830  fprintf(file, type_str " ", (fprt_type) v); \
831  } \
832  } \
833 }
834 SET_MATRIX(set_matrix, char, char, "%c")
835 SET_MATRIX(set_matrix, uint8_t, uint8_t, "%u")
836 SET_MATRIX(set_int8_matrix, int8_t, int8_t, "%d")
837 SET_MATRIX(set_matrix, int32_t, int32_t, "%i")
838 SET_MATRIX(set_uint_matrix, uint32_t, uint32_t, "%u")
839 SET_MATRIX(set_long_matrix, int64_t, long long int, "%lli")
840 SET_MATRIX(set_ulong_matrix, uint64_t, long long unsigned int, "%llu")
841 SET_MATRIX(set_matrix, int16_t, int16_t, "%i")
842 SET_MATRIX(set_matrix, uint16_t, uint16_t, "%u")
843 SET_MATRIX(set_matrix, float32_t, float32_t, "%f")
844 SET_MATRIX(set_matrix, float64_t, float64_t, "%f")
845 SET_MATRIX(set_longreal_matrix, floatmax_t, floatmax_t, "%Lf")
846 #undef SET_MATRIX
847 
848 #define SET_NDARRAY(fname, sg_type, fprt_type, type_str) \
849 void CAsciiFile::fname(const sg_type* array, int32_t * dims, int32_t num_dims) \
850 { \
851  if (!(file && array)) \
852  SG_ERROR("File or data invalid.\n"); \
853  \
854  size_t total = 1; \
855  for(int i = 0;i < num_dims;i++) \
856  total *= dims[i]; \
857  int32_t block_size = dims[num_dims-1]; \
858  \
859  fprintf(file,"%d ",num_dims); \
860  for(int i = 0;i < num_dims;i++) \
861  fprintf(file,"%d ",dims[i]); \
862  fprintf(file,"\n"); \
863  \
864  for (size_t i=0; i < total; i++) \
865  { \
866  sg_type v= array[i]; \
867  if ( ((i+1) % block_size) == 0) \
868  fprintf(file, type_str "\n", (fprt_type) v); \
869  else \
870  fprintf(file, type_str " ", (fprt_type) v); \
871  } \
872 }
873 
874 SET_NDARRAY(set_ndarray, char, char, "%c")
875 SET_NDARRAY(set_ndarray, uint8_t, uint8_t, "%u")
876 SET_NDARRAY(set_int8_ndarray, int8_t, int8_t, "%d")
877 SET_NDARRAY(set_ndarray, int32_t, int32_t, "%i")
878 SET_NDARRAY(set_uint_ndarray, uint32_t, uint32_t, "%u")
879 SET_NDARRAY(set_long_ndarray, int64_t, long long int, "%lli")
880 SET_NDARRAY(set_ulong_ndarray, uint64_t, long long unsigned int, "%llu")
881 SET_NDARRAY(set_ndarray, int16_t, int16_t, "%i")
882 SET_NDARRAY(set_ndarray, uint16_t, uint16_t, "%u")
883 SET_NDARRAY(set_ndarray, float32_t, float32_t, "%f")
884 SET_NDARRAY(set_ndarray, float64_t, float64_t, "%f")
885 SET_NDARRAY(set_longreal_ndarray, floatmax_t, floatmax_t, "%Lf")
886 #undef SET_NDARRAY
887 
888 #define SET_SPARSEMATRIX(fname, sg_type, fprt_type, type_str) \
889 void CAsciiFile::fname(const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
890 { \
891  if (!(file && matrix)) \
892  SG_ERROR("File or matrix invalid.\n"); \
893  \
894  for (int32_t i=0; i<num_vec; i++) \
895  { \
896  SGSparseVectorEntry<sg_type>* vec = matrix[i].features; \
897  int32_t len=matrix[i].num_feat_entries; \
898  \
899  for (int32_t j=0; j<len; j++) \
900  { \
901  if (j<len-1) \
902  { \
903  fprintf(file, "%d:" type_str " ", \
904  (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry); \
905  } \
906  else \
907  { \
908  fprintf(file, "%d:" type_str "\n", \
909  (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry); \
910  } \
911  } \
912  } \
913 }
914 SET_SPARSEMATRIX(set_sparse_matrix, bool, uint8_t, "%u")
915 SET_SPARSEMATRIX(set_sparse_matrix, char, char, "%c")
916 SET_SPARSEMATRIX(set_sparse_matrix, uint8_t, uint8_t, "%u")
917 SET_SPARSEMATRIX(set_int8_sparsematrix, int8_t, int8_t, "%d")
918 SET_SPARSEMATRIX(set_sparse_matrix, int32_t, int32_t, "%i")
919 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, uint32_t, "%u")
920 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, long long int, "%lli")
921 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, long long unsigned int, "%llu")
922 SET_SPARSEMATRIX(set_sparse_matrix, int16_t, int16_t, "%i")
923 SET_SPARSEMATRIX(set_sparse_matrix, uint16_t, uint16_t, "%u")
924 SET_SPARSEMATRIX(set_sparse_matrix, float32_t, float32_t, "%f")
925 SET_SPARSEMATRIX(set_sparse_matrix, float64_t, float64_t, "%f")
926 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, floatmax_t, "%Lf")
927 #undef SET_SPARSEMATRIX
928 
929 void CAsciiFile::set_string_list(const SGString<uint8_t>* strings, int32_t num_str)
930 {
931  if (!(file && strings))
932  SG_ERROR("File or strings invalid.\n");
933 
934  for (int32_t i=0; i<num_str; i++)
935  {
936  int32_t len = strings[i].slen;
937  fwrite(strings[i].string, sizeof(uint8_t), len, file);
938  fprintf(file, "\n");
939  }
940 }
941 
942 void CAsciiFile::set_int8_string_list(const SGString<int8_t>* strings, int32_t num_str)
943 {
944  if (!(file && strings))
945  SG_ERROR("File or strings invalid.\n");
946 
947  for (int32_t i=0; i<num_str; i++)
948  {
949  int32_t len = strings[i].slen;
950  fwrite(strings[i].string, sizeof(int8_t), len, file);
951  fprintf(file, "\n");
952  }
953 }
954 
955 void CAsciiFile::set_string_list(const SGString<char>* strings, int32_t num_str)
956 {
957  if (!(file && strings))
958  SG_ERROR("File or strings invalid.\n");
959 
960  for (int32_t i=0; i<num_str; i++)
961  {
962  int32_t len = strings[i].slen;
963  fwrite(strings[i].string, sizeof(char), len, file);
964  fprintf(file, "\n");
965  }
966 }
967 
968 void CAsciiFile::set_string_list(const SGString<int32_t>* strings, int32_t num_str)
969 {
970 }
971 
972 void CAsciiFile::set_uint_string_list(const SGString<uint32_t>* strings, int32_t num_str)
973 {
974 }
975 
976 void CAsciiFile::set_string_list(const SGString<int16_t>* strings, int32_t num_str)
977 {
978 }
979 
980 void CAsciiFile::set_string_list(const SGString<uint16_t>* strings, int32_t num_str)
981 {
982 }
983 
984 void CAsciiFile::set_long_string_list(const SGString<int64_t>* strings, int32_t num_str)
985 {
986 }
987 
988 void CAsciiFile::set_ulong_string_list(const SGString<uint64_t>* strings, int32_t num_str)
989 {
990 }
991 
992 void CAsciiFile::set_string_list(const SGString<float32_t>* strings, int32_t num_str)
993 {
994 }
995 
996 void CAsciiFile::set_string_list(const SGString<float64_t>* strings, int32_t num_str)
997 {
998 }
999 
1000 void CAsciiFile::set_longreal_string_list(const SGString<floatmax_t>* strings, int32_t num_str)
1001 {
1002 }
1003 
1004 template <class T> void CAsciiFile::append_item(
1005  DynArray<T>* items, char* ptr_data, char* ptr_item)
1006 {
1007  size_t len=(ptr_data-ptr_item)/sizeof(char);
1008  char* item=SG_MALLOC(char, len+1);
1009  memset(item, 0, sizeof(char)*(len+1));
1010  item=strncpy(item, ptr_item, len);
1011 
1012  SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
1013  items->append_element(item);
1014 }
1015 
1016 #if defined(__MACH__) || defined(FREEBSD)
1017 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
1018 {
1019  int32_t total_bytes_read=0;
1020  int32_t default_size=10;
1021 
1022  if ((lineptr == NULL) || (n == NULL) || (stream == NULL))
1023  return -1;
1024 
1025  if ((*lineptr == NULL) && (*n == 0))
1026  {
1027  *lineptr=SG_MALLOC(char, default_size);
1028  *n=default_size;
1029  }
1030 
1031  int32_t bytes_read, pos=-1;
1032  size_t threshold_size=100000;
1033 
1034  while (1)
1035  {
1036  // We need some limit in case file does not contain '\n'
1037  if (*n > threshold_size)
1038  return -1;
1039 
1040  // Read from file and append to buffer
1041  bytes_read=fread(*lineptr+total_bytes_read, sizeof(char), *n-total_bytes_read, stream);
1042 
1043  for (int i=0; i<bytes_read; i++)
1044  {
1045  if ((*lineptr)[total_bytes_read+i] == delimiter)
1046  {
1047  pos=i;
1048  break;
1049  }
1050  }
1051 
1052  if (pos==-1)
1053  {
1054  if (feof(stream))
1055  return -1;
1056  total_bytes_read+=bytes_read;
1057  *lineptr=SG_REALLOC(char, *lineptr, (*n)*2);
1058  *n=(*n)*2;
1059  // A better reallocated size should be used
1060  }
1061  else
1062  {
1063  total_bytes_read+=pos+1;
1064  (*lineptr)[total_bytes_read]='\0';
1065  // Seek back to position after \n
1066  fseek(stream, (bytes_read-pos-1) * -1, SEEK_CUR);
1067  return total_bytes_read;
1068  }
1069  }
1070 }
1071 
1072 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
1073 {
1074  return getdelim(lineptr, n, '\n', stream);
1075 }
1076 
1077 #else
1078 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
1079 {
1080  return ::getdelim(lineptr, n, delimiter, stream);
1081 }
1082 
1083 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
1084 {
1085  return ::getline(lineptr, n, stream);
1086 }
1087 #endif
1088 
1090 {
1091  ret.erase();
1092  char *last = s.start;
1093  for (; s.start != s.end; s.start++)
1094  {
1095  if (*s.start == delim)
1096  {
1097  if (s.start != last)
1098  {
1099  substring temp = {last,s.start};
1100  ret.push(temp);
1101  }
1102  last = s.start+1;
1103  }
1104  }
1105  if (s.start != last)
1106  {
1107  substring final = {last, s.start};
1108  ret.push(final);
1109  }
1110 }

SHOGUN Machine Learning Toolbox - Documentation