22 using namespace shogun;
// GET_VECTOR(fname, mfname, sg_type): expands to the definition of
// CAsciiFile::fname(sg_type*& vec, int32_t& len).
// The generated reader calls the matrix reader `mfname` with `vec`, `num_feat`
// and `num_vec`, then tests whether either dimension equals 1.
// An SG_ERROR naming the file and the shape that was found reports data that
// is not a vector.
// NOTE(review): presumably the error sits on the branch where neither
// dimension is 1 -- confirm against the complete macro body.
41 #define GET_VECTOR(fname, mfname, sg_type) \
42 void CAsciiFile::fname(sg_type*& vec, int32_t& len) \
48 mfname(vec, num_feat, num_vec); \
49 if ((num_feat==1) || (num_vec==1)) \
61 SG_ERROR("Could not read vector from" \
62 " file %s (shape %dx%d found but " \
63 "vector expected).\n", filename, \
// GET_MATRIX(fname, conv, sg_type): expands to the definition of
// CAsciiFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec).
// Steps visible in the generated reader:
//  - stat() the file; SG_ERROR if that fails;
//  - allocate a zero-filled buffer of st_size+1 chars and fread() st_size
//    bytes into it, so the buffer keeps a trailing NUL;
//  - walk the buffer with ptr_data, using isblank() and '\n' to find token
//    boundaries and handing each token to append_item();
//  - at a '\n', SG_ERROR if the per-line count `nf` differs from a non-zero
//    num_feat;
//  - allocate num_vec*num_feat elements and store conv(token) at
//    matrix[i*num_feat+j], reading the token at the same flat index.
// NOTE(review): num_vec*num_feat is an int32_t product; confirm it cannot
// overflow for the sizes this reader must support.
// NOTE(review): `items` is created with new; confirm that it and the buffer
// `data` are released on every path (no cleanup is among the lines shown).
77 #define GET_MATRIX(fname, conv, sg_type) \
78 void CAsciiFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
81 if (stat(filename, &stats)!=0) \
82 SG_ERROR("Could not get file statistics.\n"); \
84 char* data=SG_MALLOC(char, stats.st_size+1); \
85 memset(data, 0, sizeof(char)*(stats.st_size+1)); \
86 size_t nread=fread(data, sizeof(char), stats.st_size, file); \
88 SG_ERROR("Could not read data from %s.\n", filename); \
90 SG_DEBUG("data read from file:\n%s\n", data); \
96 char* ptr_item=NULL; \
97 char* ptr_data=data; \
98 DynArray<char*>* items=new DynArray<char*>(); \
102 if (*ptr_data=='\n') \
107 if (num_feat!=0 && nf!=num_feat) \
108 SG_ERROR("Number of features mismatches (%d != %d) in vector" \
109 " %d in file %s.\n", num_feat, nf, num_vec, filename); \
111 append_item(items, ptr_data, ptr_item); \
117 else if (!isblank(*ptr_data) && !ptr_item) \
121 else if (isblank(*ptr_data) && ptr_item) \
123 append_item(items, ptr_data, ptr_item); \
131 SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec); \
135 matrix=SG_MALLOC(sg_type, num_vec*num_feat); \
136 for (int32_t i=0; i<num_vec; i++) \
138 for (int32_t j=0; j<num_feat; j++) \
140 char* item=items->get_element(i*num_feat+j); \
141 matrix[i*num_feat+j]=conv(item); \
// GET_NDARRAY(fname, conv, sg_type): expands to the definition of
// CAsciiFile::fname(sg_type*& array, int32_t *& dims, int32_t & num_dims).
// Steps visible in the generated reader:
//  - stat() the file, then read st_size bytes into a zero-filled buffer of
//    st_size+1 chars;
//  - tokenize the first line (loop runs until '\n') with isblank(), passing
//    each token to append_item();
//  - tokenize the remaining data the same way; at each '\n' SG_ERROR if the
//    per-line count `counter` differs from a non-zero `length`;
//  - token 0 is parsed with atoi() and must equal num_dims; tokens
//    1..num_dims are parsed with atoi() into the newly allocated `dims`;
//  - SG_ERROR if dims[num_dims-1] differs from `length`;
//  - allocate `total` elements and fill array[i] with conv() of the token at
//    index i+(num_dims+1), i.e. skipping the header tokens.
// NOTE(review): the header loop condition tests only for '\n'; confirm that
// input without any newline cannot walk past the buffer's terminating NUL.
// NOTE(review): num_dims is a reference parameter that is compared with the
// value from the file; where it receives its value is not among the lines
// shown -- confirm it is set before the comparison.
162 #define GET_NDARRAY(fname, conv, sg_type) \
163 void CAsciiFile::fname(sg_type*& array, int32_t *& dims, int32_t & num_dims) \
166 if (stat(filename, &stats)!=0) \
167 SG_ERROR("Could not get file statistics.\n"); \
169 char* data=SG_MALLOC(char, stats.st_size+1); \
170 memset(data, 0, sizeof(char)*(stats.st_size+1)); \
171 size_t nread=fread(data, sizeof(char), stats.st_size, file); \
173 SG_ERROR("Could not read data from %s.\n", filename); \
175 SG_DEBUG("data read from file:\n%s\n", data); \
182 char* ptr_item=NULL; \
183 char* ptr_data=data; \
184 DynArray<char*>* items=new DynArray<char*>(); \
187 while(*ptr_data != '\n') \
189 if(isblank(*ptr_data) && ptr_item) \
191 append_item(items, ptr_data, ptr_item); \
195 else if(!isblank(*ptr_data) && !ptr_item) \
196 ptr_item = ptr_data; \
206 if (*ptr_data=='\n') \
211 if (length!=0 && counter!=length) \
212 SG_ERROR("Invalid number of data (%d != %d) in line" \
213 " %d in file %s.\n", length, counter, total, filename); \
215 append_item(items, ptr_data, ptr_item); \
221 else if (!isblank(*ptr_data) && !ptr_item) \
225 else if (isblank(*ptr_data) && ptr_item) \
227 append_item(items, ptr_data, ptr_item); \
235 SG_DEBUG("num of data in line: %d, num of lines %d\n", counter, total); \
240 item=items->get_element(0); \
241 if(atoi(item) != num_dims) \
242 SG_ERROR("Invalid number of dimensions!\n"); \
244 dims = SG_MALLOC(int32_t, num_dims); \
245 for(int32_t i =0;i < num_dims;i++) \
247 item = items->get_element(i+1); \
248 dims[i] = atoi(item); \
251 if (dims[num_dims-1] != length) \
252 SG_ERROR("Invalid number of lines in file!\n"); \
256 array=SG_MALLOC(sg_type, total); \
257 for (size_t i=0; i<total; i++) \
259 item=items->get_element(i+(num_dims+1)); \
260 array[i]=conv(item); \
// GET_SPARSEMATRIX(fname, conv, sg_type): expands to the definition of
// CAsciiFile::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat,
// int32_t& num_vec).
// Steps visible in the generated reader:
//  - pass 1 ("COUNTING"): read the file in 1 MiB blocks; at each '\n', or at
//    the last byte of a short final block, grow required_blocksize to the
//    distance since the previous line end plus one;
//  - set blocksize to required_blocksize, allocate a buffer of blocksize+1
//    bytes and an array of num_vec SGSparseVector objects, each constructed
//    in place with placement new;
//  - pass 2 ("LOADING"): read block by block; when a full block ends without
//    '\n', the partial line of `len` bytes starting at old_sz is handled and
//    the rest of the buffer is refilled with fread(dummy+len, ...);
//  - for each complete line, allocate `dims` entries; an index is parsed with
//    atoi() and decremented by one, num_feat is raised to at least index+1,
//    and values are parsed with conv();
//  - the entries and their count are stored in matrix[lines].
// NOTE(review): in the SG_ERROR call `len` is declared size_t but is passed
// for "%d" and as the "%.*s" precision, which expects int -- confirm/cast.
// NOTE(review): the result of ftell() is stored in a size_t without an error
// check, and the fseek() result is ignored.
// NOTE(review): `dummy` is assigned a second allocation; confirm the first
// buffer is freed before that (no free is among the lines shown).
280 #define GET_SPARSEMATRIX(fname, conv, sg_type) \
281 void CAsciiFile::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
283 size_t blocksize=1024*1024; \
284 size_t required_blocksize=blocksize; \
285 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize); \
292 SG_INFO("counting line numbers in file %s\n", filename); \
293 size_t sz=blocksize; \
294 size_t block_offs=0; \
295 size_t old_block_offs=0; \
296 fseek(file, 0, SEEK_END); \
297 size_t fsize=ftell(file); \
300 while (sz == blocksize) \
302 sz=fread(dummy, sizeof(uint8_t), blocksize, file); \
303 for (size_t i=0; i<sz; i++) \
306 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) \
309 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); \
310 old_block_offs=block_offs; \
313 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); \
316 SG_INFO("found %d feature vectors\n", num_vec); \
318 blocksize=required_blocksize; \
319 dummy = SG_MALLOC(uint8_t, blocksize+1); \
320 matrix=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \
321 for (int i=0; i<num_vec; i++) \
322 new (&matrix[i]) SGSparseVector<sg_type>(); \
326 while (sz == blocksize) \
328 sz=fread(dummy, sizeof(uint8_t), blocksize, file); \
331 for (size_t i=0; i<sz; i++) \
333 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) \
335 size_t len=i-old_sz+1; \
336 uint8_t* data=&dummy[old_sz]; \
338 for (size_t j=0; j<len; j++) \
341 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file); \
347 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) \
350 size_t len=i-old_sz; \
351 uint8_t* data=&dummy[old_sz]; \
354 for (size_t j=0; j<len; j++) \
362 SG_ERROR("Error in line %d - number of" \
363 " dimensions is %d line is %d characters" \
364 " long\n line_content:'%.*s'\n", lines, \
365 dims, len, len, (const char*) data); \
368 SGSparseVectorEntry<sg_type>* feat=SG_MALLOC(SGSparseVectorEntry<sg_type>, dims); \
391 uint8_t* start=&data[j]; \
398 feat[d].feat_index=(int32_t) atoi((const char*) start)-1; \
399 num_feat=CMath::max(num_feat, feat[d].feat_index+1); \
405 if (data[j]==' ' || data[j]=='\n') \
408 feat[d].entry=(sg_type) conv((const char*) start); \
417 feat[dims-1].entry=(sg_type) conv((const char*) start); \
425 matrix[lines].num_feat_entries=dims; \
426 matrix[lines].features=feat; \
430 SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t"); \
435 SG_INFO("file successfully read\n"); \
454 #undef GET_SPARSEMATRIX
// Two-pass, line-oriented reader that fills `strings` with uint8_t data.
// Pass 1 ("COUNTING") reads the file in 1 MiB blocks and, at each '\n' or at
// the last byte of a short final block, grows required_blocksize to the
// distance since the previous line end. Pass 2 re-reads the file with
// blocksize set to required_blocksize and assembles each string from the
// carried-over `overflow` bytes followed by bytes taken from `dummy`.
// NOTE(review): required_blocksize is a size_t but is logged with "%d".
// NOTE(review): `overflow` starts as NULL and is written to further down;
// confirm it is allocated first (no allocation is among the lines shown).
// NOTE(review): the result of ftell() is stored in a size_t without an error
// check.
459 size_t blocksize=1024*1024;
460 size_t required_blocksize=0;
461 uint8_t* dummy=
SG_MALLOC(uint8_t, blocksize);
462 uint8_t* overflow=NULL;
463 int32_t overflow_len=0;
473 size_t old_block_offs=0;
474 fseek(
file, 0, SEEK_END);
475 size_t fsize=ftell(
file);
// Pass 1: the loop continues only while fread() returns a full block.
478 while (sz == blocksize)
480 sz=fread(dummy,
sizeof(uint8_t), blocksize,
file);
481 for (
size_t i=0; i<sz; i++)
484 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
487 required_blocksize=
CMath::max(required_blocksize, block_offs-old_block_offs);
488 old_block_offs=block_offs;
491 SG_PROGRESS(block_offs, 0, fsize, 1,
"COUNTING:\t");
494 SG_INFO(
"found %d strings\n", num_str);
495 SG_DEBUG(
"block_size=%d\n", required_blocksize);
// Pass 2 uses the block size computed during counting.
497 blocksize=required_blocksize;
506 while (sz == blocksize)
508 sz=fread(dummy,
sizeof(uint8_t), blocksize,
file);
511 for (
size_t i=0; i<sz; i++)
513 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
// NOTE(review): i is size_t, so this difference is narrowed to int32_t.
515 int32_t len=i-old_sz;
516 max_string_len=
CMath::max(max_string_len, len+overflow_len);
518 strings[lines].
slen=len+overflow_len;
// Carried-over bytes come first, then the bytes of this block.
521 for (int32_t j=0; j<overflow_len; j++)
522 strings[lines].
string[j]=overflow[j];
523 for (int32_t j=0; j<len; j++)
524 strings[lines].
string[j+overflow_len]=dummy[old_sz+j];
// Bytes from old_sz to the end of the block are saved in `overflow`.
536 for (
size_t i=old_sz; i<sz; i++)
537 overflow[i-old_sz]=dummy[i];
539 overflow_len=sz-old_sz;
541 SG_INFO(
"file successfully read\n");
542 SG_INFO(
"max_string_length=%d\n", max_string_len);
543 SG_INFO(
"num_strings=%d\n", num_str);
// Two-pass, line-oriented reader that fills `strings` with int8_t data.
// Pass 1 ("COUNTING") reads the file in 1 MiB blocks and, at each '\n' or at
// the last byte of a short final block, grows required_blocksize to the
// distance since the previous line end. Pass 2 re-reads the file with
// blocksize set to required_blocksize and assembles each string from the
// carried-over `overflow` bytes followed by bytes taken from `dummy`.
// NOTE(review): required_blocksize is a size_t but is logged with "%d".
// NOTE(review): `overflow` starts as NULL and is written to further down;
// confirm it is allocated first (no allocation is among the lines shown).
// NOTE(review): the result of ftell() is stored in a size_t without an error
// check.
552 size_t blocksize=1024*1024;
553 size_t required_blocksize=0;
554 int8_t* dummy=
SG_MALLOC(int8_t, blocksize);
555 int8_t* overflow=NULL;
556 int32_t overflow_len=0;
566 size_t old_block_offs=0;
567 fseek(
file, 0, SEEK_END);
568 size_t fsize=ftell(
file);
// Pass 1: the loop continues only while fread() returns a full block.
571 while (sz == blocksize)
573 sz=fread(dummy,
sizeof(int8_t), blocksize,
file);
574 for (
size_t i=0; i<sz; i++)
577 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
580 required_blocksize=
CMath::max(required_blocksize, block_offs-old_block_offs);
581 old_block_offs=block_offs;
584 SG_PROGRESS(block_offs, 0, fsize, 1,
"COUNTING:\t");
587 SG_INFO(
"found %d strings\n", num_str);
588 SG_DEBUG(
"block_size=%d\n", required_blocksize);
// Pass 2 uses the block size computed during counting.
590 blocksize=required_blocksize;
599 while (sz == blocksize)
601 sz=fread(dummy,
sizeof(int8_t), blocksize,
file);
604 for (
size_t i=0; i<sz; i++)
606 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
// NOTE(review): i is size_t, so this difference is narrowed to int32_t.
608 int32_t len=i-old_sz;
609 max_string_len=
CMath::max(max_string_len, len+overflow_len);
611 strings[lines].
slen=len+overflow_len;
// Carried-over bytes come first, then the bytes of this block.
614 for (int32_t j=0; j<overflow_len; j++)
615 strings[lines].
string[j]=overflow[j];
616 for (int32_t j=0; j<len; j++)
617 strings[lines].
string[j+overflow_len]=dummy[old_sz+j];
// Bytes from old_sz to the end of the block are saved in `overflow`.
629 for (
size_t i=old_sz; i<sz; i++)
630 overflow[i-old_sz]=dummy[i];
632 overflow_len=sz-old_sz;
634 SG_INFO(
"file successfully read\n");
635 SG_INFO(
"max_string_length=%d\n", max_string_len);
636 SG_INFO(
"num_strings=%d\n", num_str);
// Two-pass, line-oriented reader that fills `strings` with char data.
// Pass 1 ("COUNTING") reads the file in 1 MiB blocks and, at each '\n' or at
// the last byte of a short final block, grows required_blocksize to the
// distance since the previous line end. Pass 2 re-reads the file with
// blocksize set to required_blocksize and assembles each string from the
// carried-over `overflow` bytes followed by bytes taken from `dummy`.
// NOTE(review): required_blocksize is a size_t but is logged with "%d".
// NOTE(review): confirm `overflow` is allocated before it is written to (its
// declaration and allocation are not among the lines shown).
// NOTE(review): the result of ftell() is stored in a size_t without an error
// check.
645 size_t blocksize=1024*1024;
646 size_t required_blocksize=0;
649 int32_t overflow_len=0;
659 size_t old_block_offs=0;
660 fseek(
file, 0, SEEK_END);
661 size_t fsize=ftell(
file);
// Pass 1: the loop continues only while fread() returns a full block.
664 while (sz == blocksize)
666 sz=fread(dummy,
sizeof(
char), blocksize,
file);
667 for (
size_t i=0; i<sz; i++)
670 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
673 required_blocksize=
CMath::max(required_blocksize, block_offs-old_block_offs);
674 old_block_offs=block_offs;
677 SG_PROGRESS(block_offs, 0, fsize, 1,
"COUNTING:\t");
680 SG_INFO(
"found %d strings\n", num_str);
681 SG_DEBUG(
"block_size=%d\n", required_blocksize);
// Pass 2 uses the block size computed during counting.
683 blocksize=required_blocksize;
692 while (sz == blocksize)
694 sz=fread(dummy,
sizeof(
char), blocksize,
file);
697 for (
size_t i=0; i<sz; i++)
699 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
// NOTE(review): i is size_t, so this difference is narrowed to int32_t.
701 int32_t len=i-old_sz;
702 max_string_len=
CMath::max(max_string_len, len+overflow_len);
704 strings[lines].
slen=len+overflow_len;
// Carried-over bytes come first, then the bytes of this block.
707 for (int32_t j=0; j<overflow_len; j++)
708 strings[lines].
string[j]=overflow[j];
709 for (int32_t j=0; j<len; j++)
710 strings[lines].
string[j+overflow_len]=dummy[old_sz+j];
// Bytes from old_sz to the end of the block are saved in `overflow`.
722 for (
size_t i=old_sz; i<sz; i++)
723 overflow[i-old_sz]=dummy[i];
725 overflow_len=sz-old_sz;
727 SG_INFO(
"file successfully read\n");
728 SG_INFO(
"max_string_length=%d\n", max_string_len);
729 SG_INFO(
"num_strings=%d\n", num_str);
// SET_VECTOR(fname, mfname, sg_type): expands to the definition of
// CAsciiFile::fname(const sg_type* vec, int32_t len), which forwards to the
// matrix writer `mfname` with the arguments (vec, len, 1).
802 #define SET_VECTOR(fname, mfname, sg_type) \
803 void CAsciiFile::fname(const sg_type* vec, int32_t len) \
805 mfname(vec, len, 1); \
// SET_MATRIX(fname, sg_type, fprt_type, type_str): expands to the definition
// of CAsciiFile::fname(const sg_type* matrix, int32_t num_feat,
// int32_t num_vec).
// The generated writer raises SG_ERROR when `file` or `matrix` is null, then
// visits matrix[num_feat*i+j] for every i<num_vec and j<num_feat and prints
// the value, cast to `fprt_type`, with the printf format `type_str` followed
// by either "\n" or " ".
// NOTE(review): the condition choosing "\n" over " " is not among the lines
// shown; presumably "\n" terminates each vector -- confirm.
816 #define SET_MATRIX(fname, sg_type, fprt_type, type_str) \
817 void CAsciiFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
819 if (!(file && matrix)) \
820 SG_ERROR("File or matrix invalid.\n"); \
822 for (int32_t i=0; i<num_vec; i++) \
824 for (int32_t j=0; j<num_feat; j++) \
826 sg_type v=matrix[num_feat*i+j]; \
828 fprintf(file, type_str "\n", (fprt_type) v); \
830 fprintf(file, type_str " ", (fprt_type) v); \
// Instantiations of SET_MATRIX for integer element types. The arguments are,
// in order: the writer name, the element type, the type the value is cast to
// before it is handed to fprintf, and the printf conversion used.
// The 64-bit types are cast to (unsigned) long long to pair with
// "%lli"/"%llu".
835 SET_MATRIX(set_matrix, uint8_t, uint8_t, "%u")
836 SET_MATRIX(set_int8_matrix, int8_t, int8_t, "%d")
837 SET_MATRIX(set_matrix, int32_t, int32_t, "%i")
838 SET_MATRIX(set_uint_matrix, uint32_t, uint32_t, "%u")
839 SET_MATRIX(set_long_matrix, int64_t,
long long int, "%lli")
840 SET_MATRIX(set_ulong_matrix, uint64_t,
long long unsigned int, "%llu")
841 SET_MATRIX(set_matrix, int16_t, int16_t, "%i")
842 SET_MATRIX(set_matrix, uint16_t, uint16_t, "%u")
// SET_NDARRAY(fname, sg_type, fprt_type, type_str): expands to the definition
// of CAsciiFile::fname(const sg_type* array, int32_t * dims,
// int32_t num_dims).
// The generated writer raises SG_ERROR when `file` or `array` is null. It
// writes a header line holding num_dims followed by every dims[i], each
// followed by a space, then writes `total` elements using format `type_str`
// after a cast to `fprt_type`. Every element whose 1-based position is a
// multiple of block_size (= dims[num_dims-1]) is followed by "\n"; the other
// fprintf uses " ".
// NOTE(review): how `total` is computed is not among the lines shown;
// presumably the first loop over num_dims accumulates it -- confirm.
// NOTE(review): (i+1) % block_size divides by dims[num_dims-1]; confirm a
// zero last dimension (or num_dims == 0) is rejected before this point.
848 #define SET_NDARRAY(fname, sg_type, fprt_type, type_str) \
849 void CAsciiFile::fname(const sg_type* array, int32_t * dims, int32_t num_dims) \
851 if (!(file && array)) \
852 SG_ERROR("File or data invalid.\n"); \
855 for(int i = 0;i < num_dims;i++) \
857 int32_t block_size = dims[num_dims-1]; \
859 fprintf(file,"%d ",num_dims); \
860 for(int i = 0;i < num_dims;i++) \
861 fprintf(file,"%d ",dims[i]); \
862 fprintf(file,"\n"); \
864 for (size_t i=0; i < total; i++) \
866 sg_type v= array[i]; \
867 if ( ((i+1) % block_size) == 0) \
868 fprintf(file, type_str "\n", (fprt_type) v); \
870 fprintf(file, type_str " ", (fprt_type) v); \
// Instantiations of SET_NDARRAY. The arguments are, in order: the writer
// name, the element type, the type the value is cast to before it is handed
// to fprintf, and the printf conversion used.
// NOTE(review): "%Lf" expects a long double argument; confirm floatmax_t is
// long double on every supported platform.
876 SET_NDARRAY(set_int8_ndarray, int8_t, int8_t, "%d")
878 SET_NDARRAY(set_uint_ndarray, uint32_t, uint32_t, "%u")
879 SET_NDARRAY(set_long_ndarray, int64_t,
long long int, "%lli")
880 SET_NDARRAY(set_ulong_ndarray, uint64_t,
long long unsigned int, "%llu")
883 SET_NDARRAY(set_ndarray, float32_t, float32_t, "%f")
884 SET_NDARRAY(set_ndarray, float64_t, float64_t, "%f")
885 SET_NDARRAY(set_longreal_ndarray, floatmax_t, floatmax_t, "%Lf")
// SET_SPARSEMATRIX(fname, sg_type, fprt_type, type_str): expands to the
// definition of CAsciiFile::fname(const SGSparseVector<sg_type>* matrix,
// int32_t num_feat, int32_t num_vec).
// The generated writer raises SG_ERROR when `file` or `matrix` is null. For
// each of the num_vec vectors it walks the num_feat_entries entries and
// prints them as "index:value", where index is feat_index+1 and value is the
// entry cast to `fprt_type` and formatted with `type_str`. One fprintf ends
// with " " and the other with "\n".
// NOTE(review): the condition choosing between the two fprintf calls is not
// among the lines shown; presumably "\n" follows the last entry -- confirm.
888 #define SET_SPARSEMATRIX(fname, sg_type, fprt_type, type_str) \
889 void CAsciiFile::fname(const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
891 if (!(file && matrix)) \
892 SG_ERROR("File or matrix invalid.\n"); \
894 for (int32_t i=0; i<num_vec; i++) \
896 SGSparseVectorEntry<sg_type>* vec = matrix[i].features; \
897 int32_t len=matrix[i].num_feat_entries; \
899 for (int32_t j=0; j<len; j++) \
903 fprintf(file, "%d:" type_str " ", \
904 (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry); \
908 fprintf(file, "%d:" type_str "\n", \
909 (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry); \
// Instantiates SET_SPARSEMATRIX as set_ulong_sparsematrix for uint64_t
// entries; values are cast to long long unsigned int and printed with "%llu".
921 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t,
long long unsigned int, "%llu")
927 #undef SET_SPARSEMATRIX
// Writer body for uint8_t strings: raises SG_ERROR when `file` or `strings`
// is null, then for each of the num_str strings writes slen elements of
// sizeof(uint8_t) to `file` with fwrite().
// NOTE(review): the fwrite() result is not checked on the lines shown, and no
// line separator is written here; confirm both against the full function.
931 if (!(
file && strings))
932 SG_ERROR(
"File or strings invalid.\n");
934 for (int32_t i=0; i<num_str; i++)
936 int32_t len = strings[i].
slen;
937 fwrite(strings[i].
string,
sizeof(uint8_t), len,
file);
// Writer body for int8_t strings: raises SG_ERROR when `file` or `strings`
// is null, then for each of the num_str strings writes slen elements of
// sizeof(int8_t) to `file` with fwrite().
// NOTE(review): the fwrite() result is not checked on the lines shown, and no
// line separator is written here; confirm both against the full function.
944 if (!(
file && strings))
945 SG_ERROR(
"File or strings invalid.\n");
947 for (int32_t i=0; i<num_str; i++)
949 int32_t len = strings[i].
slen;
950 fwrite(strings[i].
string,
sizeof(int8_t), len,
file);
// Writer body for char strings: raises SG_ERROR when `file` or `strings`
// is null, then for each of the num_str strings writes slen elements of
// sizeof(char) to `file` with fwrite().
// NOTE(review): the fwrite() result is not checked on the lines shown, and no
// line separator is written here; confirm both against the full function.
957 if (!(
file && strings))
958 SG_ERROR(
"File or strings invalid.\n");
960 for (int32_t i=0; i<num_str; i++)
962 int32_t len = strings[i].
slen;
963 fwrite(strings[i].
string,
sizeof(
char), len,
file);
// Copies the token that starts at ptr_item and ends just before ptr_data into
// a zero-filled buffer `item` of len+1 chars, so the copy is NUL-terminated,
// and logs it at debug level.
// NOTE(review): the allocation of `item` and its insertion into `items` are
// not among the lines shown -- confirm against the full function.
// NOTE(review): len is a size_t computed from a pointer difference; confirm
// callers never pass a null ptr_item or one beyond ptr_data.
// NOTE(review): len is a size_t but is logged with "%d".
1004 template <
class T>
void CAsciiFile::append_item(
1005 DynArray<T>* items,
char* ptr_data,
char* ptr_item)
1007 size_t len=(ptr_data-ptr_item)/
sizeof(
char);
1009 memset(item, 0,
sizeof(
char)*(len+1));
1010 item=strncpy(item, ptr_item, len);
1012 SG_DEBUG(
"current %c, len %d, item %s\n", *ptr_data, len, item);
// Replacement delimiter-based line reader, compiled only when __MACH__ or
// FREEBSD is defined.
// Visible behaviour: validates lineptr, n and stream; handles the case of a
// null *lineptr with *n == 0; reads into *lineptr past the bytes already
// consumed with fread(); scans the new bytes for `delimiter`; on the lines
// shown it then advances total_bytes_read by pos+1, writes a terminating
// '\0', seeks the stream back by the bytes read beyond the delimiter, and
// returns total_bytes_read.
// NOTE(review): default_size (10) and threshold_size (100000) are presumably
// the initial buffer size and an upper bound on *n -- their use is not among
// the lines shown; confirm.
// NOTE(review): the terminator is written at index total_bytes_read; confirm
// the buffer always has room for it when the delimiter is the last byte read.
1016 #if defined(__MACH__) || defined(FREEBSD)
1019 int32_t total_bytes_read=0;
1020 int32_t default_size=10;
1022 if ((lineptr == NULL) || (n == NULL) || (stream == NULL))
1025 if ((*lineptr == NULL) && (*n == 0))
1031 int32_t bytes_read, pos=-1;
1032 size_t threshold_size=100000;
1037 if (*n > threshold_size)
1041 bytes_read=fread(*lineptr+total_bytes_read,
sizeof(
char), *n-total_bytes_read, stream);
1043 for (
int i=0; i<bytes_read; i++)
1045 if ((*lineptr)[total_bytes_read+i] == delimiter)
1056 total_bytes_read+=bytes_read;
1063 total_bytes_read+=pos+1;
1064 (*lineptr)[total_bytes_read]=
'\0';
// Rewind so the next read starts right after the delimiter.
1066 fseek(stream, (bytes_read-pos-1) * -1, SEEK_CUR);
1067 return total_bytes_read;
// Forwards to getdelim() with '\n' as the delimiter and returns its result.
1074 return getdelim(lineptr, n,
'\n', stream);
// Delimiter scanner: `last` records the initial value of s.start; the visible
// tests compare the current character *s.start with `delim` and check whether
// s.start has moved away from `last`.
// NOTE(review): what each branch does is not among the lines shown;
// presumably a token spanning [last, s.start) is emitted -- confirm.
1092 char *last = s.
start;
1095 if (*s.
start == delim)
1097 if (s.
start != last)
1105 if (s.
start != last)