00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include <shogun/features/SparseFeatures.h>
00016 #include <shogun/io/File.h>
00017 #include <shogun/io/AsciiFile.h>
00018 #include <shogun/mathematics/Math.h>
00019 #include <ctype.h>
00020 #include <stdio.h>
00021
00022 using namespace shogun;
00023
00024 CAsciiFile::CAsciiFile()
00025 {
00026 SG_UNSTABLE("CAsciiFile::CAsciiFile()", "\n");
00027 }
00028
00029 CAsciiFile::CAsciiFile(FILE* f, const char* name) : CFile(f, name)
00030 {
00031 }
00032
00033 CAsciiFile::CAsciiFile(const char* fname, char rw, const char* name) : CFile(fname, rw, name)
00034 {
00035 }
00036
00037 CAsciiFile::~CAsciiFile()
00038 {
00039 }
00040
00041 #define GET_VECTOR(fname, mfname, sg_type) \
00042 void CAsciiFile::fname(sg_type*& vec, int32_t& len) \
00043 { \
00044 vec=NULL; \
00045 len=0; \
00046 int32_t num_feat=0; \
00047 int32_t num_vec=0; \
00048 mfname(vec, num_feat, num_vec); \
00049 if ((num_feat==1) || (num_vec==1)) \
00050 { \
00051 if (num_feat==1) \
00052 len=num_vec; \
00053 else \
00054 len=num_feat; \
00055 } \
00056 else \
00057 { \
00058 SG_FREE(vec); \
00059 vec=NULL; \
00060 len=0; \
00061 SG_ERROR("Could not read vector from" \
00062 " file %s (shape %dx%d found but " \
00063 "vector expected).\n", filename, \
00064 num_vec, num_feat); \
00065 } \
00066 }
00067
00068 GET_VECTOR(get_vector, get_int8_matrix, int8_t)
00069 GET_VECTOR(get_vector, get_matrix, uint8_t)
00070 GET_VECTOR(get_vector, get_matrix, char)
00071 GET_VECTOR(get_vector, get_matrix, int32_t)
00072 GET_VECTOR(get_vector, get_uint_matrix, uint32_t)
00073 GET_VECTOR(get_vector, get_matrix, float32_t)
00074 GET_VECTOR(get_vector, get_matrix, float64_t)
00075 GET_VECTOR(get_vector, get_longreal_matrix, floatmax_t)
00076 GET_VECTOR(get_vector, get_matrix, int16_t)
00077 GET_VECTOR(get_vector, get_matrix, uint16_t)
00078 GET_VECTOR(get_vector, get_long_matrix, int64_t)
00079 GET_VECTOR(get_vector, get_ulong_matrix, uint64_t)
00080 #undef GET_VECTOR
00081
00082 #define GET_MATRIX(fname, conv, sg_type) \
00083 void CAsciiFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
00084 { \
00085 struct stat stats; \
00086 if (stat(filename, &stats)!=0) \
00087 SG_ERROR("Could not get file statistics.\n"); \
00088 \
00089 char* data=SG_MALLOC(char, stats.st_size+1); \
00090 memset(data, 0, sizeof(char)*(stats.st_size+1)); \
00091 size_t nread=fread(data, sizeof(char), stats.st_size, file); \
00092 if (nread<=0) \
00093 SG_ERROR("Could not read data from %s.\n", filename); \
00094 \
00095 SG_DEBUG("data read from file:\n%s\n", data); \
00096 \
00097 \
00098 int32_t nf=0; \
00099 num_feat=0; \
00100 num_vec=0; \
00101 char* ptr_item=NULL; \
00102 char* ptr_data=data; \
00103 DynArray<char*>* items=new DynArray<char*>(); \
00104 \
00105 while (*ptr_data) \
00106 { \
00107 if (*ptr_data=='\n') \
00108 { \
00109 if (ptr_item) \
00110 nf++; \
00111 \
00112 if (num_feat!=0 && nf!=num_feat) \
00113 SG_ERROR("Number of features mismatches (%d != %d) in vector" \
00114 " %d in file %s.\n", num_feat, nf, num_vec, filename); \
00115 \
00116 append_item(items, ptr_data, ptr_item); \
00117 num_feat=nf; \
00118 num_vec++; \
00119 nf=0; \
00120 ptr_item=NULL; \
00121 } \
00122 else if (!isblank(*ptr_data) && !ptr_item) \
00123 { \
00124 ptr_item=ptr_data; \
00125 } \
00126 else if (isblank(*ptr_data) && ptr_item) \
00127 { \
00128 append_item(items, ptr_data, ptr_item); \
00129 ptr_item=NULL; \
00130 nf++; \
00131 } \
00132 \
00133 ptr_data++; \
00134 } \
00135 \
00136 SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec); \
00137 SG_FREE(data); \
00138 \
00139 \
00140 matrix=SG_MALLOC(sg_type, num_vec*num_feat); \
00141 for (int32_t i=0; i<num_vec; i++) \
00142 { \
00143 for (int32_t j=0; j<num_feat; j++) \
00144 { \
00145 char* item=items->get_element(i*num_feat+j); \
00146 matrix[i*num_feat+j]=conv(item); \
00147 SG_FREE(item); \
00148 } \
00149 } \
00150 delete items; \
00151 }
00152
00153 GET_MATRIX(get_matrix, atoi, uint8_t)
00154 GET_MATRIX(get_int8_matrix, atoi, int8_t)
00155 GET_MATRIX(get_matrix, atoi, char)
00156 GET_MATRIX(get_matrix, atoi, int32_t)
00157 GET_MATRIX(get_uint_matrix, atoi, uint32_t)
00158 GET_MATRIX(get_long_matrix, atoll, int64_t)
00159 GET_MATRIX(get_ulong_matrix, atoll, uint64_t)
00160 GET_MATRIX(get_matrix, atof, float32_t)
00161 GET_MATRIX(get_matrix, atof, float64_t)
00162 GET_MATRIX(get_longreal_matrix, atof, floatmax_t)
00163 GET_MATRIX(get_matrix, atoi, int16_t)
00164 GET_MATRIX(get_matrix, atoi, uint16_t)
00165 #undef GET_MATRIX
00166
00167 #define GET_NDARRAY(fname, conv, sg_type) \
00168 void CAsciiFile::fname(sg_type*& array, int32_t *& dims, int32_t & num_dims) \
00169 { \
00170 struct stat stats; \
00171 if (stat(filename, &stats)!=0) \
00172 SG_ERROR("Could not get file statistics.\n"); \
00173 \
00174 char* data=SG_MALLOC(char, stats.st_size+1); \
00175 memset(data, 0, sizeof(char)*(stats.st_size+1)); \
00176 size_t nread=fread(data, sizeof(char), stats.st_size, file); \
00177 if (nread<=0) \
00178 SG_ERROR("Could not read data from %s.\n", filename); \
00179 \
00180 SG_DEBUG("data read from file:\n%s\n", data); \
00181 \
00182 \
00183 int32_t length=0; \
00184 int32_t counter=0; \
00185 size_t total=0; \
00186 num_dims = -1; \
00187 char* ptr_item=NULL; \
00188 char* ptr_data=data; \
00189 DynArray<char*>* items=new DynArray<char*>(); \
00190 \
00191 \
00192 while(*ptr_data != '\n') \
00193 { \
00194 if(isblank(*ptr_data) && ptr_item) \
00195 { \
00196 append_item(items, ptr_data, ptr_item); \
00197 num_dims++; \
00198 ptr_item = NULL; \
00199 } \
00200 else if(!isblank(*ptr_data) && !ptr_item) \
00201 ptr_item = ptr_data; \
00202 \
00203 ptr_data++; \
00204 } \
00205 ptr_item = NULL; \
00206 ptr_data++; \
00207 \
00208 \
00209 while(*ptr_data) \
00210 { \
00211 if (*ptr_data=='\n') \
00212 { \
00213 if (ptr_item) \
00214 counter++; \
00215 \
00216 if (length!=0 && counter!=length) \
00217 SG_ERROR("Invalid number of data (%d != %d) in line" \
00218 " %d in file %s.\n", length, counter, total, filename); \
00219 \
00220 append_item(items, ptr_data, ptr_item); \
00221 length=counter; \
00222 total++; \
00223 counter=0; \
00224 ptr_item=NULL; \
00225 } \
00226 else if (!isblank(*ptr_data) && !ptr_item) \
00227 { \
00228 ptr_item=ptr_data; \
00229 } \
00230 else if (isblank(*ptr_data) && ptr_item) \
00231 { \
00232 append_item(items, ptr_data, ptr_item); \
00233 ptr_item=NULL; \
00234 counter++; \
00235 } \
00236 \
00237 ptr_data++; \
00238 } \
00239 \
00240 SG_DEBUG("num of data in line: %d, num of lines %d\n", counter, total); \
00241 SG_FREE(data); \
00242 \
00243 \
00244 char * item; \
00245 item=items->get_element(0); \
00246 if(atoi(item) != num_dims) \
00247 SG_ERROR("Invalid number of dimensions!\n"); \
00248 SG_FREE(item); \
00249 dims = SG_MALLOC(int32_t, num_dims); \
00250 for(int32_t i =0;i < num_dims;i++) \
00251 { \
00252 item = items->get_element(i+1); \
00253 dims[i] = atoi(item); \
00254 SG_FREE(item); \
00255 } \
00256 if (dims[num_dims-1] != length) \
00257 SG_ERROR("Invalid number of lines in file!\n"); \
00258 \
00259 \
00260 total *= length; \
00261 array=SG_MALLOC(sg_type, total); \
00262 for (size_t i=0; i<total; i++) \
00263 { \
00264 item=items->get_element(i+(num_dims+1)); \
00265 array[i]=conv(item); \
00266 SG_FREE(item); \
00267 } \
00268 delete items; \
00269 }
00270
00271 GET_NDARRAY(get_ndarray, atoi, uint8_t)
00272 GET_NDARRAY(get_int8_ndarray, atoi, int8_t)
00273 GET_NDARRAY(get_ndarray, atoi, char)
00274 GET_NDARRAY(get_ndarray, atoi, int32_t)
00275 GET_NDARRAY(get_uint_ndarray, atoi, uint32_t)
00276 GET_NDARRAY(get_long_ndarray, atoll, int64_t)
00277 GET_NDARRAY(get_ulong_ndarray, atoll, uint64_t)
00278 GET_NDARRAY(get_ndarray, atof, float32_t)
00279 GET_NDARRAY(get_ndarray, atof, float64_t)
00280 GET_NDARRAY(get_longreal_ndarray, atof, floatmax_t)
00281 GET_NDARRAY(get_ndarray, atoi, int16_t)
00282 GET_NDARRAY(get_ndarray, atoi, uint16_t)
00283 #undef GET_NDARRAY
00284
/**
 * Generates CAsciiFile::fname(matrix, num_feat, num_vec), a reader for sparse
 * matrices stored one vector per line as "index:value" pairs.
 *
 * If `file` is NULL nothing is read and the outputs are left untouched.
 * Otherwise the file is scanned twice:
 *  1. all lines are counted (num_vec) and the longest line determines the
 *     size of the read buffer,
 *  2. every line is parsed: the number of ':' characters gives the number
 *     of entries (SG_ERROR if there is none), a leading token up to the
 *     first space is skipped unless a ':' is met first, and each
 *     "index:value" pair is stored with the index decremented by one and
 *     the value converted by `conv`.
 * num_feat ends up as the largest index found in the file.
 *
 * The result is a newly SG_MALLOC'ed array of num_vec SGSparseVector objects
 * whose `features` arrays are SG_MALLOC'ed as well.
 */
00285 #define GET_SPARSEMATRIX(fname, conv, sg_type) \
00286 void CAsciiFile::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
00287 { \
00288 size_t blocksize=1024*1024; \
00289 size_t required_blocksize=blocksize; \
00290 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize); \
00291 \
00292 if (file) \
00293 { \
00294 num_vec=0; \
00295 num_feat=0; \
00296 \
00297 SG_INFO("counting line numbers in file %s\n", filename); \
00298 size_t sz=blocksize; \
00299 size_t block_offs=0; \
00300 size_t old_block_offs=0; \
00301 fseek(file, 0, SEEK_END); \
00302 size_t fsize=ftell(file); \
00303 rewind(file); \
00304 \
/* pass 1: count the lines and track the longest line (plus one byte) */ \
00305 while (sz == blocksize) \
00306 { \
00307 sz=fread(dummy, sizeof(uint8_t), blocksize, file); \
00308 for (size_t i=0; i<sz; i++) \
00309 { \
00310 block_offs++; \
00311 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) \
00312 { \
00313 num_vec++; \
00314 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); \
00315 old_block_offs=block_offs; \
00316 } \
00317 } \
00318 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); \
00319 } \
00320 \
00321 SG_INFO("found %d feature vectors\n", num_vec); \
00322 SG_FREE(dummy); \
00323 blocksize=required_blocksize; \
00324 dummy = SG_MALLOC(uint8_t, blocksize+1); \
00325 matrix=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \
/* the array comes from SG_MALLOC, so each element is constructed in place */ \
00326 for (int i=0; i<num_vec; i++) \
00327 new (&matrix[i]) SGSparseVector<sg_type>(); \
00328 rewind(file); \
00329 sz=blocksize; \
00330 int32_t lines=0; \
/* pass 2: parse the file block by block */ \
00331 while (sz == blocksize) \
00332 { \
00333 sz=fread(dummy, sizeof(uint8_t), blocksize, file); \
00334 \
00335 size_t old_sz=0; \
00336 for (size_t i=0; i<sz; i++) \
00337 { \
/* a full block ends inside a line: move the partial line to the start */ \
/* of the buffer, refill the remainder and restart scanning at index 0 */ \
00338 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) \
00339 { \
00340 size_t len=i-old_sz+1; \
00341 uint8_t* data=&dummy[old_sz]; \
00342 \
00343 for (size_t j=0; j<len; j++) \
00344 dummy[j]=data[j]; \
00345 \
00346 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file); \
00347 i=0; \
00348 old_sz=0; \
00349 sz+=len; \
00350 } \
00351 \
00352 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) \
00353 { \
00354 \
00355 size_t len=i-old_sz; \
00356 uint8_t* data=&dummy[old_sz]; \
00357 \
/* the number of ':' characters is the number of entries on this line */ \
00358 int32_t dims=0; \
00359 for (size_t j=0; j<len; j++) \
00360 { \
00361 if (data[j]==':') \
00362 dims++; \
00363 } \
00364 \
00365 if (dims<=0) \
00366 { \
00367 SG_ERROR("Error in line %d - number of" \
00368 " dimensions is %d line is %d characters" \
00369 " long\n line_content:'%.*s'\n", lines, \
00370 dims, len, len, (const char*) data); \
00371 } \
00372 \
00373 SGSparseVectorEntry<sg_type>* feat=SG_MALLOC(SGSparseVectorEntry<sg_type>, dims); \
00374 \
00375 \
/* skip a leading token that ends at the first space; if a ':' shows up */ \
/* first, j is set to -1 so that the j++ below wraps it (size_t) to 0 */ \
/* and parsing starts at the beginning of the line */ \
00376 size_t j=0; \
00377 for (; j<len; j++) \
00378 { \
00379 if (data[j]==':') \
00380 { \
00381 j=-1; \
00382 break; \
00383 } \
00384 \
00385 if (data[j]==' ') \
00386 { \
00387 data[j]='\0'; \
00388 \
00389 \
00390 break; \
00391 } \
00392 } \
00393 \
00394 int32_t d=0; \
00395 j++; \
00396 uint8_t* start=&data[j]; \
00397 for (; j<len; j++) \
00398 { \
00399 if (data[j]==':') \
00400 { \
00401 data[j]='\0'; \
00402 \
/* indices are stored decremented by one relative to the file */ \
00403 feat[d].feat_index=(int32_t) atoi((const char*) start)-1; \
00404 num_feat=CMath::max(num_feat, feat[d].feat_index+1); \
00405 \
00406 j++; \
00407 start=&data[j]; \
00408 for (; j<len; j++) \
00409 { \
00410 if (data[j]==' ' || data[j]=='\n') \
00411 { \
00412 data[j]='\0'; \
00413 feat[d].entry=(sg_type) conv((const char*) start); \
00414 d++; \
00415 break; \
00416 } \
00417 } \
00418 \
/* the value ran up to the end of the line: terminate it there and */ \
/* store it in the last entry */ \
00419 if (j==len) \
00420 { \
00421 data[j]='\0'; \
00422 feat[dims-1].entry=(sg_type) conv((const char*) start); \
00423 } \
00424 \
00425 j++; \
00426 start=&data[j]; \
00427 } \
00428 } \
00429 \
00430 matrix[lines].num_feat_entries=dims; \
00431 matrix[lines].features=feat; \
00432 \
00433 old_sz=i+1; \
00434 lines++; \
00435 SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t"); \
00436 } \
00437 } \
00438 } \
00439 \
00440 SG_INFO("file successfully read\n"); \
00441 } \
00442 \
00443 SG_FREE(dummy); \
00444 }
00445
00446 GET_SPARSEMATRIX(get_sparse_matrix, atoi, bool)
00447 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint8_t)
00448 GET_SPARSEMATRIX(get_int8_sparsematrix, atoi, int8_t)
00449 GET_SPARSEMATRIX(get_sparse_matrix, atoi, char)
00450 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int32_t)
00451 GET_SPARSEMATRIX(get_uint_sparsematrix, atoi, uint32_t)
00452 GET_SPARSEMATRIX(get_long_sparsematrix, atoll, int64_t)
00453 GET_SPARSEMATRIX(get_ulong_sparsematrix, atoll, uint64_t)
00454 GET_SPARSEMATRIX(get_sparse_matrix, atof, float32_t)
00455 GET_SPARSEMATRIX(get_sparse_matrix, atof, float64_t)
00456 GET_SPARSEMATRIX(get_longreal_sparsematrix, atof, floatmax_t)
00457 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int16_t)
00458 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint16_t)
00459 #undef GET_SPARSEMATRIX
00460
00461
00462 void CAsciiFile::get_string_list(SGString<uint8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00463 {
00464 size_t blocksize=1024*1024;
00465 size_t required_blocksize=0;
00466 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00467 uint8_t* overflow=NULL;
00468 int32_t overflow_len=0;
00469
00470 if (file)
00471 {
00472 num_str=0;
00473 max_string_len=0;
00474
00475 SG_INFO("counting line numbers in file %s\n", filename);
00476 size_t sz=blocksize;
00477 size_t block_offs=0;
00478 size_t old_block_offs=0;
00479 fseek(file, 0, SEEK_END);
00480 size_t fsize=ftell(file);
00481 rewind(file);
00482
00483 while (sz == blocksize)
00484 {
00485 sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00486 for (size_t i=0; i<sz; i++)
00487 {
00488 block_offs++;
00489 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00490 {
00491 num_str++;
00492 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00493 old_block_offs=block_offs;
00494 }
00495 }
00496 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00497 }
00498
00499 SG_INFO("found %d strings\n", num_str);
00500 SG_DEBUG("block_size=%d\n", required_blocksize);
00501 SG_FREE(dummy);
00502 blocksize=required_blocksize;
00503 dummy=SG_MALLOC(uint8_t, blocksize);
00504 overflow=SG_MALLOC(uint8_t, blocksize);
00505 strings=SG_MALLOC(SGString<uint8_t>, num_str);
00506
00507 rewind(file);
00508 sz=blocksize;
00509 int32_t lines=0;
00510 size_t old_sz=0;
00511 while (sz == blocksize)
00512 {
00513 sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00514
00515 old_sz=0;
00516 for (size_t i=0; i<sz; i++)
00517 {
00518 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00519 {
00520 int32_t len=i-old_sz;
00521 max_string_len=CMath::max(max_string_len, len+overflow_len);
00522
00523 strings[lines].slen=len+overflow_len;
00524 strings[lines].string=SG_MALLOC(uint8_t, len+overflow_len);
00525
00526 for (int32_t j=0; j<overflow_len; j++)
00527 strings[lines].string[j]=overflow[j];
00528 for (int32_t j=0; j<len; j++)
00529 strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00530
00531
00532 overflow_len=0;
00533
00534
00535 old_sz=i+1;
00536 lines++;
00537 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00538 }
00539 }
00540
00541 for (size_t i=old_sz; i<sz; i++)
00542 overflow[i-old_sz]=dummy[i];
00543
00544 overflow_len=sz-old_sz;
00545 }
00546 SG_INFO("file successfully read\n");
00547 SG_INFO("max_string_length=%d\n", max_string_len);
00548 SG_INFO("num_strings=%d\n", num_str);
00549 }
00550
00551 SG_FREE(dummy);
00552 SG_FREE(overflow);
00553 }
00554
00555 void CAsciiFile::get_int8_string_list(SGString<int8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00556 {
00557 size_t blocksize=1024*1024;
00558 size_t required_blocksize=0;
00559 int8_t* dummy=SG_MALLOC(int8_t, blocksize);
00560 int8_t* overflow=NULL;
00561 int32_t overflow_len=0;
00562
00563 if (file)
00564 {
00565 num_str=0;
00566 max_string_len=0;
00567
00568 SG_INFO("counting line numbers in file %s\n", filename);
00569 size_t sz=blocksize;
00570 size_t block_offs=0;
00571 size_t old_block_offs=0;
00572 fseek(file, 0, SEEK_END);
00573 size_t fsize=ftell(file);
00574 rewind(file);
00575
00576 while (sz == blocksize)
00577 {
00578 sz=fread(dummy, sizeof(int8_t), blocksize, file);
00579 for (size_t i=0; i<sz; i++)
00580 {
00581 block_offs++;
00582 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00583 {
00584 num_str++;
00585 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00586 old_block_offs=block_offs;
00587 }
00588 }
00589 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00590 }
00591
00592 SG_INFO("found %d strings\n", num_str);
00593 SG_DEBUG("block_size=%d\n", required_blocksize);
00594 SG_FREE(dummy);
00595 blocksize=required_blocksize;
00596 dummy=SG_MALLOC(int8_t, blocksize);
00597 overflow=SG_MALLOC(int8_t, blocksize);
00598 strings=SG_MALLOC(SGString<int8_t>, num_str);
00599
00600 rewind(file);
00601 sz=blocksize;
00602 int32_t lines=0;
00603 size_t old_sz=0;
00604 while (sz == blocksize)
00605 {
00606 sz=fread(dummy, sizeof(int8_t), blocksize, file);
00607
00608 old_sz=0;
00609 for (size_t i=0; i<sz; i++)
00610 {
00611 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00612 {
00613 int32_t len=i-old_sz;
00614 max_string_len=CMath::max(max_string_len, len+overflow_len);
00615
00616 strings[lines].slen=len+overflow_len;
00617 strings[lines].string=SG_MALLOC(int8_t, len+overflow_len);
00618
00619 for (int32_t j=0; j<overflow_len; j++)
00620 strings[lines].string[j]=overflow[j];
00621 for (int32_t j=0; j<len; j++)
00622 strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00623
00624
00625 overflow_len=0;
00626
00627
00628 old_sz=i+1;
00629 lines++;
00630 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00631 }
00632 }
00633
00634 for (size_t i=old_sz; i<sz; i++)
00635 overflow[i-old_sz]=dummy[i];
00636
00637 overflow_len=sz-old_sz;
00638 }
00639 SG_INFO("file successfully read\n");
00640 SG_INFO("max_string_length=%d\n", max_string_len);
00641 SG_INFO("num_strings=%d\n", num_str);
00642 }
00643
00644 SG_FREE(dummy);
00645 SG_FREE(overflow);
00646 }
00647
00648 void CAsciiFile::get_string_list(SGString<char>*& strings, int32_t& num_str, int32_t& max_string_len)
00649 {
00650 size_t blocksize=1024*1024;
00651 size_t required_blocksize=0;
00652 char* dummy=SG_MALLOC(char, blocksize);
00653 char* overflow=NULL;
00654 int32_t overflow_len=0;
00655
00656 if (file)
00657 {
00658 num_str=0;
00659 max_string_len=0;
00660
00661 SG_INFO("counting line numbers in file %s\n", filename);
00662 size_t sz=blocksize;
00663 size_t block_offs=0;
00664 size_t old_block_offs=0;
00665 fseek(file, 0, SEEK_END);
00666 size_t fsize=ftell(file);
00667 rewind(file);
00668
00669 while (sz == blocksize)
00670 {
00671 sz=fread(dummy, sizeof(char), blocksize, file);
00672 for (size_t i=0; i<sz; i++)
00673 {
00674 block_offs++;
00675 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00676 {
00677 num_str++;
00678 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00679 old_block_offs=block_offs;
00680 }
00681 }
00682 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00683 }
00684
00685 SG_INFO("found %d strings\n", num_str);
00686 SG_DEBUG("block_size=%d\n", required_blocksize);
00687 SG_FREE(dummy);
00688 blocksize=required_blocksize;
00689 dummy=SG_MALLOC(char, blocksize);
00690 overflow=SG_MALLOC(char, blocksize);
00691 strings=SG_MALLOC(SGString<char>, num_str);
00692
00693 rewind(file);
00694 sz=blocksize;
00695 int32_t lines=0;
00696 size_t old_sz=0;
00697 while (sz == blocksize)
00698 {
00699 sz=fread(dummy, sizeof(char), blocksize, file);
00700
00701 old_sz=0;
00702 for (size_t i=0; i<sz; i++)
00703 {
00704 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00705 {
00706 int32_t len=i-old_sz;
00707 max_string_len=CMath::max(max_string_len, len+overflow_len);
00708
00709 strings[lines].slen=len+overflow_len;
00710 strings[lines].string=SG_MALLOC(char, len+overflow_len);
00711
00712 for (int32_t j=0; j<overflow_len; j++)
00713 strings[lines].string[j]=overflow[j];
00714 for (int32_t j=0; j<len; j++)
00715 strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00716
00717
00718 overflow_len=0;
00719
00720
00721 old_sz=i+1;
00722 lines++;
00723 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00724 }
00725 }
00726
00727 for (size_t i=old_sz; i<sz; i++)
00728 overflow[i-old_sz]=dummy[i];
00729
00730 overflow_len=sz-old_sz;
00731 }
00732 SG_INFO("file successfully read\n");
00733 SG_INFO("max_string_length=%d\n", max_string_len);
00734 SG_INFO("num_strings=%d\n", num_str);
00735 }
00736
00737 SG_FREE(dummy);
00738 SG_FREE(overflow);
00739 }
00740
00741 void CAsciiFile::get_string_list(SGString<int32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00742 {
00743 strings=NULL;
00744 num_str=0;
00745 max_string_len=0;
00746 }
00747
00748 void CAsciiFile::get_uint_string_list(SGString<uint32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00749 {
00750 strings=NULL;
00751 num_str=0;
00752 max_string_len=0;
00753 }
00754
00755 void CAsciiFile::get_string_list(SGString<int16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00756 {
00757 strings=NULL;
00758 num_str=0;
00759 max_string_len=0;
00760 }
00761
00762 void CAsciiFile::get_string_list(SGString<uint16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00763 {
00764 strings=NULL;
00765 num_str=0;
00766 max_string_len=0;
00767 }
00768
00769 void CAsciiFile::get_long_string_list(SGString<int64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00770 {
00771 strings=NULL;
00772 num_str=0;
00773 max_string_len=0;
00774 }
00775
00776 void CAsciiFile::get_ulong_string_list(SGString<uint64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00777 {
00778 strings=NULL;
00779 num_str=0;
00780 max_string_len=0;
00781 }
00782
00783 void CAsciiFile::get_string_list(SGString<float32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00784 {
00785 strings=NULL;
00786 num_str=0;
00787 max_string_len=0;
00788 }
00789
00790 void CAsciiFile::get_string_list(SGString<float64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00791 {
00792 strings=NULL;
00793 num_str=0;
00794 max_string_len=0;
00795 }
00796
00797 void CAsciiFile::get_longreal_string_list(SGString<floatmax_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00798 {
00799 strings=NULL;
00800 num_str=0;
00801 max_string_len=0;
00802 }
00803
00804
00807 #define SET_VECTOR(fname, mfname, sg_type) \
00808 void CAsciiFile::fname(const sg_type* vec, int32_t len) \
00809 { \
00810 mfname(vec, len, 1); \
00811 }
00812 SET_VECTOR(set_vector, set_int8_matrix, int8_t)
00813 SET_VECTOR(set_vector, set_matrix, uint8_t)
00814 SET_VECTOR(set_vector, set_matrix, char)
00815 SET_VECTOR(set_vector, set_matrix, int32_t)
00816 SET_VECTOR(set_vector, set_uint_matrix, uint32_t)
00817 SET_VECTOR(set_vector, set_matrix, float32_t)
00818 SET_VECTOR(set_vector, set_matrix, float64_t)
00819 SET_VECTOR(set_vector, set_longreal_matrix, floatmax_t)
00820 SET_VECTOR(set_vector, set_matrix, int16_t)
00821 SET_VECTOR(set_vector, set_matrix, uint16_t)
00822 SET_VECTOR(set_vector, set_long_matrix, int64_t)
00823 SET_VECTOR(set_vector, set_ulong_matrix, uint64_t)
00824 #undef SET_VECTOR
00825
00826 #define SET_MATRIX(fname, sg_type, fprt_type, type_str) \
00827 void CAsciiFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
00828 { \
00829 if (!(file && matrix)) \
00830 SG_ERROR("File or matrix invalid.\n"); \
00831 \
00832 for (int32_t i=0; i<num_vec; i++) \
00833 { \
00834 for (int32_t j=0; j<num_feat; j++) \
00835 { \
00836 sg_type v=matrix[num_feat*i+j]; \
00837 if (j==num_feat-1) \
00838 fprintf(file, type_str "\n", (fprt_type) v); \
00839 else \
00840 fprintf(file, type_str " ", (fprt_type) v); \
00841 } \
00842 } \
00843 }
00844 SET_MATRIX(set_matrix, char, char, "%c")
00845 SET_MATRIX(set_matrix, uint8_t, uint8_t, "%u")
00846 SET_MATRIX(set_int8_matrix, int8_t, int8_t, "%d")
00847 SET_MATRIX(set_matrix, int32_t, int32_t, "%i")
00848 SET_MATRIX(set_uint_matrix, uint32_t, uint32_t, "%u")
00849 SET_MATRIX(set_long_matrix, int64_t, long long int, "%lli")
00850 SET_MATRIX(set_ulong_matrix, uint64_t, long long unsigned int, "%llu")
00851 SET_MATRIX(set_matrix, int16_t, int16_t, "%i")
00852 SET_MATRIX(set_matrix, uint16_t, uint16_t, "%u")
00853 SET_MATRIX(set_matrix, float32_t, float32_t, "%.16g")
00854 SET_MATRIX(set_matrix, float64_t, float64_t, "%.16lg")
00855 SET_MATRIX(set_longreal_matrix, floatmax_t, floatmax_t, "%.16Lg")
00856 #undef SET_MATRIX
00857
00858 #define SET_NDARRAY(fname, sg_type, fprt_type, type_str) \
00859 void CAsciiFile::fname(const sg_type* array, int32_t * dims, int32_t num_dims) \
00860 { \
00861 if (!(file && array)) \
00862 SG_ERROR("File or data invalid.\n"); \
00863 \
00864 size_t total = 1; \
00865 for(int i = 0;i < num_dims;i++) \
00866 total *= dims[i]; \
00867 int32_t block_size = dims[num_dims-1]; \
00868 \
00869 fprintf(file,"%d ",num_dims); \
00870 for(int i = 0;i < num_dims;i++) \
00871 fprintf(file,"%d ",dims[i]); \
00872 fprintf(file,"\n"); \
00873 \
00874 for (size_t i=0; i < total; i++) \
00875 { \
00876 sg_type v= array[i]; \
00877 if ( ((i+1) % block_size) == 0) \
00878 fprintf(file, type_str "\n", (fprt_type) v); \
00879 else \
00880 fprintf(file, type_str " ", (fprt_type) v); \
00881 } \
00882 }
00883
00884 SET_NDARRAY(set_ndarray, char, char, "%c")
00885 SET_NDARRAY(set_ndarray, uint8_t, uint8_t, "%u")
00886 SET_NDARRAY(set_int8_ndarray, int8_t, int8_t, "%d")
00887 SET_NDARRAY(set_ndarray, int32_t, int32_t, "%i")
00888 SET_NDARRAY(set_uint_ndarray, uint32_t, uint32_t, "%u")
00889 SET_NDARRAY(set_long_ndarray, int64_t, long long int, "%lli")
00890 SET_NDARRAY(set_ulong_ndarray, uint64_t, long long unsigned int, "%llu")
00891 SET_NDARRAY(set_ndarray, int16_t, int16_t, "%i")
00892 SET_NDARRAY(set_ndarray, uint16_t, uint16_t, "%u")
00893 SET_NDARRAY(set_ndarray, float32_t, float32_t, "%f")
00894 SET_NDARRAY(set_ndarray, float64_t, float64_t, "%f")
00895 SET_NDARRAY(set_longreal_ndarray, floatmax_t, floatmax_t, "%Lf")
00896 #undef SET_NDARRAY
00897
00898 #define SET_SPARSEMATRIX(fname, sg_type, fprt_type, type_str) \
00899 void CAsciiFile::fname(const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
00900 { \
00901 if (!(file && matrix)) \
00902 SG_ERROR("File or matrix invalid.\n"); \
00903 \
00904 for (int32_t i=0; i<num_vec; i++) \
00905 { \
00906 SGSparseVectorEntry<sg_type>* vec = matrix[i].features; \
00907 int32_t len=matrix[i].num_feat_entries; \
00908 \
00909 for (int32_t j=0; j<len; j++) \
00910 { \
00911 if (j<len-1) \
00912 { \
00913 fprintf(file, "%d:" type_str " ", \
00914 (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry); \
00915 } \
00916 else \
00917 { \
00918 fprintf(file, "%d:" type_str "\n", \
00919 (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry); \
00920 } \
00921 } \
00922 } \
00923 }
00924 SET_SPARSEMATRIX(set_sparse_matrix, bool, uint8_t, "%u")
00925 SET_SPARSEMATRIX(set_sparse_matrix, char, char, "%c")
00926 SET_SPARSEMATRIX(set_sparse_matrix, uint8_t, uint8_t, "%u")
00927 SET_SPARSEMATRIX(set_int8_sparsematrix, int8_t, int8_t, "%d")
00928 SET_SPARSEMATRIX(set_sparse_matrix, int32_t, int32_t, "%i")
00929 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, uint32_t, "%u")
00930 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, long long int, "%lli")
00931 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, long long unsigned int, "%llu")
00932 SET_SPARSEMATRIX(set_sparse_matrix, int16_t, int16_t, "%i")
00933 SET_SPARSEMATRIX(set_sparse_matrix, uint16_t, uint16_t, "%u")
00934 SET_SPARSEMATRIX(set_sparse_matrix, float32_t, float32_t, "%f")
00935 SET_SPARSEMATRIX(set_sparse_matrix, float64_t, float64_t, "%f")
00936 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, floatmax_t, "%Lf")
00937 #undef SET_SPARSEMATRIX
00938
00939 void CAsciiFile::set_string_list(const SGString<uint8_t>* strings, int32_t num_str)
00940 {
00941 if (!(file && strings))
00942 SG_ERROR("File or strings invalid.\n");
00943
00944 for (int32_t i=0; i<num_str; i++)
00945 {
00946 int32_t len = strings[i].slen;
00947 fwrite(strings[i].string, sizeof(uint8_t), len, file);
00948 fprintf(file, "\n");
00949 }
00950 }
00951
00952 void CAsciiFile::set_int8_string_list(const SGString<int8_t>* strings, int32_t num_str)
00953 {
00954 if (!(file && strings))
00955 SG_ERROR("File or strings invalid.\n");
00956
00957 for (int32_t i=0; i<num_str; i++)
00958 {
00959 int32_t len = strings[i].slen;
00960 fwrite(strings[i].string, sizeof(int8_t), len, file);
00961 fprintf(file, "\n");
00962 }
00963 }
00964
00965 void CAsciiFile::set_string_list(const SGString<char>* strings, int32_t num_str)
00966 {
00967 if (!(file && strings))
00968 SG_ERROR("File or strings invalid.\n");
00969
00970 for (int32_t i=0; i<num_str; i++)
00971 {
00972 int32_t len = strings[i].slen;
00973 fwrite(strings[i].string, sizeof(char), len, file);
00974 fprintf(file, "\n");
00975 }
00976 }
00977
00978 void CAsciiFile::set_string_list(const SGString<int32_t>* strings, int32_t num_str)
00979 {
00980 }
00981
00982 void CAsciiFile::set_uint_string_list(const SGString<uint32_t>* strings, int32_t num_str)
00983 {
00984 }
00985
00986 void CAsciiFile::set_string_list(const SGString<int16_t>* strings, int32_t num_str)
00987 {
00988 }
00989
00990 void CAsciiFile::set_string_list(const SGString<uint16_t>* strings, int32_t num_str)
00991 {
00992 }
00993
00994 void CAsciiFile::set_long_string_list(const SGString<int64_t>* strings, int32_t num_str)
00995 {
00996 }
00997
00998 void CAsciiFile::set_ulong_string_list(const SGString<uint64_t>* strings, int32_t num_str)
00999 {
01000 }
01001
01002 void CAsciiFile::set_string_list(const SGString<float32_t>* strings, int32_t num_str)
01003 {
01004 }
01005
01006 void CAsciiFile::set_string_list(const SGString<float64_t>* strings, int32_t num_str)
01007 {
01008 }
01009
01010 void CAsciiFile::set_longreal_string_list(const SGString<floatmax_t>* strings, int32_t num_str)
01011 {
01012 }
01013
01014 template <class T> void CAsciiFile::append_item(
01015 DynArray<T>* items, char* ptr_data, char* ptr_item)
01016 {
01017 size_t len=(ptr_data-ptr_item)/sizeof(char);
01018 char* item=SG_MALLOC(char, len+1);
01019 memset(item, 0, sizeof(char)*(len+1));
01020 item=strncpy(item, ptr_item, len);
01021
01022 SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
01023 items->append_element(item);
01024 }
01025
01026 #if defined(__MACH__) || defined(FREEBSD)
01027 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
01028 {
01029 int32_t total_bytes_read=0;
01030 int32_t default_size=10;
01031
01032 if ((lineptr == NULL) || (n == NULL) || (stream == NULL))
01033 return -1;
01034
01035 if ((*lineptr == NULL) && (*n == 0))
01036 {
01037 *lineptr=SG_MALLOC(char, default_size);
01038 *n=default_size;
01039 }
01040
01041 int32_t bytes_read, pos=-1;
01042 size_t threshold_size=100000;
01043
01044 while (1)
01045 {
01046
01047 if (*n > threshold_size)
01048 return -1;
01049
01050
01051 bytes_read=fread(*lineptr+total_bytes_read, sizeof(char), *n-total_bytes_read, stream);
01052
01053 for (int i=0; i<bytes_read; i++)
01054 {
01055 if ((*lineptr)[total_bytes_read+i] == delimiter)
01056 {
01057 pos=i;
01058 break;
01059 }
01060 }
01061
01062 if (pos==-1)
01063 {
01064 if (feof(stream))
01065 return -1;
01066 total_bytes_read+=bytes_read;
01067 *lineptr=SG_REALLOC(char, *lineptr, (*n)*2);
01068 *n=(*n)*2;
01069
01070 }
01071 else
01072 {
01073 total_bytes_read+=pos+1;
01074 (*lineptr)[total_bytes_read]='\0';
01075
01076 fseek(stream, (bytes_read-pos-1) * -1, SEEK_CUR);
01077 return total_bytes_read;
01078 }
01079 }
01080 }
01081
01082 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
01083 {
01084 return getdelim(lineptr, n, '\n', stream);
01085 }
01086
01087 #else
01088 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
01089 {
01090 return ::getdelim(lineptr, n, delimiter, stream);
01091 }
01092
01093 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
01094 {
01095 return ::getline(lineptr, n, stream);
01096 }
01097 #endif
01098
01099 void CAsciiFile::tokenize(char delim, substring s, v_array<substring>& ret)
01100 {
01101 ret.erase();
01102 char *last = s.start;
01103 for (; s.start != s.end; s.start++)
01104 {
01105 if (*s.start == delim)
01106 {
01107 if (s.start != last)
01108 {
01109 substring temp = {last,s.start};
01110 ret.push(temp);
01111 }
01112 last = s.start+1;
01113 }
01114 }
01115 if (s.start != last)
01116 {
01117 substring final = {last, s.start};
01118 ret.push(final);
01119 }
01120 }