00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include <shogun/features/SparseFeatures.h>
00016 #include <shogun/io/File.h>
00017 #include <shogun/io/AsciiFile.h>
00018 #include <shogun/mathematics/Math.h>
00019 #include <ctype.h>
00020 #include <stdio.h>
00021
00022 using namespace shogun;
00023
00024 CAsciiFile::CAsciiFile()
00025 {
00026 SG_UNSTABLE("CAsciiFile::CAsciiFile()", "\n");
00027 }
00028
00029 CAsciiFile::CAsciiFile(FILE* f, const char* name) : CFile(f, name)
00030 {
00031 }
00032
00033 CAsciiFile::CAsciiFile(char* fname, char rw, const char* name) : CFile(fname, rw, name)
00034 {
00035 }
00036
00037 CAsciiFile::~CAsciiFile()
00038 {
00039 }
00040
00041 #define GET_VECTOR(fname, mfname, sg_type) \
00042 void CAsciiFile::fname(sg_type*& vec, int32_t& len) \
00043 { \
00044 vec=NULL; \
00045 len=0; \
00046 int32_t num_feat=0; \
00047 int32_t num_vec=0; \
00048 mfname(vec, num_feat, num_vec); \
00049 if ((num_feat==1) || (num_vec==1)) \
00050 { \
00051 if (num_feat==1) \
00052 len=num_vec; \
00053 else \
00054 len=num_feat; \
00055 } \
00056 else \
00057 { \
00058 SG_FREE(vec); \
00059 vec=NULL; \
00060 len=0; \
00061 SG_ERROR("Could not read vector from" \
00062 " file %s (shape %dx%d found but " \
00063 "vector expected).\n", filename, \
00064 num_vec, num_feat); \
00065 } \
00066 }
00067
00068 GET_VECTOR(get_vector, get_matrix, uint8_t)
00069 GET_VECTOR(get_vector, get_matrix, char)
00070 GET_VECTOR(get_vector, get_matrix, int32_t)
00071 GET_VECTOR(get_vector, get_matrix, float32_t)
00072 GET_VECTOR(get_vector, get_matrix, float64_t)
00073 GET_VECTOR(get_vector, get_matrix, int16_t)
00074 GET_VECTOR(get_vector, get_matrix, uint16_t)
00075 #undef GET_VECTOR
00076
00077 #define GET_MATRIX(fname, conv, sg_type) \
00078 void CAsciiFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
00079 { \
00080 struct stat stats; \
00081 if (stat(filename, &stats)!=0) \
00082 SG_ERROR("Could not get file statistics.\n"); \
00083 \
00084 char* data=SG_MALLOC(char, stats.st_size+1); \
00085 memset(data, 0, sizeof(char)*(stats.st_size+1)); \
00086 size_t nread=fread(data, sizeof(char), stats.st_size, file); \
00087 if (nread<=0) \
00088 SG_ERROR("Could not read data from %s.\n", filename); \
00089 \
00090 SG_DEBUG("data read from file:\n%s\n", data); \
00091 \
00092 \
00093 int32_t nf=0; \
00094 num_feat=0; \
00095 num_vec=0; \
00096 char* ptr_item=NULL; \
00097 char* ptr_data=data; \
00098 DynArray<char*>* items=new DynArray<char*>(); \
00099 \
00100 while (*ptr_data) \
00101 { \
00102 if (*ptr_data=='\n') \
00103 { \
00104 if (ptr_item) \
00105 nf++; \
00106 \
00107 if (num_feat!=0 && nf!=num_feat) \
00108 SG_ERROR("Number of features mismatches (%d != %d) in vector" \
00109 " %d in file %s.\n", num_feat, nf, num_vec, filename); \
00110 \
00111 append_item(items, ptr_data, ptr_item); \
00112 num_feat=nf; \
00113 num_vec++; \
00114 nf=0; \
00115 ptr_item=NULL; \
00116 } \
00117 else if (!isblank(*ptr_data) && !ptr_item) \
00118 { \
00119 ptr_item=ptr_data; \
00120 } \
00121 else if (isblank(*ptr_data) && ptr_item) \
00122 { \
00123 append_item(items, ptr_data, ptr_item); \
00124 ptr_item=NULL; \
00125 nf++; \
00126 } \
00127 \
00128 ptr_data++; \
00129 } \
00130 \
00131 SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec); \
00132 SG_FREE(data); \
00133 \
00134 \
00135 matrix=SG_MALLOC(sg_type, num_vec*num_feat); \
00136 for (int32_t i=0; i<num_vec; i++) \
00137 { \
00138 for (int32_t j=0; j<num_feat; j++) \
00139 { \
00140 char* item=items->get_element(i*num_feat+j); \
00141 matrix[i*num_feat+j]=conv(item); \
00142 SG_FREE(item); \
00143 } \
00144 } \
00145 delete items; \
00146 }
00147
00148 GET_MATRIX(get_matrix, atoi, uint8_t)
00149 GET_MATRIX(get_int8_matrix, atoi, int8_t)
00150 GET_MATRIX(get_matrix, atoi, char)
00151 GET_MATRIX(get_matrix, atoi, int32_t)
00152 GET_MATRIX(get_uint_matrix, atoi, uint32_t)
00153 GET_MATRIX(get_long_matrix, atoll, int64_t)
00154 GET_MATRIX(get_ulong_matrix, atoll, uint64_t)
00155 GET_MATRIX(get_matrix, atof, float32_t)
00156 GET_MATRIX(get_matrix, atof, float64_t)
00157 GET_MATRIX(get_longreal_matrix, atof, floatmax_t)
00158 GET_MATRIX(get_matrix, atoi, int16_t)
00159 GET_MATRIX(get_matrix, atoi, uint16_t)
00160 #undef GET_MATRIX
00161
00162 #define GET_NDARRAY(fname, conv, sg_type) \
00163 void CAsciiFile::fname(sg_type*& array, int32_t *& dims, int32_t & num_dims) \
00164 { \
00165 struct stat stats; \
00166 if (stat(filename, &stats)!=0) \
00167 SG_ERROR("Could not get file statistics.\n"); \
00168 \
00169 char* data=SG_MALLOC(char, stats.st_size+1); \
00170 memset(data, 0, sizeof(char)*(stats.st_size+1)); \
00171 size_t nread=fread(data, sizeof(char), stats.st_size, file); \
00172 if (nread<=0) \
00173 SG_ERROR("Could not read data from %s.\n", filename); \
00174 \
00175 SG_DEBUG("data read from file:\n%s\n", data); \
00176 \
00177 \
00178 int32_t length=0; \
00179 int32_t counter=0; \
00180 size_t total=0; \
00181 num_dims = -1; \
00182 char* ptr_item=NULL; \
00183 char* ptr_data=data; \
00184 DynArray<char*>* items=new DynArray<char*>(); \
00185 \
00186 \
00187 while(*ptr_data != '\n') \
00188 { \
00189 if(isblank(*ptr_data) && ptr_item) \
00190 { \
00191 append_item(items, ptr_data, ptr_item); \
00192 num_dims++; \
00193 ptr_item = NULL; \
00194 } \
00195 else if(!isblank(*ptr_data) && !ptr_item) \
00196 ptr_item = ptr_data; \
00197 \
00198 ptr_data++; \
00199 } \
00200 ptr_item = NULL; \
00201 ptr_data++; \
00202 \
00203 \
00204 while(*ptr_data) \
00205 { \
00206 if (*ptr_data=='\n') \
00207 { \
00208 if (ptr_item) \
00209 counter++; \
00210 \
00211 if (length!=0 && counter!=length) \
00212 SG_ERROR("Invalid number of data (%d != %d) in line" \
00213 " %d in file %s.\n", length, counter, total, filename); \
00214 \
00215 append_item(items, ptr_data, ptr_item); \
00216 length=counter; \
00217 total++; \
00218 counter=0; \
00219 ptr_item=NULL; \
00220 } \
00221 else if (!isblank(*ptr_data) && !ptr_item) \
00222 { \
00223 ptr_item=ptr_data; \
00224 } \
00225 else if (isblank(*ptr_data) && ptr_item) \
00226 { \
00227 append_item(items, ptr_data, ptr_item); \
00228 ptr_item=NULL; \
00229 counter++; \
00230 } \
00231 \
00232 ptr_data++; \
00233 } \
00234 \
00235 SG_DEBUG("num of data in line: %d, num of lines %d\n", counter, total); \
00236 SG_FREE(data); \
00237 \
00238 \
00239 char * item; \
00240 item=items->get_element(0); \
00241 if(atoi(item) != num_dims) \
00242 SG_ERROR("Invalid number of dimensions!\n"); \
00243 SG_FREE(item); \
00244 dims = SG_MALLOC(int32_t, num_dims); \
00245 for(int32_t i =0;i < num_dims;i++) \
00246 { \
00247 item = items->get_element(i+1); \
00248 dims[i] = atoi(item); \
00249 SG_FREE(item); \
00250 } \
00251 if (dims[num_dims-1] != length) \
00252 SG_ERROR("Invalid number of lines in file!\n"); \
00253 \
00254 \
00255 total *= length; \
00256 array=SG_MALLOC(sg_type, total); \
00257 for (size_t i=0; i<total; i++) \
00258 { \
00259 item=items->get_element(i+(num_dims+1)); \
00260 array[i]=conv(item); \
00261 SG_FREE(item); \
00262 } \
00263 delete items; \
00264 }
00265
00266 GET_NDARRAY(get_ndarray, atoi, uint8_t)
00267 GET_NDARRAY(get_int8_ndarray, atoi, int8_t)
00268 GET_NDARRAY(get_ndarray, atoi, char)
00269 GET_NDARRAY(get_ndarray, atoi, int32_t)
00270 GET_NDARRAY(get_uint_ndarray, atoi, uint32_t)
00271 GET_NDARRAY(get_long_ndarray, atoll, int64_t)
00272 GET_NDARRAY(get_ulong_ndarray, atoll, uint64_t)
00273 GET_NDARRAY(get_ndarray, atof, float32_t)
00274 GET_NDARRAY(get_ndarray, atof, float64_t)
00275 GET_NDARRAY(get_longreal_ndarray, atof, floatmax_t)
00276 GET_NDARRAY(get_ndarray, atoi, int16_t)
00277 GET_NDARRAY(get_ndarray, atoi, uint16_t)
00278 #undef GET_NDARRAY
00279
00280 #define GET_SPARSEMATRIX(fname, conv, sg_type) \
00281 void CAsciiFile::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
00282 { \
00283 size_t blocksize=1024*1024; \
00284 size_t required_blocksize=blocksize; \
00285 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize); \
00286 \
00287 if (file) \
00288 { \
00289 num_vec=0; \
00290 num_feat=0; \
00291 \
00292 SG_INFO("counting line numbers in file %s\n", filename); \
00293 size_t sz=blocksize; \
00294 size_t block_offs=0; \
00295 size_t old_block_offs=0; \
00296 fseek(file, 0, SEEK_END); \
00297 size_t fsize=ftell(file); \
00298 rewind(file); \
00299 \
00300 while (sz == blocksize) \
00301 { \
00302 sz=fread(dummy, sizeof(uint8_t), blocksize, file); \
00303 for (size_t i=0; i<sz; i++) \
00304 { \
00305 block_offs++; \
00306 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) \
00307 { \
00308 num_vec++; \
00309 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); \
00310 old_block_offs=block_offs; \
00311 } \
00312 } \
00313 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); \
00314 } \
00315 \
00316 SG_INFO("found %d feature vectors\n", num_vec); \
00317 SG_FREE(dummy); \
00318 blocksize=required_blocksize; \
00319 dummy = SG_MALLOC(uint8_t, blocksize+1); \
00320 matrix=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \
00321 \
00322 rewind(file); \
00323 sz=blocksize; \
00324 int32_t lines=0; \
00325 while (sz == blocksize) \
00326 { \
00327 sz=fread(dummy, sizeof(uint8_t), blocksize, file); \
00328 \
00329 size_t old_sz=0; \
00330 for (size_t i=0; i<sz; i++) \
00331 { \
00332 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) \
00333 { \
00334 size_t len=i-old_sz+1; \
00335 uint8_t* data=&dummy[old_sz]; \
00336 \
00337 for (size_t j=0; j<len; j++) \
00338 dummy[j]=data[j]; \
00339 \
00340 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file); \
00341 i=0; \
00342 old_sz=0; \
00343 sz+=len; \
00344 } \
00345 \
00346 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) \
00347 { \
00348 \
00349 size_t len=i-old_sz; \
00350 uint8_t* data=&dummy[old_sz]; \
00351 \
00352 int32_t dims=0; \
00353 for (size_t j=0; j<len; j++) \
00354 { \
00355 if (data[j]==':') \
00356 dims++; \
00357 } \
00358 \
00359 if (dims<=0) \
00360 { \
00361 SG_ERROR("Error in line %d - number of" \
00362 " dimensions is %d line is %d characters" \
00363 " long\n line_content:'%.*s'\n", lines, \
00364 dims, len, len, (const char*) data); \
00365 } \
00366 \
00367 SGSparseVectorEntry<sg_type>* feat=SG_MALLOC(SGSparseVectorEntry<sg_type>, dims); \
00368 \
00369 \
00370 size_t j=0; \
00371 for (; j<len; j++) \
00372 { \
00373 if (data[j]==':') \
00374 { \
00375 j=-1; \
00376 break; \
00377 } \
00378 \
00379 if (data[j]==' ') \
00380 { \
00381 data[j]='\0'; \
00382 \
00383 \
00384 break; \
00385 } \
00386 } \
00387 \
00388 int32_t d=0; \
00389 j++; \
00390 uint8_t* start=&data[j]; \
00391 for (; j<len; j++) \
00392 { \
00393 if (data[j]==':') \
00394 { \
00395 data[j]='\0'; \
00396 \
00397 feat[d].feat_index=(int32_t) atoi((const char*) start)-1; \
00398 num_feat=CMath::max(num_feat, feat[d].feat_index+1); \
00399 \
00400 j++; \
00401 start=&data[j]; \
00402 for (; j<len; j++) \
00403 { \
00404 if (data[j]==' ' || data[j]=='\n') \
00405 { \
00406 data[j]='\0'; \
00407 feat[d].entry=(sg_type) conv((const char*) start); \
00408 d++; \
00409 break; \
00410 } \
00411 } \
00412 \
00413 if (j==len) \
00414 { \
00415 data[j]='\0'; \
00416 feat[dims-1].entry=(sg_type) conv((const char*) start); \
00417 } \
00418 \
00419 j++; \
00420 start=&data[j]; \
00421 } \
00422 } \
00423 \
00424 matrix[lines].vec_index=lines; \
00425 matrix[lines].num_feat_entries=dims; \
00426 matrix[lines].features=feat; \
00427 \
00428 old_sz=i+1; \
00429 lines++; \
00430 SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t"); \
00431 } \
00432 } \
00433 } \
00434 \
00435 SG_INFO("file successfully read\n"); \
00436 } \
00437 \
00438 SG_FREE(dummy); \
00439 }
00440
00441 GET_SPARSEMATRIX(get_sparse_matrix, atoi, bool)
00442 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint8_t)
00443 GET_SPARSEMATRIX(get_int8_sparsematrix, atoi, int8_t)
00444 GET_SPARSEMATRIX(get_sparse_matrix, atoi, char)
00445 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int32_t)
00446 GET_SPARSEMATRIX(get_uint_sparsematrix, atoi, uint32_t)
00447 GET_SPARSEMATRIX(get_long_sparsematrix, atoll, int64_t)
00448 GET_SPARSEMATRIX(get_ulong_sparsematrix, atoll, uint64_t)
00449 GET_SPARSEMATRIX(get_sparse_matrix, atof, float32_t)
00450 GET_SPARSEMATRIX(get_sparse_matrix, atof, float64_t)
00451 GET_SPARSEMATRIX(get_longreal_sparsematrix, atof, floatmax_t)
00452 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int16_t)
00453 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint16_t)
00454 #undef GET_SPARSEMATRIX
00455
00456
00457 void CAsciiFile::get_string_list(SGString<uint8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00458 {
00459 size_t blocksize=1024*1024;
00460 size_t required_blocksize=0;
00461 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00462 uint8_t* overflow=NULL;
00463 int32_t overflow_len=0;
00464
00465 if (file)
00466 {
00467 num_str=0;
00468 max_string_len=0;
00469
00470 SG_INFO("counting line numbers in file %s\n", filename);
00471 size_t sz=blocksize;
00472 size_t block_offs=0;
00473 size_t old_block_offs=0;
00474 fseek(file, 0, SEEK_END);
00475 size_t fsize=ftell(file);
00476 rewind(file);
00477
00478 while (sz == blocksize)
00479 {
00480 sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00481 for (size_t i=0; i<sz; i++)
00482 {
00483 block_offs++;
00484 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00485 {
00486 num_str++;
00487 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00488 old_block_offs=block_offs;
00489 }
00490 }
00491 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00492 }
00493
00494 SG_INFO("found %d strings\n", num_str);
00495 SG_DEBUG("block_size=%d\n", required_blocksize);
00496 SG_FREE(dummy);
00497 blocksize=required_blocksize;
00498 dummy=SG_MALLOC(uint8_t, blocksize);
00499 overflow=SG_MALLOC(uint8_t, blocksize);
00500 strings=SG_MALLOC(SGString<uint8_t>, num_str);
00501
00502 rewind(file);
00503 sz=blocksize;
00504 int32_t lines=0;
00505 size_t old_sz=0;
00506 while (sz == blocksize)
00507 {
00508 sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00509
00510 old_sz=0;
00511 for (size_t i=0; i<sz; i++)
00512 {
00513 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00514 {
00515 int32_t len=i-old_sz;
00516 max_string_len=CMath::max(max_string_len, len+overflow_len);
00517
00518 strings[lines].slen=len+overflow_len;
00519 strings[lines].string=SG_MALLOC(uint8_t, len+overflow_len);
00520
00521 for (int32_t j=0; j<overflow_len; j++)
00522 strings[lines].string[j]=overflow[j];
00523 for (int32_t j=0; j<len; j++)
00524 strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00525
00526
00527 overflow_len=0;
00528
00529
00530 old_sz=i+1;
00531 lines++;
00532 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00533 }
00534 }
00535
00536 for (size_t i=old_sz; i<sz; i++)
00537 overflow[i-old_sz]=dummy[i];
00538
00539 overflow_len=sz-old_sz;
00540 }
00541 SG_INFO("file successfully read\n");
00542 SG_INFO("max_string_length=%d\n", max_string_len);
00543 SG_INFO("num_strings=%d\n", num_str);
00544 }
00545
00546 SG_FREE(dummy);
00547 SG_FREE(overflow);
00548 }
00549
00550 void CAsciiFile::get_int8_string_list(SGString<int8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00551 {
00552 size_t blocksize=1024*1024;
00553 size_t required_blocksize=0;
00554 int8_t* dummy=SG_MALLOC(int8_t, blocksize);
00555 int8_t* overflow=NULL;
00556 int32_t overflow_len=0;
00557
00558 if (file)
00559 {
00560 num_str=0;
00561 max_string_len=0;
00562
00563 SG_INFO("counting line numbers in file %s\n", filename);
00564 size_t sz=blocksize;
00565 size_t block_offs=0;
00566 size_t old_block_offs=0;
00567 fseek(file, 0, SEEK_END);
00568 size_t fsize=ftell(file);
00569 rewind(file);
00570
00571 while (sz == blocksize)
00572 {
00573 sz=fread(dummy, sizeof(int8_t), blocksize, file);
00574 for (size_t i=0; i<sz; i++)
00575 {
00576 block_offs++;
00577 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00578 {
00579 num_str++;
00580 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00581 old_block_offs=block_offs;
00582 }
00583 }
00584 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00585 }
00586
00587 SG_INFO("found %d strings\n", num_str);
00588 SG_DEBUG("block_size=%d\n", required_blocksize);
00589 SG_FREE(dummy);
00590 blocksize=required_blocksize;
00591 dummy=SG_MALLOC(int8_t, blocksize);
00592 overflow=SG_MALLOC(int8_t, blocksize);
00593 strings=SG_MALLOC(SGString<int8_t>, num_str);
00594
00595 rewind(file);
00596 sz=blocksize;
00597 int32_t lines=0;
00598 size_t old_sz=0;
00599 while (sz == blocksize)
00600 {
00601 sz=fread(dummy, sizeof(int8_t), blocksize, file);
00602
00603 old_sz=0;
00604 for (size_t i=0; i<sz; i++)
00605 {
00606 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00607 {
00608 int32_t len=i-old_sz;
00609 max_string_len=CMath::max(max_string_len, len+overflow_len);
00610
00611 strings[lines].slen=len+overflow_len;
00612 strings[lines].string=SG_MALLOC(int8_t, len+overflow_len);
00613
00614 for (int32_t j=0; j<overflow_len; j++)
00615 strings[lines].string[j]=overflow[j];
00616 for (int32_t j=0; j<len; j++)
00617 strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00618
00619
00620 overflow_len=0;
00621
00622
00623 old_sz=i+1;
00624 lines++;
00625 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00626 }
00627 }
00628
00629 for (size_t i=old_sz; i<sz; i++)
00630 overflow[i-old_sz]=dummy[i];
00631
00632 overflow_len=sz-old_sz;
00633 }
00634 SG_INFO("file successfully read\n");
00635 SG_INFO("max_string_length=%d\n", max_string_len);
00636 SG_INFO("num_strings=%d\n", num_str);
00637 }
00638
00639 SG_FREE(dummy);
00640 SG_FREE(overflow);
00641 }
00642
00643 void CAsciiFile::get_string_list(SGString<char>*& strings, int32_t& num_str, int32_t& max_string_len)
00644 {
00645 size_t blocksize=1024*1024;
00646 size_t required_blocksize=0;
00647 char* dummy=SG_MALLOC(char, blocksize);
00648 char* overflow=NULL;
00649 int32_t overflow_len=0;
00650
00651 if (file)
00652 {
00653 num_str=0;
00654 max_string_len=0;
00655
00656 SG_INFO("counting line numbers in file %s\n", filename);
00657 size_t sz=blocksize;
00658 size_t block_offs=0;
00659 size_t old_block_offs=0;
00660 fseek(file, 0, SEEK_END);
00661 size_t fsize=ftell(file);
00662 rewind(file);
00663
00664 while (sz == blocksize)
00665 {
00666 sz=fread(dummy, sizeof(char), blocksize, file);
00667 for (size_t i=0; i<sz; i++)
00668 {
00669 block_offs++;
00670 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00671 {
00672 num_str++;
00673 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00674 old_block_offs=block_offs;
00675 }
00676 }
00677 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00678 }
00679
00680 SG_INFO("found %d strings\n", num_str);
00681 SG_DEBUG("block_size=%d\n", required_blocksize);
00682 SG_FREE(dummy);
00683 blocksize=required_blocksize;
00684 dummy=SG_MALLOC(char, blocksize);
00685 overflow=SG_MALLOC(char, blocksize);
00686 strings=SG_MALLOC(SGString<char>, num_str);
00687
00688 rewind(file);
00689 sz=blocksize;
00690 int32_t lines=0;
00691 size_t old_sz=0;
00692 while (sz == blocksize)
00693 {
00694 sz=fread(dummy, sizeof(char), blocksize, file);
00695
00696 old_sz=0;
00697 for (size_t i=0; i<sz; i++)
00698 {
00699 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00700 {
00701 int32_t len=i-old_sz;
00702 max_string_len=CMath::max(max_string_len, len+overflow_len);
00703
00704 strings[lines].slen=len+overflow_len;
00705 strings[lines].string=SG_MALLOC(char, len+overflow_len);
00706
00707 for (int32_t j=0; j<overflow_len; j++)
00708 strings[lines].string[j]=overflow[j];
00709 for (int32_t j=0; j<len; j++)
00710 strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00711
00712
00713 overflow_len=0;
00714
00715
00716 old_sz=i+1;
00717 lines++;
00718 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00719 }
00720 }
00721
00722 for (size_t i=old_sz; i<sz; i++)
00723 overflow[i-old_sz]=dummy[i];
00724
00725 overflow_len=sz-old_sz;
00726 }
00727 SG_INFO("file successfully read\n");
00728 SG_INFO("max_string_length=%d\n", max_string_len);
00729 SG_INFO("num_strings=%d\n", num_str);
00730 }
00731
00732 SG_FREE(dummy);
00733 SG_FREE(overflow);
00734 }
00735
00736 void CAsciiFile::get_string_list(SGString<int32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00737 {
00738 strings=NULL;
00739 num_str=0;
00740 max_string_len=0;
00741 }
00742
00743 void CAsciiFile::get_uint_string_list(SGString<uint32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00744 {
00745 strings=NULL;
00746 num_str=0;
00747 max_string_len=0;
00748 }
00749
00750 void CAsciiFile::get_string_list(SGString<int16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00751 {
00752 strings=NULL;
00753 num_str=0;
00754 max_string_len=0;
00755 }
00756
00757 void CAsciiFile::get_string_list(SGString<uint16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00758 {
00759 strings=NULL;
00760 num_str=0;
00761 max_string_len=0;
00762 }
00763
00764 void CAsciiFile::get_long_string_list(SGString<int64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00765 {
00766 strings=NULL;
00767 num_str=0;
00768 max_string_len=0;
00769 }
00770
00771 void CAsciiFile::get_ulong_string_list(SGString<uint64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00772 {
00773 strings=NULL;
00774 num_str=0;
00775 max_string_len=0;
00776 }
00777
00778 void CAsciiFile::get_string_list(SGString<float32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00779 {
00780 strings=NULL;
00781 num_str=0;
00782 max_string_len=0;
00783 }
00784
00785 void CAsciiFile::get_string_list(SGString<float64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00786 {
00787 strings=NULL;
00788 num_str=0;
00789 max_string_len=0;
00790 }
00791
00792 void CAsciiFile::get_longreal_string_list(SGString<floatmax_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00793 {
00794 strings=NULL;
00795 num_str=0;
00796 max_string_len=0;
00797 }
00798
00799
00802 #define SET_VECTOR(fname, mfname, sg_type) \
00803 void CAsciiFile::fname(const sg_type* vec, int32_t len) \
00804 { \
00805 mfname(vec, len, 1); \
00806 }
00807 SET_VECTOR(set_vector, set_matrix, uint8_t)
00808 SET_VECTOR(set_vector, set_matrix, char)
00809 SET_VECTOR(set_vector, set_matrix, int32_t)
00810 SET_VECTOR(set_vector, set_matrix, float32_t)
00811 SET_VECTOR(set_vector, set_matrix, float64_t)
00812 SET_VECTOR(set_vector, set_matrix, int16_t)
00813 SET_VECTOR(set_vector, set_matrix, uint16_t)
00814 #undef SET_VECTOR
00815
00816 #define SET_MATRIX(fname, sg_type, fprt_type, type_str) \
00817 void CAsciiFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
00818 { \
00819 if (!(file && matrix)) \
00820 SG_ERROR("File or matrix invalid.\n"); \
00821 \
00822 for (int32_t i=0; i<num_vec; i++) \
00823 { \
00824 for (int32_t j=0; j<num_feat; j++) \
00825 { \
00826 sg_type v=matrix[num_feat*i+j]; \
00827 if (j==num_feat-1) \
00828 fprintf(file, type_str "\n", (fprt_type) v); \
00829 else \
00830 fprintf(file, type_str " ", (fprt_type) v); \
00831 } \
00832 } \
00833 }
00834 SET_MATRIX(set_matrix, char, char, "%c")
00835 SET_MATRIX(set_matrix, uint8_t, uint8_t, "%u")
00836 SET_MATRIX(set_int8_matrix, int8_t, int8_t, "%d")
00837 SET_MATRIX(set_matrix, int32_t, int32_t, "%i")
00838 SET_MATRIX(set_uint_matrix, uint32_t, uint32_t, "%u")
00839 SET_MATRIX(set_long_matrix, int64_t, long long int, "%lli")
00840 SET_MATRIX(set_ulong_matrix, uint64_t, long long unsigned int, "%llu")
00841 SET_MATRIX(set_matrix, int16_t, int16_t, "%i")
00842 SET_MATRIX(set_matrix, uint16_t, uint16_t, "%u")
00843 SET_MATRIX(set_matrix, float32_t, float32_t, "%f")
00844 SET_MATRIX(set_matrix, float64_t, float64_t, "%f")
00845 SET_MATRIX(set_longreal_matrix, floatmax_t, floatmax_t, "%Lf")
00846 #undef SET_MATRIX
00847
00848 #define SET_NDARRAY(fname, sg_type, fprt_type, type_str) \
00849 void CAsciiFile::fname(const sg_type* array, int32_t * dims, int32_t num_dims) \
00850 { \
00851 if (!(file && array)) \
00852 SG_ERROR("File or data invalid.\n"); \
00853 \
00854 size_t total = 1; \
00855 for(int i = 0;i < num_dims;i++) \
00856 total *= dims[i]; \
00857 int32_t block_size = dims[num_dims-1]; \
00858 \
00859 fprintf(file,"%d ",num_dims); \
00860 for(int i = 0;i < num_dims;i++) \
00861 fprintf(file,"%d ",dims[i]); \
00862 fprintf(file,"\n"); \
00863 \
00864 for (size_t i=0; i < total; i++) \
00865 { \
00866 sg_type v= array[i]; \
00867 if ( ((i+1) % block_size) == 0) \
00868 fprintf(file, type_str "\n", (fprt_type) v); \
00869 else \
00870 fprintf(file, type_str " ", (fprt_type) v); \
00871 } \
00872 }
00873
00874 SET_NDARRAY(set_ndarray, char, char, "%c")
00875 SET_NDARRAY(set_ndarray, uint8_t, uint8_t, "%u")
00876 SET_NDARRAY(set_int8_ndarray, int8_t, int8_t, "%d")
00877 SET_NDARRAY(set_ndarray, int32_t, int32_t, "%i")
00878 SET_NDARRAY(set_uint_ndarray, uint32_t, uint32_t, "%u")
00879 SET_NDARRAY(set_long_ndarray, int64_t, long long int, "%lli")
00880 SET_NDARRAY(set_ulong_ndarray, uint64_t, long long unsigned int, "%llu")
00881 SET_NDARRAY(set_ndarray, int16_t, int16_t, "%i")
00882 SET_NDARRAY(set_ndarray, uint16_t, uint16_t, "%u")
00883 SET_NDARRAY(set_ndarray, float32_t, float32_t, "%f")
00884 SET_NDARRAY(set_ndarray, float64_t, float64_t, "%f")
00885 SET_NDARRAY(set_longreal_ndarray, floatmax_t, floatmax_t, "%Lf")
00886 #undef SET_NDARRAY
00887
00888 #define SET_SPARSEMATRIX(fname, sg_type, fprt_type, type_str) \
00889 void CAsciiFile::fname(const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
00890 { \
00891 if (!(file && matrix)) \
00892 SG_ERROR("File or matrix invalid.\n"); \
00893 \
00894 for (int32_t i=0; i<num_vec; i++) \
00895 { \
00896 SGSparseVectorEntry<sg_type>* vec = matrix[i].features; \
00897 int32_t len=matrix[i].num_feat_entries; \
00898 \
00899 for (int32_t j=0; j<len; j++) \
00900 { \
00901 if (j<len-1) \
00902 { \
00903 fprintf(file, "%d:" type_str " ", \
00904 (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry); \
00905 } \
00906 else \
00907 { \
00908 fprintf(file, "%d:" type_str "\n", \
00909 (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry); \
00910 } \
00911 } \
00912 } \
00913 }
00914 SET_SPARSEMATRIX(set_sparse_matrix, bool, uint8_t, "%u")
00915 SET_SPARSEMATRIX(set_sparse_matrix, char, char, "%c")
00916 SET_SPARSEMATRIX(set_sparse_matrix, uint8_t, uint8_t, "%u")
00917 SET_SPARSEMATRIX(set_int8_sparsematrix, int8_t, int8_t, "%d")
00918 SET_SPARSEMATRIX(set_sparse_matrix, int32_t, int32_t, "%i")
00919 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, uint32_t, "%u")
00920 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, long long int, "%lli")
00921 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, long long unsigned int, "%llu")
00922 SET_SPARSEMATRIX(set_sparse_matrix, int16_t, int16_t, "%i")
00923 SET_SPARSEMATRIX(set_sparse_matrix, uint16_t, uint16_t, "%u")
00924 SET_SPARSEMATRIX(set_sparse_matrix, float32_t, float32_t, "%f")
00925 SET_SPARSEMATRIX(set_sparse_matrix, float64_t, float64_t, "%f")
00926 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, floatmax_t, "%Lf")
00927 #undef SET_SPARSEMATRIX
00928
00929 void CAsciiFile::set_string_list(const SGString<uint8_t>* strings, int32_t num_str)
00930 {
00931 if (!(file && strings))
00932 SG_ERROR("File or strings invalid.\n");
00933
00934 for (int32_t i=0; i<num_str; i++)
00935 {
00936 int32_t len = strings[i].slen;
00937 fwrite(strings[i].string, sizeof(uint8_t), len, file);
00938 fprintf(file, "\n");
00939 }
00940 }
00941
00942 void CAsciiFile::set_int8_string_list(const SGString<int8_t>* strings, int32_t num_str)
00943 {
00944 if (!(file && strings))
00945 SG_ERROR("File or strings invalid.\n");
00946
00947 for (int32_t i=0; i<num_str; i++)
00948 {
00949 int32_t len = strings[i].slen;
00950 fwrite(strings[i].string, sizeof(int8_t), len, file);
00951 fprintf(file, "\n");
00952 }
00953 }
00954
00955 void CAsciiFile::set_string_list(const SGString<char>* strings, int32_t num_str)
00956 {
00957 if (!(file && strings))
00958 SG_ERROR("File or strings invalid.\n");
00959
00960 for (int32_t i=0; i<num_str; i++)
00961 {
00962 int32_t len = strings[i].slen;
00963 fwrite(strings[i].string, sizeof(char), len, file);
00964 fprintf(file, "\n");
00965 }
00966 }
00967
00968 void CAsciiFile::set_string_list(const SGString<int32_t>* strings, int32_t num_str)
00969 {
00970 }
00971
00972 void CAsciiFile::set_uint_string_list(const SGString<uint32_t>* strings, int32_t num_str)
00973 {
00974 }
00975
00976 void CAsciiFile::set_string_list(const SGString<int16_t>* strings, int32_t num_str)
00977 {
00978 }
00979
00980 void CAsciiFile::set_string_list(const SGString<uint16_t>* strings, int32_t num_str)
00981 {
00982 }
00983
00984 void CAsciiFile::set_long_string_list(const SGString<int64_t>* strings, int32_t num_str)
00985 {
00986 }
00987
00988 void CAsciiFile::set_ulong_string_list(const SGString<uint64_t>* strings, int32_t num_str)
00989 {
00990 }
00991
00992 void CAsciiFile::set_string_list(const SGString<float32_t>* strings, int32_t num_str)
00993 {
00994 }
00995
00996 void CAsciiFile::set_string_list(const SGString<float64_t>* strings, int32_t num_str)
00997 {
00998 }
00999
01000 void CAsciiFile::set_longreal_string_list(const SGString<floatmax_t>* strings, int32_t num_str)
01001 {
01002 }
01003
01004 template <class T> void CAsciiFile::append_item(
01005 DynArray<T>* items, char* ptr_data, char* ptr_item)
01006 {
01007 size_t len=(ptr_data-ptr_item)/sizeof(char);
01008 char* item=SG_MALLOC(char, len+1);
01009 memset(item, 0, sizeof(char)*(len+1));
01010 item=strncpy(item, ptr_item, len);
01011
01012 SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
01013 items->append_element(item);
01014 }
01015
01016 #ifdef __MACH__
01017 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
01018 {
01019 int32_t total_bytes_read=0;
01020 int32_t default_size=10;
01021
01022 if ((lineptr == NULL) || (n == NULL) || (stream == NULL))
01023 return -1;
01024
01025 if ((*lineptr == NULL) && (*n == 0))
01026 {
01027 *lineptr=SG_MALLOC(char, default_size);
01028 *n=default_size;
01029 }
01030
01031 int32_t bytes_read, pos=-1;
01032 int32_t threshold_size=100000;
01033
01034 while (1)
01035 {
01036
01037 if (*n > threshold_size)
01038 return -1;
01039
01040
01041 bytes_read=fread(*lineptr+total_bytes_read, sizeof(char), *n-total_bytes_read, stream);
01042
01043 for (int i=0; i<bytes_read; i++)
01044 {
01045 if ((*lineptr)[total_bytes_read+i] == delimiter)
01046 {
01047 pos=i;
01048 break;
01049 }
01050 }
01051
01052 if (pos==-1)
01053 {
01054 if (feof(stream))
01055 return -1;
01056 total_bytes_read+=bytes_read;
01057 *lineptr=SG_REALLOC(char, *lineptr, (*n)*2);
01058 *n=(*n)*2;
01059
01060 }
01061 else
01062 {
01063 total_bytes_read+=pos+1;
01064 (*lineptr)[total_bytes_read]='\0';
01065
01066 fseek(stream, (bytes_read-pos-1) * -1, SEEK_CUR);
01067 return total_bytes_read;
01068 }
01069 }
01070 }
01071
01072 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
01073 {
01074 return getdelim(lineptr, n, '\n', stream);
01075 }
01076
01077 #else
01078 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
01079 {
01080 return ::getdelim(lineptr, n, delimiter, stream);
01081 }
01082
01083 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
01084 {
01085 return ::getline(lineptr, n, stream);
01086 }
01087 #endif
01088
01089 void CAsciiFile::tokenize(char delim, substring s, v_array<substring>& ret)
01090 {
01091 ret.erase();
01092 char *last = s.start;
01093 for (; s.start != s.end; s.start++)
01094 {
01095 if (*s.start == delim)
01096 {
01097 if (s.start != last)
01098 {
01099 substring temp = {last,s.start};
01100 ret.push(temp);
01101 }
01102 last = s.start+1;
01103 }
01104 }
01105 if (s.start != last)
01106 {
01107 substring final = {last, s.start};
01108 ret.push(final);
01109 }
01110 }