00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <shogun/io/StreamingAsciiFile.h>
00012 #include <shogun/mathematics/Math.h>
00013
00014 #include <ctype.h>
00015
00016 using namespace shogun;
00017
00018 CStreamingAsciiFile::CStreamingAsciiFile()
00019 : CStreamingFile()
00020 {
00021 SG_UNSTABLE("CStreamingAsciiFile::CStreamingAsciiFile()", "\n");
00022 }
00023
00024 CStreamingAsciiFile::CStreamingAsciiFile(char* fname, char rw)
00025 : CStreamingFile(fname, rw)
00026 {
00027 }
00028
00029 CStreamingAsciiFile::~CStreamingAsciiFile()
00030 {
00031 }
00032
00033
00034
00035 #define GET_VECTOR(fname, conv, sg_type) \
00036 void CStreamingAsciiFile::get_vector(sg_type*& vector, int32_t& num_feat) \
00037 { \
00038 char* buffer = NULL; \
00039 ssize_t bytes_read; \
00040 int32_t old_len = num_feat; \
00041 \
00042 bytes_read = buf->read_line(buffer); \
00043 \
00044 if (bytes_read<=0) \
00045 { \
00046 vector=NULL; \
00047 num_feat=-1; \
00048 return; \
00049 } \
00050 \
00051 \
00052 int32_t nf=0; \
00053 num_feat=0; \
00054 \
00055 char* ptr_item=NULL; \
00056 char* ptr_data=buffer; \
00057 DynArray<char*>* items=new DynArray<char*>(); \
00058 \
00059 while (*ptr_data) \
00060 { \
00061 if ((*ptr_data=='\n') || \
00062 (ptr_data - buffer >= bytes_read)) \
00063 { \
00064 if (ptr_item) \
00065 nf++; \
00066 \
00067 append_item(items, ptr_data, ptr_item); \
00068 num_feat=nf; \
00069 \
00070 nf=0; \
00071 ptr_item=NULL; \
00072 break; \
00073 } \
00074 else if (!isblank(*ptr_data) && !ptr_item) \
00075 { \
00076 ptr_item=ptr_data; \
00077 } \
00078 else if (isblank(*ptr_data) && ptr_item) \
00079 { \
00080 append_item(items, ptr_data, ptr_item); \
00081 ptr_item=NULL; \
00082 nf++; \
00083 } \
00084 \
00085 ptr_data++; \
00086 } \
00087 \
00088 SG_DEBUG("num_feat %d\n", num_feat); \
00089 \
00090 \
00091 if (old_len < num_feat) \
00092 vector=SG_REALLOC(sg_type, vector, num_feat); \
00093 \
00094 for (int32_t i=0; i<num_feat; i++) \
00095 { \
00096 char* item=items->get_element(i); \
00097 vector[i]=conv(item); \
00098 SG_FREE(item); \
00099 } \
00100 delete items; \
00101 }
00102
00103 GET_VECTOR(get_bool_vector, str_to_bool, bool)
00104 GET_VECTOR(get_byte_vector, atoi, uint8_t)
00105 GET_VECTOR(get_char_vector, atoi, char)
00106 GET_VECTOR(get_int_vector, atoi, int32_t)
00107 GET_VECTOR(get_short_vector, atoi, int16_t)
00108 GET_VECTOR(get_word_vector, atoi, uint16_t)
00109 GET_VECTOR(get_int8_vector, atoi, int8_t)
00110 GET_VECTOR(get_uint_vector, atoi, uint32_t)
00111 GET_VECTOR(get_long_vector, atoi, int64_t)
00112 GET_VECTOR(get_ulong_vector, atoi, uint64_t)
00113 GET_VECTOR(get_longreal_vector, atoi, floatmax_t)
00114 #undef GET_VECTOR
00115
00116 #define GET_FLOAT_VECTOR(sg_type) \
00117 void CStreamingAsciiFile::get_vector(sg_type*& vector, int32_t& len) \
00118 { \
00119 char *line=NULL; \
00120 int32_t num_chars = buf->read_line(line); \
00121 int32_t old_len = len; \
00122 \
00123 if (num_chars == 0) \
00124 { \
00125 len = -1; \
00126 return; \
00127 } \
00128 \
00129 substring example_string = {line, line + num_chars}; \
00130 \
00131 CAsciiFile::tokenize(' ', example_string, words); \
00132 \
00133 len = words.index(); \
00134 substring* feature_start = &words[0]; \
00135 \
00136 if (len > old_len) \
00137 vector = SG_REALLOC(sg_type, vector, len); \
00138 \
00139 int32_t j=0; \
00140 for (substring* i = feature_start; i != words.end; i++) \
00141 { \
00142 vector[j++] = float_of_substring(*i); \
00143 } \
00144 }
00145
00146 GET_FLOAT_VECTOR(float32_t)
00147 GET_FLOAT_VECTOR(float64_t)
00148 #undef GET_FLOAT_VECTOR
00149
00150
00151
00152 #define GET_VECTOR_AND_LABEL(fname, conv, sg_type) \
00153 void CStreamingAsciiFile::get_vector_and_label(sg_type*& vector, int32_t& num_feat, float64_t& label) \
00154 { \
00155 char* buffer = NULL; \
00156 ssize_t bytes_read; \
00157 int32_t old_len = num_feat; \
00158 \
00159 bytes_read = buf->read_line(buffer); \
00160 \
00161 if (bytes_read<=0) \
00162 { \
00163 vector=NULL; \
00164 num_feat=-1; \
00165 return; \
00166 } \
00167 \
00168 \
00169 int32_t nf=0; \
00170 num_feat=0; \
00171 \
00172 char* ptr_item=NULL; \
00173 char* ptr_data=buffer; \
00174 DynArray<char*>* items=new DynArray<char*>(); \
00175 \
00176 while (*ptr_data) \
00177 { \
00178 if ((*ptr_data=='\n') || \
00179 (ptr_data - buffer >= bytes_read)) \
00180 { \
00181 if (ptr_item) \
00182 nf++; \
00183 \
00184 append_item(items, ptr_data, ptr_item); \
00185 num_feat=nf; \
00186 \
00187 nf=0; \
00188 ptr_item=NULL; \
00189 break; \
00190 } \
00191 else if (!isblank(*ptr_data) && !ptr_item) \
00192 { \
00193 ptr_item=ptr_data; \
00194 } \
00195 else if (isblank(*ptr_data) && ptr_item) \
00196 { \
00197 append_item(items, ptr_data, ptr_item); \
00198 ptr_item=NULL; \
00199 nf++; \
00200 } \
00201 \
00202 ptr_data++; \
00203 } \
00204 \
00205 SG_DEBUG("num_feat %d\n", num_feat); \
00206 \
00207 label=atof(items->get_element(0)); \
00208 \
00209 if (old_len < num_feat - 1) \
00210 vector=SG_REALLOC(sg_type, vector, num_feat-1); \
00211 \
00212 for (int32_t i=1; i<num_feat; i++) \
00213 { \
00214 char* item=items->get_element(i); \
00215 vector[i-1]=conv(item); \
00216 SG_FREE(item); \
00217 } \
00218 delete items; \
00219 num_feat--; \
00220 }
00221
00222 GET_VECTOR_AND_LABEL(get_bool_vector_and_label, str_to_bool, bool)
00223 GET_VECTOR_AND_LABEL(get_byte_vector_and_label, atoi, uint8_t)
00224 GET_VECTOR_AND_LABEL(get_char_vector_and_label, atoi, char)
00225 GET_VECTOR_AND_LABEL(get_int_vector_and_label, atoi, int32_t)
00226 GET_VECTOR_AND_LABEL(get_short_vector_and_label, atoi, int16_t)
00227 GET_VECTOR_AND_LABEL(get_word_vector_and_label, atoi, uint16_t)
00228 GET_VECTOR_AND_LABEL(get_int8_vector_and_label, atoi, int8_t)
00229 GET_VECTOR_AND_LABEL(get_uint_vector_and_label, atoi, uint32_t)
00230 GET_VECTOR_AND_LABEL(get_long_vector_and_label, atoi, int64_t)
00231 GET_VECTOR_AND_LABEL(get_ulong_vector_and_label, atoi, uint64_t)
00232 GET_VECTOR_AND_LABEL(get_longreal_vector_and_label, atoi, floatmax_t)
00233 #undef GET_VECTOR_AND_LABEL
00234
00235 #define GET_FLOAT_VECTOR_AND_LABEL(sg_type) \
00236 void CStreamingAsciiFile::get_vector_and_label(sg_type*& vector, int32_t& len, float64_t& label) \
00237 { \
00238 char *line=NULL; \
00239 int32_t num_chars = buf->read_line(line); \
00240 int32_t old_len = len; \
00241 \
00242 if (num_chars == 0) \
00243 { \
00244 len = -1; \
00245 return; \
00246 } \
00247 \
00248 substring example_string = {line, line + num_chars}; \
00249 \
00250 CAsciiFile::tokenize(' ', example_string, words); \
00251 \
00252 label = float_of_substring(words[0]); \
00253 \
00254 len = words.index() - 1; \
00255 substring* feature_start = &words[1]; \
00256 \
00257 if (len > old_len) \
00258 vector = SG_REALLOC(sg_type, vector, len); \
00259 \
00260 int32_t j=0; \
00261 for (substring* i = feature_start; i != words.end; i++) \
00262 { \
00263 vector[j++] = float_of_substring(*i); \
00264 } \
00265 }
00266
00267 GET_FLOAT_VECTOR_AND_LABEL(float32_t)
00268 GET_FLOAT_VECTOR_AND_LABEL(float64_t)
00269 #undef GET_FLOAT_VECTOR_AND_LABEL
00270
00271
00272
00273 #define GET_STRING(fname, conv, sg_type) \
00274 void CStreamingAsciiFile::get_string(sg_type*& vector, int32_t& len) \
00275 { \
00276 char* buffer = NULL; \
00277 ssize_t bytes_read; \
00278 \
00279 bytes_read = buf->read_line(buffer); \
00280 \
00281 if (bytes_read<=1) \
00282 { \
00283 vector=NULL; \
00284 len=-1; \
00285 return; \
00286 } \
00287 \
00288 SG_DEBUG("Line read from the file:\n%s\n", buffer); \
00289 \
00290 if (buffer[bytes_read-1]=='\n') \
00291 { \
00292 len=bytes_read-1; \
00293 buffer[bytes_read-1]='\0'; \
00294 } \
00295 else \
00296 len=bytes_read; \
00297 vector=(sg_type *) buffer; \
00298 }
00299
00300 GET_STRING(get_bool_string, str_to_bool, bool)
00301 GET_STRING(get_byte_string, atoi, uint8_t)
00302 GET_STRING(get_char_string, atoi, char)
00303 GET_STRING(get_int_string, atoi, int32_t)
00304 GET_STRING(get_shortreal_string, atof, float32_t)
00305 GET_STRING(get_real_string, atof, float64_t)
00306 GET_STRING(get_short_string, atoi, int16_t)
00307 GET_STRING(get_word_string, atoi, uint16_t)
00308 GET_STRING(get_int8_string, atoi, int8_t)
00309 GET_STRING(get_uint_string, atoi, uint32_t)
00310 GET_STRING(get_long_string, atoi, int64_t)
00311 GET_STRING(get_ulong_string, atoi, uint64_t)
00312 GET_STRING(get_longreal_string, atoi, floatmax_t)
00313 #undef GET_STRING
00314
00315
00316
00317 #define GET_STRING_AND_LABEL(fname, conv, sg_type) \
00318 void CStreamingAsciiFile::get_string_and_label(sg_type*& vector, int32_t& len, float64_t& label) \
00319 { \
00320 char* buffer = NULL; \
00321 ssize_t bytes_read; \
00322 \
00323 bytes_read = buf->read_line(buffer); \
00324 \
00325 if (bytes_read<=1) \
00326 { \
00327 vector=NULL; \
00328 len=-1; \
00329 return; \
00330 } \
00331 \
00332 int32_t str_start_pos=-1; \
00333 \
00334 for (int32_t i=0; i<bytes_read; i++) \
00335 { \
00336 if (buffer[i] == ' ') \
00337 { \
00338 buffer[i]='\0'; \
00339 label=atoi(buffer); \
00340 buffer[i]=' '; \
00341 str_start_pos=i+1; \
00342 break; \
00343 } \
00344 } \
00345 \
00346 if (str_start_pos == -1) \
00347 { \
00348 vector=NULL; \
00349 len=-1; \
00350 return; \
00351 } \
00352 \
00353 if (buffer[bytes_read-1]=='\n') \
00354 { \
00355 buffer[bytes_read-1]='\0'; \
00356 len=bytes_read-str_start_pos-1; \
00357 } \
00358 else \
00359 len=bytes_read-str_start_pos; \
00360 \
00361 vector=(sg_type*) &buffer[str_start_pos]; \
00362 }
00363
00364 GET_STRING_AND_LABEL(get_bool_string_and_label, str_to_bool, bool)
00365 GET_STRING_AND_LABEL(get_byte_string_and_label, atoi, uint8_t)
00366 GET_STRING_AND_LABEL(get_char_string_and_label, atoi, char)
00367 GET_STRING_AND_LABEL(get_int_string_and_label, atoi, int32_t)
00368 GET_STRING_AND_LABEL(get_shortreal_string_and_label, atof, float32_t)
00369 GET_STRING_AND_LABEL(get_real_string_and_label, atof, float64_t)
00370 GET_STRING_AND_LABEL(get_short_string_and_label, atoi, int16_t)
00371 GET_STRING_AND_LABEL(get_word_string_and_label, atoi, uint16_t)
00372 GET_STRING_AND_LABEL(get_int8_string_and_label, atoi, int8_t)
00373 GET_STRING_AND_LABEL(get_uint_string_and_label, atoi, uint32_t)
00374 GET_STRING_AND_LABEL(get_long_string_and_label, atoi, int64_t)
00375 GET_STRING_AND_LABEL(get_ulong_string_and_label, atoi, uint64_t)
00376 GET_STRING_AND_LABEL(get_longreal_string_and_label, atoi, floatmax_t)
00377 #undef GET_STRING_AND_LABEL
00378
00379
00380
00381 #define GET_SPARSE_VECTOR(fname, conv, sg_type) \
00382 void CStreamingAsciiFile::get_sparse_vector(SGSparseVectorEntry<sg_type>*& vector, int32_t& len) \
00383 { \
00384 char* buffer = NULL; \
00385 ssize_t bytes_read; \
00386 \
00387 bytes_read = buf->read_line(buffer); \
00388 \
00389 if (bytes_read<=1) \
00390 { \
00391 vector=NULL; \
00392 len=-1; \
00393 return; \
00394 } \
00395 \
00396 \
00397 int32_t num_chars; \
00398 if (buffer[bytes_read-1]=='\n') \
00399 { \
00400 num_chars=bytes_read-1; \
00401 buffer[num_chars]='\0'; \
00402 } \
00403 else \
00404 num_chars=bytes_read; \
00405 \
00406 int32_t num_dims=0; \
00407 for (int32_t i=0; i<num_chars; i++) \
00408 { \
00409 if (buffer[i]==':') \
00410 { \
00411 num_dims++; \
00412 } \
00413 } \
00414 \
00415 int32_t index_start_pos=-1; \
00416 int32_t feature_start_pos; \
00417 int32_t current_feat=0; \
00418 vector=SG_MALLOC(SGSparseVectorEntry<sg_type>, num_dims); \
00419 for (int32_t i=0; i<num_chars; i++) \
00420 { \
00421 if (buffer[i]==':') \
00422 { \
00423 buffer[i]='\0'; \
00424 vector[current_feat].feat_index=(int32_t) atoi(buffer+index_start_pos)-1; \
00425 \
00426 index_start_pos=-1; \
00427 \
00428 feature_start_pos=i+1; \
00429 while ((buffer[i]!=' ') && (i<num_chars)) \
00430 { \
00431 i++; \
00432 } \
00433 \
00434 buffer[i]='\0'; \
00435 vector[current_feat].entry=(sg_type) conv(buffer+feature_start_pos); \
00436 \
00437 current_feat++; \
00438 } \
00439 else if (buffer[i]==' ') \
00440 i++; \
00441 else \
00442 { \
00443 \
00444 \
00445 \
00446 if (index_start_pos == -1) \
00447 index_start_pos=i; \
00448 } \
00449 } \
00450 \
00451 len=current_feat; \
00452 }
00453
00454 GET_SPARSE_VECTOR(get_bool_sparse_vector, str_to_bool, bool)
00455 GET_SPARSE_VECTOR(get_byte_sparse_vector, atoi, uint8_t)
00456 GET_SPARSE_VECTOR(get_char_sparse_vector, atoi, char)
00457 GET_SPARSE_VECTOR(get_int_sparse_vector, atoi, int32_t)
00458 GET_SPARSE_VECTOR(get_shortreal_sparse_vector, atof, float32_t)
00459 GET_SPARSE_VECTOR(get_real_sparse_vector, atof, float64_t)
00460 GET_SPARSE_VECTOR(get_short_sparse_vector, atoi, int16_t)
00461 GET_SPARSE_VECTOR(get_word_sparse_vector, atoi, uint16_t)
00462 GET_SPARSE_VECTOR(get_int8_sparse_vector, atoi, int8_t)
00463 GET_SPARSE_VECTOR(get_uint_sparse_vector, atoi, uint32_t)
00464 GET_SPARSE_VECTOR(get_long_sparse_vector, atoi, int64_t)
00465 GET_SPARSE_VECTOR(get_ulong_sparse_vector, atoi, uint64_t)
00466 GET_SPARSE_VECTOR(get_longreal_sparse_vector, atoi, floatmax_t)
00467 #undef GET_SPARSE_VECTOR
00468
00469
00470
00471 #define GET_SPARSE_VECTOR_AND_LABEL(fname, conv, sg_type) \
00472 void CStreamingAsciiFile::get_sparse_vector_and_label(SGSparseVectorEntry<sg_type>*& vector, int32_t& len, float64_t& label) \
00473 { \
00474 char* buffer = NULL; \
00475 ssize_t bytes_read; \
00476 \
00477 bytes_read = buf->read_line(buffer); \
00478 \
00479 if (bytes_read<=1) \
00480 { \
00481 vector=NULL; \
00482 len=-1; \
00483 return; \
00484 } \
00485 \
00486 \
00487 int32_t num_chars; \
00488 if (buffer[bytes_read-1]=='\n') \
00489 { \
00490 num_chars=bytes_read-1; \
00491 buffer[num_chars]='\0'; \
00492 } \
00493 else \
00494 num_chars=bytes_read; \
00495 \
00496 int32_t num_dims=0; \
00497 for (int32_t i=0; i<num_chars; i++) \
00498 { \
00499 if (buffer[i]==':') \
00500 { \
00501 num_dims++; \
00502 } \
00503 } \
00504 \
00505 int32_t index_start_pos=-1; \
00506 int32_t feature_start_pos; \
00507 int32_t current_feat=0; \
00508 int32_t label_pos=-1; \
00509 vector=SG_MALLOC(SGSparseVectorEntry<sg_type>, num_dims); \
00510 \
00511 for (int32_t i=1; i<num_chars; i++) \
00512 { \
00513 if (buffer[i]==':') \
00514 { \
00515 break; \
00516 } \
00517 if ( (buffer[i]==' ') && (buffer[i-1]!=' ') ) \
00518 { \
00519 buffer[i]='\0'; \
00520 label_pos=i; \
00521 label=atof(buffer); \
00522 break; \
00523 } \
00524 } \
00525 \
00526 if (label_pos==-1) \
00527 SG_ERROR("No label found!\n"); \
00528 \
00529 buffer+=label_pos+1; \
00530 num_chars-=label_pos+1; \
00531 for (int32_t i=0; i<num_chars; i++) \
00532 { \
00533 if (buffer[i]==':') \
00534 { \
00535 buffer[i]='\0'; \
00536 vector[current_feat].feat_index=(int32_t) atoi(buffer+index_start_pos)-1; \
00537 \
00538 index_start_pos=-1; \
00539 \
00540 feature_start_pos=i+1; \
00541 while ((buffer[i]!=' ') && (i<num_chars)) \
00542 { \
00543 i++; \
00544 } \
00545 \
00546 buffer[i]='\0'; \
00547 vector[current_feat].entry=(sg_type) conv(buffer+feature_start_pos); \
00548 \
00549 current_feat++; \
00550 } \
00551 else if (buffer[i]==' ') \
00552 i++; \
00553 else \
00554 { \
00555 \
00556 \
00557 \
00558 if (index_start_pos == -1) \
00559 index_start_pos=i; \
00560 } \
00561 } \
00562 \
00563 len=current_feat; \
00564 }
00565
00566 GET_SPARSE_VECTOR_AND_LABEL(get_bool_sparse_vector_and_label, str_to_bool, bool)
00567 GET_SPARSE_VECTOR_AND_LABEL(get_byte_sparse_vector_and_label, atoi, uint8_t)
00568 GET_SPARSE_VECTOR_AND_LABEL(get_char_sparse_vector_and_label, atoi, char)
00569 GET_SPARSE_VECTOR_AND_LABEL(get_int_sparse_vector_and_label, atoi, int32_t)
00570 GET_SPARSE_VECTOR_AND_LABEL(get_shortreal_sparse_vector_and_label, atof, float32_t)
00571 GET_SPARSE_VECTOR_AND_LABEL(get_real_sparse_vector_and_label, atof, float64_t)
00572 GET_SPARSE_VECTOR_AND_LABEL(get_short_sparse_vector_and_label, atoi, int16_t)
00573 GET_SPARSE_VECTOR_AND_LABEL(get_word_sparse_vector_and_label, atoi, uint16_t)
00574 GET_SPARSE_VECTOR_AND_LABEL(get_int8_sparse_vector_and_label, atoi, int8_t)
00575 GET_SPARSE_VECTOR_AND_LABEL(get_uint_sparse_vector_and_label, atoi, uint32_t)
00576 GET_SPARSE_VECTOR_AND_LABEL(get_long_sparse_vector_and_label, atoi, int64_t)
00577 GET_SPARSE_VECTOR_AND_LABEL(get_ulong_sparse_vector_and_label, atoi, uint64_t)
00578 GET_SPARSE_VECTOR_AND_LABEL(get_longreal_sparse_vector_and_label, atoi, floatmax_t)
00579 #undef GET_SPARSE_VECTOR_AND_LABEL
00580
00581 template <class T>
00582 void CStreamingAsciiFile::append_item(
00583 DynArray<T>* items, char* ptr_data, char* ptr_item)
00584 {
00585 size_t len=(ptr_data-ptr_item)/sizeof(char);
00586 char* item=SG_MALLOC(char, len+1);
00587 memset(item, 0, sizeof(char)*(len+1));
00588 item=strncpy(item, ptr_item, len);
00589
00590 SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
00591 items->append_element(item);
00592 }