AsciiFile.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Parts of this code are copyright (c) 2009 Yahoo! Inc.
00008  * All rights reserved.  The copyrights embodied in the content of
00009  * this file are licensed under the BSD (revised) open source license.
00010  *
00011  * Written (W) 2010 Soeren Sonnenburg
00012  * Copyright (C) 2010 Berlin Institute of Technology
00013  */
00014 
00015 #include <shogun/features/SparseFeatures.h>
00016 #include <shogun/io/File.h>
00017 #include <shogun/io/AsciiFile.h>
00018 #include <shogun/mathematics/Math.h>
00019 #include <ctype.h>
00020 #include <stdio.h>
00021 
00022 using namespace shogun;
00023 
/** Default constructor.
 *
 * Does nothing except invoke SG_UNSTABLE with this constructor's name.
 */
00024 CAsciiFile::CAsciiFile()
00025 {
00026     SG_UNSTABLE("CAsciiFile::CAsciiFile()", "\n");
00027 }
00028 
/** Constructor taking an already opened stream.
 *
 * Both arguments are forwarded unchanged to the CFile base constructor.
 *
 * @param f    stream handle passed to CFile
 * @param name name passed to CFile
 */
00029 CAsciiFile::CAsciiFile(FILE* f, const char* name) : CFile(f, name)
00030 {
00031 }
00032 
/** Constructor taking a file name.
 *
 * All arguments are forwarded unchanged to the CFile base constructor.
 *
 * @param fname file name passed to CFile
 * @param rw    mode character passed to CFile
 * @param name  name passed to CFile
 */
00033 CAsciiFile::CAsciiFile(char* fname, char rw, const char* name) : CFile(fname, rw, name)
00034 {
00035 }
00036 
/** Destructor. CAsciiFile releases nothing itself in this body. */
00037 CAsciiFile::~CAsciiFile()
00038 {
00039 }
00040 
00041 #define GET_VECTOR(fname, mfname, sg_type) \
00042 void CAsciiFile::fname(sg_type*& vec, int32_t& len) \
00043 {                                                   \
00044     vec=NULL;                                       \
00045     len=0;                                          \
00046     int32_t num_feat=0;                             \
00047     int32_t num_vec=0;                              \
00048     mfname(vec, num_feat, num_vec);                 \
00049     if ((num_feat==1) || (num_vec==1))              \
00050     {                                               \
00051         if (num_feat==1)                            \
00052             len=num_vec;                            \
00053         else                                        \
00054             len=num_feat;                           \
00055     }                                               \
00056     else                                            \
00057     {                                               \
00058         SG_FREE(vec);                               \
00059         vec=NULL;                                   \
00060         len=0;                                      \
00061         SG_ERROR("Could not read vector from"       \
00062                 " file %s (shape %dx%d found but "  \
00063                 "vector expected).\n", filename,    \
00064                 num_vec, num_feat);                 \
00065     }                                               \
00066 }
00067 
00068 GET_VECTOR(get_vector, get_matrix, uint8_t)
00069 GET_VECTOR(get_vector, get_matrix, char)
00070 GET_VECTOR(get_vector, get_matrix, int32_t)
00071 GET_VECTOR(get_vector, get_matrix, float32_t)
00072 GET_VECTOR(get_vector, get_matrix, float64_t)
00073 GET_VECTOR(get_vector, get_matrix, int16_t)
00074 GET_VECTOR(get_vector, get_matrix, uint16_t)
00075 #undef GET_VECTOR
00076 
00077 #define GET_MATRIX(fname, conv, sg_type)                                        \
00078 void CAsciiFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec)   \
00079 {                                                                               \
00080     struct stat stats;                                                          \
00081     if (stat(filename, &stats)!=0)                                              \
00082         SG_ERROR("Could not get file statistics.\n");                           \
00083                                                                                 \
00084     char* data=SG_MALLOC(char, stats.st_size+1);                                        \
00085     memset(data, 0, sizeof(char)*(stats.st_size+1));                            \
00086     size_t nread=fread(data, sizeof(char), stats.st_size, file);                \
00087     if (nread<=0)                                                               \
00088         SG_ERROR("Could not read data from %s.\n", filename);                   \
00089                                                                                 \
00090     SG_DEBUG("data read from file:\n%s\n", data);                               \
00091                                                                                 \
00092     /* determine num_feat and num_vec, populate dynamic array */                \
00093     int32_t nf=0;                                                               \
00094     num_feat=0;                                                                 \
00095     num_vec=0;                                                                  \
00096     char* ptr_item=NULL;                                                        \
00097     char* ptr_data=data;                                                        \
00098     DynArray<char*>* items=new DynArray<char*>();                       \
00099                                                                                 \
00100     while (*ptr_data)                                                           \
00101     {                                                                           \
00102         if (*ptr_data=='\n')                                                    \
00103         {                                                                       \
00104             if (ptr_item)                                                       \
00105                 nf++;                                                           \
00106                                                                                 \
00107             if (num_feat!=0 && nf!=num_feat)                                    \
00108                 SG_ERROR("Number of features mismatches (%d != %d) in vector"   \
00109                         " %d in file %s.\n", num_feat, nf, num_vec, filename);  \
00110                                                                                 \
00111             append_item(items, ptr_data, ptr_item);                             \
00112             num_feat=nf;                                                        \
00113             num_vec++;                                                          \
00114             nf=0;                                                               \
00115             ptr_item=NULL;                                                      \
00116         }                                                                       \
00117         else if (!isblank(*ptr_data) && !ptr_item)                              \
00118         {                                                                       \
00119             ptr_item=ptr_data;                                                  \
00120         }                                                                       \
00121         else if (isblank(*ptr_data) && ptr_item)                                \
00122         {                                                                       \
00123             append_item(items, ptr_data, ptr_item);                             \
00124             ptr_item=NULL;                                                      \
00125             nf++;                                                               \
00126         }                                                                       \
00127                                                                                 \
00128         ptr_data++;                                                             \
00129     }                                                                           \
00130                                                                                 \
00131     SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec);                  \
00132     SG_FREE(data);                                                              \
00133                                                                                 \
00134     /* now copy data into matrix */                                             \
00135     matrix=SG_MALLOC(sg_type, num_vec*num_feat);                                        \
00136     for (int32_t i=0; i<num_vec; i++)                                           \
00137     {                                                                           \
00138         for (int32_t j=0; j<num_feat; j++)                                      \
00139         {                                                                       \
00140             char* item=items->get_element(i*num_feat+j);                        \
00141             matrix[i*num_feat+j]=conv(item);                                    \
00142             SG_FREE(item);                                                      \
00143         }                                                                       \
00144     }                                                                           \
00145     delete items;                                                               \
00146 }
00147 
00148 GET_MATRIX(get_matrix, atoi, uint8_t)
00149 GET_MATRIX(get_int8_matrix, atoi, int8_t)
00150 GET_MATRIX(get_matrix, atoi, char)
00151 GET_MATRIX(get_matrix, atoi, int32_t)
00152 GET_MATRIX(get_uint_matrix, atoi, uint32_t)
00153 GET_MATRIX(get_long_matrix, atoll, int64_t)
00154 GET_MATRIX(get_ulong_matrix, atoll, uint64_t)
00155 GET_MATRIX(get_matrix, atof, float32_t)
00156 GET_MATRIX(get_matrix, atof, float64_t)
00157 GET_MATRIX(get_longreal_matrix, atof, floatmax_t)
00158 GET_MATRIX(get_matrix, atoi, int16_t)
00159 GET_MATRIX(get_matrix, atoi, uint16_t)
00160 #undef GET_MATRIX
00161 
00162 #define GET_NDARRAY(fname, conv, sg_type)                           \
00163 void CAsciiFile::fname(sg_type*& array, int32_t *& dims, int32_t & num_dims)            \
00164 {                                               \
00165     struct stat stats;                                  \
00166     if (stat(filename, &stats)!=0)                              \
00167         SG_ERROR("Could not get file statistics.\n");                   \
00168                                                 \
00169     char* data=SG_MALLOC(char, stats.st_size+1);                            \
00170     memset(data, 0, sizeof(char)*(stats.st_size+1));                    \
00171     size_t nread=fread(data, sizeof(char), stats.st_size, file);                \
00172     if (nread<=0)                                       \
00173         SG_ERROR("Could not read data from %s.\n", filename);               \
00174                                                 \
00175     SG_DEBUG("data read from file:\n%s\n", data);                       \
00176                                                 \
00177     /* determine size of array */                               \
00178     int32_t length=0;                                   \
00179     int32_t counter=0;                                                                  \
00180     size_t total=0;                                                         \
00181         num_dims = -1;                                                  \
00182     char* ptr_item=NULL;                                    \
00183     char* ptr_data=data;                                    \
00184     DynArray<char*>* items=new DynArray<char*>();                       \
00185                                                                                                 \
00186         /* read line with sizes of array*/                                          \
00187         while(*ptr_data != '\n')                                                        \
00188         {                                                                                       \
00189             if(isblank(*ptr_data) && ptr_item)                                      \
00190             {                                                                                   \
00191                 append_item(items, ptr_data, ptr_item);                         \
00192                 num_dims++;                                                                 \
00193                 ptr_item = NULL;                                                            \
00194             }                                                                                   \
00195             else if(!isblank(*ptr_data) && !ptr_item)                               \
00196                 ptr_item = ptr_data;                                                            \
00197                                                                                                 \
00198             ptr_data++;                                                                         \
00199         }                                                                                       \
00200         ptr_item = NULL;                                                                        \
00201         ptr_data++;                                                                             \
00202                                                     \
00203     /* read array data*/                                                                    \
00204     while(*ptr_data)                                    \
00205     {                                           \
00206         if (*ptr_data=='\n')                                \
00207         {                                       \
00208             if (ptr_item)                               \
00209                 counter++;                          \
00210                                                 \
00211             if (length!=0 && counter!=length)                   \
00212                 SG_ERROR("Invalid number of data (%d != %d) in line"        \
00213                 " %d in file %s.\n", length, counter, total, filename);     \
00214                                                 \
00215             append_item(items, ptr_data, ptr_item);                 \
00216             length=counter;                             \
00217             total++;                                \
00218             counter=0;                              \
00219             ptr_item=NULL;                              \
00220         }                                       \
00221         else if (!isblank(*ptr_data) && !ptr_item)                  \
00222         {                                       \
00223             ptr_item=ptr_data;                          \
00224         }                                       \
00225         else if (isblank(*ptr_data) && ptr_item)                    \
00226         {                                       \
00227             append_item(items, ptr_data, ptr_item);                 \
00228             ptr_item=NULL;                              \
00229             counter++;                              \
00230         }                                       \
00231                                                 \
00232         ptr_data++;                                 \
00233     }                                           \
00234                                                 \
00235     SG_DEBUG("num of data in line: %d, num of lines %d\n", counter, total);         \
00236     SG_FREE(data);                                      \
00237                                                 \
00238     /* determining sizes of dimensions*/                                                    \
00239         char * item;                                                                            \
00240         item=items->get_element(0);                                                             \
00241         if(atoi(item) != num_dims)                                                              \
00242             SG_ERROR("Invalid number of dimensions!\n");                                    \
00243         SG_FREE(item);                                                                          \
00244         dims = SG_MALLOC(int32_t, num_dims);                                                           \
00245         for(int32_t i =0;i < num_dims;i++)                                                  \
00246         {                                                                                       \
00247             item = items->get_element(i+1);                                             \
00248             dims[i] = atoi(item);                                                               \
00249             SG_FREE(item);                                                                      \
00250         }                                                                                       \
00251         if (dims[num_dims-1] != length)                                                     \
00252             SG_ERROR("Invalid number of lines in file!\n");                             \
00253                                                                                             \
00254         /* converting array data */                             \
00255         total *= length;                                    \
00256     array=SG_MALLOC(sg_type, total);                                    \
00257     for (size_t i=0; i<total; i++)                              \
00258     {                                           \
00259             item=items->get_element(i+(num_dims+1));                \
00260             array[i]=conv(item);                            \
00261             SG_FREE(item);                              \
00262     }                                           \
00263     delete items;                                       \
00264 }
00265 
00266 GET_NDARRAY(get_ndarray, atoi, uint8_t)
00267 GET_NDARRAY(get_int8_ndarray, atoi, int8_t)
00268 GET_NDARRAY(get_ndarray, atoi, char)
00269 GET_NDARRAY(get_ndarray, atoi, int32_t)
00270 GET_NDARRAY(get_uint_ndarray, atoi, uint32_t)
00271 GET_NDARRAY(get_long_ndarray, atoll, int64_t)
00272 GET_NDARRAY(get_ulong_ndarray, atoll, uint64_t)
00273 GET_NDARRAY(get_ndarray, atof, float32_t)
00274 GET_NDARRAY(get_ndarray, atof, float64_t)
00275 GET_NDARRAY(get_longreal_ndarray, atof, floatmax_t)
00276 GET_NDARRAY(get_ndarray, atoi, int16_t)
00277 GET_NDARRAY(get_ndarray, atoi, uint16_t)
00278 #undef GET_NDARRAY
00279 
/*
 * GET_SPARSEMATRIX(fname, conv, sg_type) defines CAsciiFile::fname(), which
 * reads one sparse vector per line from the open file.
 *
 * Line layout as parsed below: an optional leading label terminated by a
 * space, followed by "index:value" pairs separated by spaces. The number of
 * pairs of a line is the number of ':' characters it contains; a line
 * without any ':' raises SG_ERROR. Indices in the file are converted with
 * atoi() and stored minus one; values are converted with conv().
 *
 * The file is traversed twice:
 *   pass 1 counts the lines (num_vec) and the longest line, which becomes
 *          the read-buffer size of pass 2;
 *   pass 2 tokenises every line in place (separators are overwritten with
 *          '\0') and fills matrix[line].
 *
 * Outputs (only written when `file` is non-NULL):
 *   matrix   - SG_MALLOC'ed array of num_vec SGSparseVector entries, each
 *              with its own SG_MALLOC'ed `features` array
 *   num_feat - largest stored feature index plus one
 *   num_vec  - number of lines
 *
 * NOTE(review): for a final line that is not terminated by '\n', len is
 * computed as i-old_sz and therefore does not include dummy[i]; the last
 * character of such a line looks like it is overwritten by '\0' instead of
 * being parsed - confirm.
 */
00280 #define GET_SPARSEMATRIX(fname, conv, sg_type)                                      \
00281 void CAsciiFile::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec)   \
00282 {   \
00283     size_t blocksize=1024*1024; /* initial read buffer: 1 MiB */ \
00284     size_t required_blocksize=blocksize;    \
00285     uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);   \
00286     \
00287     if (file)   \
00288     {   \
00289         num_vec=0;  \
00290         num_feat=0; \
00291     \
00292         SG_INFO("counting line numbers in file %s\n", filename);    \
00293         size_t sz=blocksize;    \
00294         size_t block_offs=0;    \
00295         size_t old_block_offs=0;    \
00296         fseek(file, 0, SEEK_END);   \
00297         size_t fsize=ftell(file);   \
00298         rewind(file);   \
00299     \
00300         while (sz == blocksize) /* pass 1: a short read ends the loop */ \
00301         {   \
00302             sz=fread(dummy, sizeof(uint8_t), blocksize, file);  \
00303             for (size_t i=0; i<sz; i++) \
00304             {   \
00305                 block_offs++;   \
00306                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) /* line ends at '\n' or at the last byte of a short read */ \
00307                 {   \
00308                     num_vec++;  \
00309                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); \
00310                     old_block_offs=block_offs;  \
00311                 }   \
00312             }   \
00313             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");    \
00314         }   \
00315     \
00316         SG_INFO("found %d feature vectors\n", num_vec); \
00317         SG_FREE(dummy); \
00318         blocksize=required_blocksize;   \
00319         dummy = SG_MALLOC(uint8_t, blocksize+1); /*allow setting of '\0' at EOL*/   \
00320         matrix=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \
00321     \
00322         rewind(file);   \
00323         sz=blocksize;   \
00324         int32_t lines=0;    \
00325         while (sz == blocksize) /* pass 2: parse the lines */ \
00326         {   \
00327             sz=fread(dummy, sizeof(uint8_t), blocksize, file);  \
00328     \
00329             size_t old_sz=0; /* start offset of the current line inside dummy */ \
00330             for (size_t i=0; i<sz; i++) \
00331             {   \
00332                 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) /* full block ended in the middle of a line */ \
00333                 {   \
00334                     size_t len=i-old_sz+1;  \
00335                     uint8_t* data=&dummy[old_sz];   \
00336     \
00337                     for (size_t j=0; j<len; j++) /* move the partial line to the buffer front */ \
00338                         dummy[j]=data[j];   \
00339     \
00340                     sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file); /* refill the rest of the buffer */ \
00341                     i=0;    \
00342                     old_sz=0;   \
00343                     sz+=len;    \
00344                 }   \
00345     \
00346                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))    \
00347                 {   \
00348     \
00349                     size_t len=i-old_sz; /* excludes dummy[i] itself, see NOTE(review) in the header */ \
00350                     uint8_t* data=&dummy[old_sz];   \
00351     \
00352                     int32_t dims=0; /* number of ':' in the line == number of index:value pairs */ \
00353                     for (size_t j=0; j<len; j++)    \
00354                     {   \
00355                         if (data[j]==':')   \
00356                             dims++; \
00357                     }   \
00358     \
00359                     if (dims<=0)    \
00360                     {   \
00361                         SG_ERROR("Error in line %d - number of" \
00362                                 " dimensions is %d line is %d characters"   \
00363                                 " long\n line_content:'%.*s'\n", lines, \
00364                                 dims, len, len, (const char*) data);    \
00365                     }   \
00366     \
00367                     SGSparseVectorEntry<sg_type>* feat=SG_MALLOC(SGSparseVectorEntry<sg_type>, dims);   \
00368     \
00369                     /* skip label part */   \
00370                     size_t j=0; \
00371                     for (; j<len; j++)  \
00372                     {   \
00373                         if (data[j]==':')   \
00374                         {   \
00375                             j=-1; /* file without label*/ /* size_t wraps; the j++ below brings it back to 0 */ \
00376                             break;  \
00377                         }   \
00378     \
00379                         if (data[j]==' ')   \
00380                         {   \
00381                             data[j]='\0';   \
00382     \
00383                             /* skip label part */   \
00384                             break;  \
00385                         }   \
00386                     }   \
00387     \
00388                     int32_t d=0; /* index of the next entry of feat to fill */ \
00389                     j++;    \
00390                     uint8_t* start=&data[j];    \
00391                     for (; j<len; j++)  \
00392                     {   \
00393                         if (data[j]==':')   \
00394                         {   \
00395                             data[j]='\0';   \
00396     \
00397                             feat[d].feat_index=(int32_t) atoi((const char*) start)-1; /* stored index is the file index minus one */ \
00398                             num_feat=CMath::max(num_feat, feat[d].feat_index+1);    \
00399     \
00400                             j++;    \
00401                             start=&data[j]; \
00402                             for (; j<len; j++) /* scan to the end of the value */ \
00403                             {   \
00404                                 if (data[j]==' ' || data[j]=='\n')  \
00405                                 {   \
00406                                     data[j]='\0';   \
00407                                     feat[d].entry=(sg_type) conv((const char*) start);  \
00408                                     d++;    \
00409                                     break;  \
00410                                 }   \
00411                             }   \
00412     \
00413                             if (j==len) /* value ran up to the end of the line */ \
00414                             {   \
00415                                 data[j]='\0';   \
00416                                 feat[dims-1].entry=(sg_type) conv((const char*) start); \
00417                             }   \
00418     \
00419                             j++;    \
00420                             start=&data[j]; \
00421                         }   \
00422                     }   \
00423     \
00424                     matrix[lines].vec_index=lines;  \
00425                     matrix[lines].num_feat_entries=dims;    \
00426                     matrix[lines].features=feat;    \
00427     \
00428                     old_sz=i+1; \
00429                     lines++;    \
00430                     SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t");    \
00431                 }   \
00432             }   \
00433         }   \
00434     \
00435         SG_INFO("file successfully read\n");    \
00436     }   \
00437     \
00438     SG_FREE(dummy); \
00439 }
00440 
00441 GET_SPARSEMATRIX(get_sparse_matrix, atoi, bool)
00442 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint8_t)
00443 GET_SPARSEMATRIX(get_int8_sparsematrix, atoi, int8_t)
00444 GET_SPARSEMATRIX(get_sparse_matrix, atoi, char)
00445 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int32_t)
00446 GET_SPARSEMATRIX(get_uint_sparsematrix, atoi, uint32_t)
00447 GET_SPARSEMATRIX(get_long_sparsematrix, atoll, int64_t)
00448 GET_SPARSEMATRIX(get_ulong_sparsematrix, atoll, uint64_t)
00449 GET_SPARSEMATRIX(get_sparse_matrix, atof, float32_t)
00450 GET_SPARSEMATRIX(get_sparse_matrix, atof, float64_t)
00451 GET_SPARSEMATRIX(get_longreal_sparsematrix, atof, floatmax_t)
00452 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int16_t)
00453 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint16_t)
00454 #undef GET_SPARSEMATRIX
00455 
00456 
00457 void CAsciiFile::get_string_list(SGString<uint8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00458 {
00459     size_t blocksize=1024*1024;
00460     size_t required_blocksize=0;
00461     uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00462     uint8_t* overflow=NULL;
00463     int32_t overflow_len=0;
00464 
00465     if (file)
00466     {
00467         num_str=0;
00468         max_string_len=0;
00469 
00470         SG_INFO("counting line numbers in file %s\n", filename);
00471         size_t sz=blocksize;
00472         size_t block_offs=0;
00473         size_t old_block_offs=0;
00474         fseek(file, 0, SEEK_END);
00475         size_t fsize=ftell(file);
00476         rewind(file);
00477 
00478         while (sz == blocksize)
00479         {
00480             sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00481             for (size_t i=0; i<sz; i++)
00482             {
00483                 block_offs++;
00484                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00485                 {
00486                     num_str++;
00487                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00488                     old_block_offs=block_offs;
00489                 }
00490             }
00491             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00492         }
00493 
00494         SG_INFO("found %d strings\n", num_str);
00495         SG_DEBUG("block_size=%d\n", required_blocksize);
00496         SG_FREE(dummy);
00497         blocksize=required_blocksize;
00498         dummy=SG_MALLOC(uint8_t, blocksize);
00499         overflow=SG_MALLOC(uint8_t, blocksize);
00500         strings=SG_MALLOC(SGString<uint8_t>, num_str);
00501 
00502         rewind(file);
00503         sz=blocksize;
00504         int32_t lines=0;
00505         size_t old_sz=0;
00506         while (sz == blocksize)
00507         {
00508             sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00509 
00510             old_sz=0;
00511             for (size_t i=0; i<sz; i++)
00512             {
00513                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00514                 {
00515                     int32_t len=i-old_sz;
00516                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00517 
00518                     strings[lines].slen=len+overflow_len;
00519                     strings[lines].string=SG_MALLOC(uint8_t, len+overflow_len);
00520 
00521                     for (int32_t j=0; j<overflow_len; j++)
00522                         strings[lines].string[j]=overflow[j];
00523                     for (int32_t j=0; j<len; j++)
00524                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00525 
00526                     // clear overflow
00527                     overflow_len=0;
00528 
00529                     //CMath::display_vector(strings[lines].string, len);
00530                     old_sz=i+1;
00531                     lines++;
00532                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00533                 }
00534             }
00535 
00536             for (size_t i=old_sz; i<sz; i++)
00537                 overflow[i-old_sz]=dummy[i];
00538 
00539             overflow_len=sz-old_sz;
00540         }
00541         SG_INFO("file successfully read\n");
00542         SG_INFO("max_string_length=%d\n", max_string_len);
00543         SG_INFO("num_strings=%d\n", num_str);
00544     }
00545 
00546     SG_FREE(dummy);
00547     SG_FREE(overflow);
00548 }
00549 
00550 void CAsciiFile::get_int8_string_list(SGString<int8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00551 {
00552     size_t blocksize=1024*1024;
00553     size_t required_blocksize=0;
00554     int8_t* dummy=SG_MALLOC(int8_t, blocksize);
00555     int8_t* overflow=NULL;
00556     int32_t overflow_len=0;
00557 
00558     if (file)
00559     {
00560         num_str=0;
00561         max_string_len=0;
00562 
00563         SG_INFO("counting line numbers in file %s\n", filename);
00564         size_t sz=blocksize;
00565         size_t block_offs=0;
00566         size_t old_block_offs=0;
00567         fseek(file, 0, SEEK_END);
00568         size_t fsize=ftell(file);
00569         rewind(file);
00570 
00571         while (sz == blocksize)
00572         {
00573             sz=fread(dummy, sizeof(int8_t), blocksize, file);
00574             for (size_t i=0; i<sz; i++)
00575             {
00576                 block_offs++;
00577                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00578                 {
00579                     num_str++;
00580                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00581                     old_block_offs=block_offs;
00582                 }
00583             }
00584             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00585         }
00586 
00587         SG_INFO("found %d strings\n", num_str);
00588         SG_DEBUG("block_size=%d\n", required_blocksize);
00589         SG_FREE(dummy);
00590         blocksize=required_blocksize;
00591         dummy=SG_MALLOC(int8_t, blocksize);
00592         overflow=SG_MALLOC(int8_t, blocksize);
00593         strings=SG_MALLOC(SGString<int8_t>, num_str);
00594 
00595         rewind(file);
00596         sz=blocksize;
00597         int32_t lines=0;
00598         size_t old_sz=0;
00599         while (sz == blocksize)
00600         {
00601             sz=fread(dummy, sizeof(int8_t), blocksize, file);
00602 
00603             old_sz=0;
00604             for (size_t i=0; i<sz; i++)
00605             {
00606                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00607                 {
00608                     int32_t len=i-old_sz;
00609                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00610 
00611                     strings[lines].slen=len+overflow_len;
00612                     strings[lines].string=SG_MALLOC(int8_t, len+overflow_len);
00613 
00614                     for (int32_t j=0; j<overflow_len; j++)
00615                         strings[lines].string[j]=overflow[j];
00616                     for (int32_t j=0; j<len; j++)
00617                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00618 
00619                     // clear overflow
00620                     overflow_len=0;
00621 
00622                     //CMath::display_vector(strings[lines].string, len);
00623                     old_sz=i+1;
00624                     lines++;
00625                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00626                 }
00627             }
00628 
00629             for (size_t i=old_sz; i<sz; i++)
00630                 overflow[i-old_sz]=dummy[i];
00631 
00632             overflow_len=sz-old_sz;
00633         }
00634         SG_INFO("file successfully read\n");
00635         SG_INFO("max_string_length=%d\n", max_string_len);
00636         SG_INFO("num_strings=%d\n", num_str);
00637     }
00638 
00639     SG_FREE(dummy);
00640     SG_FREE(overflow);
00641 }
00642 
00643 void CAsciiFile::get_string_list(SGString<char>*& strings, int32_t& num_str, int32_t& max_string_len)
00644 {
00645     size_t blocksize=1024*1024;
00646     size_t required_blocksize=0;
00647     char* dummy=SG_MALLOC(char, blocksize);
00648     char* overflow=NULL;
00649     int32_t overflow_len=0;
00650 
00651     if (file)
00652     {
00653         num_str=0;
00654         max_string_len=0;
00655 
00656         SG_INFO("counting line numbers in file %s\n", filename);
00657         size_t sz=blocksize;
00658         size_t block_offs=0;
00659         size_t old_block_offs=0;
00660         fseek(file, 0, SEEK_END);
00661         size_t fsize=ftell(file);
00662         rewind(file);
00663 
00664         while (sz == blocksize)
00665         {
00666             sz=fread(dummy, sizeof(char), blocksize, file);
00667             for (size_t i=0; i<sz; i++)
00668             {
00669                 block_offs++;
00670                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00671                 {
00672                     num_str++;
00673                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00674                     old_block_offs=block_offs;
00675                 }
00676             }
00677             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00678         }
00679 
00680         SG_INFO("found %d strings\n", num_str);
00681         SG_DEBUG("block_size=%d\n", required_blocksize);
00682         SG_FREE(dummy);
00683         blocksize=required_blocksize;
00684         dummy=SG_MALLOC(char, blocksize);
00685         overflow=SG_MALLOC(char, blocksize);
00686         strings=SG_MALLOC(SGString<char>, num_str);
00687 
00688         rewind(file);
00689         sz=blocksize;
00690         int32_t lines=0;
00691         size_t old_sz=0;
00692         while (sz == blocksize)
00693         {
00694             sz=fread(dummy, sizeof(char), blocksize, file);
00695 
00696             old_sz=0;
00697             for (size_t i=0; i<sz; i++)
00698             {
00699                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00700                 {
00701                     int32_t len=i-old_sz;
00702                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00703 
00704                     strings[lines].slen=len+overflow_len;
00705                     strings[lines].string=SG_MALLOC(char, len+overflow_len);
00706 
00707                     for (int32_t j=0; j<overflow_len; j++)
00708                         strings[lines].string[j]=overflow[j];
00709                     for (int32_t j=0; j<len; j++)
00710                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00711 
00712                     // clear overflow
00713                     overflow_len=0;
00714 
00715                     //CMath::display_vector(strings[lines].string, len);
00716                     old_sz=i+1;
00717                     lines++;
00718                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00719                 }
00720             }
00721 
00722             for (size_t i=old_sz; i<sz; i++)
00723                 overflow[i-old_sz]=dummy[i];
00724 
00725             overflow_len=sz-old_sz;
00726         }
00727         SG_INFO("file successfully read\n");
00728         SG_INFO("max_string_length=%d\n", max_string_len);
00729         SG_INFO("num_strings=%d\n", num_str);
00730     }
00731 
00732     SG_FREE(dummy);
00733     SG_FREE(overflow);
00734 }
00735 
00736 void CAsciiFile::get_string_list(SGString<int32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00737 {
00738     strings=NULL;
00739     num_str=0;
00740     max_string_len=0;
00741 }
00742 
00743 void CAsciiFile::get_uint_string_list(SGString<uint32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00744 {
00745     strings=NULL;
00746     num_str=0;
00747     max_string_len=0;
00748 }
00749 
00750 void CAsciiFile::get_string_list(SGString<int16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00751 {
00752     strings=NULL;
00753     num_str=0;
00754     max_string_len=0;
00755 }
00756 
00757 void CAsciiFile::get_string_list(SGString<uint16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00758 {
00759     strings=NULL;
00760     num_str=0;
00761     max_string_len=0;
00762 }
00763 
00764 void CAsciiFile::get_long_string_list(SGString<int64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00765 {
00766     strings=NULL;
00767     num_str=0;
00768     max_string_len=0;
00769 }
00770 
00771 void CAsciiFile::get_ulong_string_list(SGString<uint64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00772 {
00773     strings=NULL;
00774     num_str=0;
00775     max_string_len=0;
00776 }
00777 
00778 void CAsciiFile::get_string_list(SGString<float32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00779 {
00780     strings=NULL;
00781     num_str=0;
00782     max_string_len=0;
00783 }
00784 
00785 void CAsciiFile::get_string_list(SGString<float64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00786 {
00787     strings=NULL;
00788     num_str=0;
00789     max_string_len=0;
00790 }
00791 
00792 void CAsciiFile::get_longreal_string_list(SGString<floatmax_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00793 {
00794     strings=NULL;
00795     num_str=0;
00796     max_string_len=0;
00797 }
00798 
00799 
00802 #define SET_VECTOR(fname, mfname, sg_type)  \
00803 void CAsciiFile::fname(const sg_type* vec, int32_t len) \
00804 {                                                           \
00805     mfname(vec, len, 1);                                    \
00806 }
00807 SET_VECTOR(set_vector, set_matrix, uint8_t)
00808 SET_VECTOR(set_vector, set_matrix, char)
00809 SET_VECTOR(set_vector, set_matrix, int32_t)
00810 SET_VECTOR(set_vector, set_matrix, float32_t)
00811 SET_VECTOR(set_vector, set_matrix, float64_t)
00812 SET_VECTOR(set_vector, set_matrix, int16_t)
00813 SET_VECTOR(set_vector, set_matrix, uint16_t)
00814 #undef SET_VECTOR
00815 
00816 #define SET_MATRIX(fname, sg_type, fprt_type, type_str) \
00817 void CAsciiFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec)    \
00818 {                                                                                   \
00819     if (!(file && matrix))                                                          \
00820         SG_ERROR("File or matrix invalid.\n");                                      \
00821                                                                                     \
00822     for (int32_t i=0; i<num_vec; i++)                                               \
00823     {                                                                               \
00824         for (int32_t j=0; j<num_feat; j++)                                          \
00825         {                                                                           \
00826             sg_type v=matrix[num_feat*i+j];                                         \
00827             if (j==num_feat-1)                                                      \
00828                 fprintf(file, type_str "\n", (fprt_type) v);                        \
00829             else                                                                    \
00830                 fprintf(file, type_str " ", (fprt_type) v);                         \
00831         }                                                                           \
00832     }                                                                               \
00833 }
00834 SET_MATRIX(set_matrix, char, char, "%c")
00835 SET_MATRIX(set_matrix, uint8_t, uint8_t, "%u")
00836 SET_MATRIX(set_int8_matrix, int8_t, int8_t, "%d")
00837 SET_MATRIX(set_matrix, int32_t, int32_t, "%i")
00838 SET_MATRIX(set_uint_matrix, uint32_t, uint32_t, "%u")
00839 SET_MATRIX(set_long_matrix, int64_t, long long int, "%lli")
00840 SET_MATRIX(set_ulong_matrix, uint64_t, long long unsigned int, "%llu")
00841 SET_MATRIX(set_matrix, int16_t, int16_t, "%i")
00842 SET_MATRIX(set_matrix, uint16_t, uint16_t, "%u")
00843 SET_MATRIX(set_matrix, float32_t, float32_t, "%f")
00844 SET_MATRIX(set_matrix, float64_t, float64_t, "%f")
00845 SET_MATRIX(set_longreal_matrix, floatmax_t, floatmax_t, "%Lf")
00846 #undef SET_MATRIX
00847 
00848 #define SET_NDARRAY(fname, sg_type, fprt_type, type_str) \
00849 void CAsciiFile::fname(const sg_type* array, int32_t * dims, int32_t num_dims)  \
00850 {                                       \
00851     if (!(file && array))                           \
00852         SG_ERROR("File or data invalid.\n");                \
00853                                         \
00854         size_t total = 1;                               \
00855         for(int i = 0;i < num_dims;i++)                         \
00856             total *= dims[i];                                           \
00857         int32_t block_size = dims[num_dims-1];                                  \
00858                                                                         \
00859         fprintf(file,"%d ",num_dims);                       \
00860         for(int i = 0;i < num_dims;i++)                     \
00861             fprintf(file,"%d ",dims[i]);                        \
00862         fprintf(file,"\n");                             \
00863                                                                                 \
00864         for (size_t i=0; i < total; i++)                    \
00865     {                                   \
00866         sg_type v= array[i];                        \
00867         if ( ((i+1) % block_size) == 0)                 \
00868             fprintf(file, type_str "\n", (fprt_type) v);        \
00869         else                                \
00870             fprintf(file, type_str " ", (fprt_type) v);     \
00871     }                                   \
00872 }
00873 
00874 SET_NDARRAY(set_ndarray, char, char, "%c")
00875 SET_NDARRAY(set_ndarray, uint8_t, uint8_t, "%u")
00876 SET_NDARRAY(set_int8_ndarray, int8_t, int8_t, "%d")
00877 SET_NDARRAY(set_ndarray, int32_t, int32_t, "%i")
00878 SET_NDARRAY(set_uint_ndarray, uint32_t, uint32_t, "%u")
00879 SET_NDARRAY(set_long_ndarray, int64_t, long long int, "%lli")
00880 SET_NDARRAY(set_ulong_ndarray, uint64_t, long long unsigned int, "%llu")
00881 SET_NDARRAY(set_ndarray, int16_t, int16_t, "%i")
00882 SET_NDARRAY(set_ndarray, uint16_t, uint16_t, "%u")
00883 SET_NDARRAY(set_ndarray, float32_t, float32_t, "%f")
00884 SET_NDARRAY(set_ndarray, float64_t, float64_t, "%f")
00885 SET_NDARRAY(set_longreal_ndarray, floatmax_t, floatmax_t, "%Lf")
00886 #undef SET_NDARRAY
00887 
00888 #define SET_SPARSEMATRIX(fname, sg_type, fprt_type, type_str) \
00889 void CAsciiFile::fname(const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec)    \
00890 {                                                                                           \
00891     if (!(file && matrix))                                                                  \
00892         SG_ERROR("File or matrix invalid.\n");                                              \
00893                                                                                             \
00894     for (int32_t i=0; i<num_vec; i++)                                                       \
00895     {                                                                                       \
00896         SGSparseVectorEntry<sg_type>* vec = matrix[i].features;                                 \
00897         int32_t len=matrix[i].num_feat_entries;                                             \
00898                                                                                             \
00899         for (int32_t j=0; j<len; j++)                                                       \
00900         {                                                                                   \
00901             if (j<len-1)                                                                    \
00902             {                                                                               \
00903                 fprintf(file, "%d:" type_str " ",                                           \
00904                         (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry);           \
00905             }                                                                               \
00906             else                                                                            \
00907             {                                                                               \
00908                 fprintf(file, "%d:" type_str "\n",                                          \
00909                         (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry);           \
00910             }                                                                               \
00911         }                                                                                   \
00912     }                                                                                       \
00913 }
00914 SET_SPARSEMATRIX(set_sparse_matrix, bool, uint8_t, "%u")
00915 SET_SPARSEMATRIX(set_sparse_matrix, char, char, "%c")
00916 SET_SPARSEMATRIX(set_sparse_matrix, uint8_t, uint8_t, "%u")
00917 SET_SPARSEMATRIX(set_int8_sparsematrix, int8_t, int8_t, "%d")
00918 SET_SPARSEMATRIX(set_sparse_matrix, int32_t, int32_t, "%i")
00919 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, uint32_t, "%u")
00920 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, long long int, "%lli")
00921 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, long long unsigned int, "%llu")
00922 SET_SPARSEMATRIX(set_sparse_matrix, int16_t, int16_t, "%i")
00923 SET_SPARSEMATRIX(set_sparse_matrix, uint16_t, uint16_t, "%u")
00924 SET_SPARSEMATRIX(set_sparse_matrix, float32_t, float32_t, "%f")
00925 SET_SPARSEMATRIX(set_sparse_matrix, float64_t, float64_t, "%f")
00926 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, floatmax_t, "%Lf")
00927 #undef SET_SPARSEMATRIX
00928 
00929 void CAsciiFile::set_string_list(const SGString<uint8_t>* strings, int32_t num_str)
00930 {
00931     if (!(file && strings))
00932         SG_ERROR("File or strings invalid.\n");
00933 
00934     for (int32_t i=0; i<num_str; i++)
00935     {
00936         int32_t len = strings[i].slen;
00937         fwrite(strings[i].string, sizeof(uint8_t), len, file);
00938         fprintf(file, "\n");
00939     }
00940 }
00941 
00942 void CAsciiFile::set_int8_string_list(const SGString<int8_t>* strings, int32_t num_str)
00943 {
00944     if (!(file && strings))
00945         SG_ERROR("File or strings invalid.\n");
00946 
00947     for (int32_t i=0; i<num_str; i++)
00948     {
00949         int32_t len = strings[i].slen;
00950         fwrite(strings[i].string, sizeof(int8_t), len, file);
00951         fprintf(file, "\n");
00952     }
00953 }
00954 
00955 void CAsciiFile::set_string_list(const SGString<char>* strings, int32_t num_str)
00956 {
00957     if (!(file && strings))
00958         SG_ERROR("File or strings invalid.\n");
00959 
00960     for (int32_t i=0; i<num_str; i++)
00961     {
00962         int32_t len = strings[i].slen;
00963         fwrite(strings[i].string, sizeof(char), len, file);
00964         fprintf(file, "\n");
00965     }
00966 }
00967 
00968 void CAsciiFile::set_string_list(const SGString<int32_t>* strings, int32_t num_str)
00969 {
00970 }
00971 
00972 void CAsciiFile::set_uint_string_list(const SGString<uint32_t>* strings, int32_t num_str)
00973 {
00974 }
00975 
00976 void CAsciiFile::set_string_list(const SGString<int16_t>* strings, int32_t num_str)
00977 {
00978 }
00979 
00980 void CAsciiFile::set_string_list(const SGString<uint16_t>* strings, int32_t num_str)
00981 {
00982 }
00983 
00984 void CAsciiFile::set_long_string_list(const SGString<int64_t>* strings, int32_t num_str)
00985 {
00986 }
00987 
00988 void CAsciiFile::set_ulong_string_list(const SGString<uint64_t>* strings, int32_t num_str)
00989 {
00990 }
00991 
00992 void CAsciiFile::set_string_list(const SGString<float32_t>* strings, int32_t num_str)
00993 {
00994 }
00995 
00996 void CAsciiFile::set_string_list(const SGString<float64_t>* strings, int32_t num_str)
00997 {
00998 }
00999 
01000 void CAsciiFile::set_longreal_string_list(const SGString<floatmax_t>* strings, int32_t num_str)
01001 {
01002 }
01003 
01004 template <class T> void CAsciiFile::append_item(
01005     DynArray<T>* items, char* ptr_data, char* ptr_item)
01006 {
01007     size_t len=(ptr_data-ptr_item)/sizeof(char);
01008     char* item=SG_MALLOC(char, len+1);
01009     memset(item, 0, sizeof(char)*(len+1));
01010     item=strncpy(item, ptr_item, len);
01011 
01012     SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
01013     items->append_element(item);
01014 }
01015 
01016 #ifdef __MACH__
01017 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
01018 {
01019     int32_t total_bytes_read=0;
01020     int32_t default_size=10;
01021                 
01022     if ((lineptr == NULL) || (n == NULL) || (stream == NULL))
01023         return -1;
01024         
01025     if ((*lineptr == NULL) && (*n == 0))
01026     {
01027         *lineptr=SG_MALLOC(char, default_size);
01028         *n=default_size;
01029     }
01030         
01031     int32_t bytes_read, pos=-1;
01032     int32_t threshold_size=100000;
01033 
01034     while (1)
01035     {
01036         // We need some limit in case file does not contain '\n'
01037         if (*n > threshold_size)
01038             return -1;
01039                 
01040         // Read from file and append to buffer
01041         bytes_read=fread(*lineptr+total_bytes_read, sizeof(char), *n-total_bytes_read, stream);
01042 
01043         for (int i=0; i<bytes_read; i++)
01044         {
01045             if ((*lineptr)[total_bytes_read+i] == delimiter)
01046             {
01047                 pos=i;
01048                 break;
01049             }
01050         }
01051 
01052         if (pos==-1)
01053         {
01054             if (feof(stream))
01055                 return -1;
01056             total_bytes_read+=bytes_read;
01057             *lineptr=SG_REALLOC(char, *lineptr, (*n)*2);
01058             *n=(*n)*2;
01059             // A better reallocated size should be used
01060         }
01061         else
01062         {
01063             total_bytes_read+=pos+1;
01064             (*lineptr)[total_bytes_read]='\0';
01065             // Seek back to position after \n
01066             fseek(stream, (bytes_read-pos-1) * -1, SEEK_CUR);
01067             return total_bytes_read;
01068         }
01069     }
01070 }
01071 
01072 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
01073 {
01074     return getdelim(lineptr, n, '\n', stream);
01075 }
01076 
01077 #else
01078 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
01079 {
01080     return ::getdelim(lineptr, n, delimiter, stream);
01081 }
01082 
01083 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
01084 {
01085     return ::getline(lineptr, n, stream);
01086 }
01087 #endif
01088 
01089 void CAsciiFile::tokenize(char delim, substring s, v_array<substring>& ret)
01090 {
01091     ret.erase();
01092     char *last = s.start;
01093     for (; s.start != s.end; s.start++)
01094     {
01095         if (*s.start == delim)
01096         {
01097             if (s.start != last)
01098             {
01099                 substring temp = {last,s.start};
01100                 ret.push(temp);
01101             }
01102             last = s.start+1;
01103         }
01104     }
01105     if (s.start != last)
01106     {
01107         substring final = {last, s.start};
01108         ret.push(final);
01109     }
01110 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation