AsciiFile.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Parts of this code are copyright (c) 2009 Yahoo! Inc.
00008  * All rights reserved.  The copyrights embodied in the content of
00009  * this file are licensed under the BSD (revised) open source license.
00010  *
00011  * Written (W) 2010 Soeren Sonnenburg
00012  * Copyright (C) 2010 Berlin Institute of Technology
00013  */
00014 
00015 #include <shogun/features/SparseFeatures.h>
00016 #include <shogun/io/File.h>
00017 #include <shogun/io/AsciiFile.h>
00018 #include <shogun/mathematics/Math.h>
00019 #include <ctype.h>
00020 #include <stdio.h>
00021 
00022 using namespace shogun;
00023 
00024 CAsciiFile::CAsciiFile()
00025 {
00026     SG_UNSTABLE("CAsciiFile::CAsciiFile()", "\n");
00027 }
00028 
00029 CAsciiFile::CAsciiFile(FILE* f, const char* name) : CFile(f, name)
00030 {
00031 }
00032 
00033 CAsciiFile::CAsciiFile(const char* fname, char rw, const char* name) : CFile(fname, rw, name)
00034 {
00035 }
00036 
00037 CAsciiFile::~CAsciiFile()
00038 {
00039 }
00040 
00041 #define GET_VECTOR(fname, mfname, sg_type) \
00042 void CAsciiFile::fname(sg_type*& vec, int32_t& len) \
00043 {                                                   \
00044     vec=NULL;                                       \
00045     len=0;                                          \
00046     int32_t num_feat=0;                             \
00047     int32_t num_vec=0;                              \
00048     mfname(vec, num_feat, num_vec);                 \
00049     if ((num_feat==1) || (num_vec==1))              \
00050     {                                               \
00051         if (num_feat==1)                            \
00052             len=num_vec;                            \
00053         else                                        \
00054             len=num_feat;                           \
00055     }                                               \
00056     else                                            \
00057     {                                               \
00058         SG_FREE(vec);                               \
00059         vec=NULL;                                   \
00060         len=0;                                      \
00061         SG_ERROR("Could not read vector from"       \
00062                 " file %s (shape %dx%d found but "  \
00063                 "vector expected).\n", filename,    \
00064                 num_vec, num_feat);                 \
00065     }                                               \
00066 }
00067 
00068 GET_VECTOR(get_vector, get_int8_matrix, int8_t)
00069 GET_VECTOR(get_vector, get_matrix, uint8_t)
00070 GET_VECTOR(get_vector, get_matrix, char)
00071 GET_VECTOR(get_vector, get_matrix, int32_t)
00072 GET_VECTOR(get_vector, get_uint_matrix, uint32_t)
00073 GET_VECTOR(get_vector, get_matrix, float32_t)
00074 GET_VECTOR(get_vector, get_matrix, float64_t)
00075 GET_VECTOR(get_vector, get_longreal_matrix, floatmax_t)
00076 GET_VECTOR(get_vector, get_matrix, int16_t)
00077 GET_VECTOR(get_vector, get_matrix, uint16_t)
00078 GET_VECTOR(get_vector, get_long_matrix, int64_t)
00079 GET_VECTOR(get_vector, get_ulong_matrix, uint64_t)
00080 #undef GET_VECTOR
00081 
00082 #define GET_MATRIX(fname, conv, sg_type)                                        \
00083 void CAsciiFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec)   \
00084 {                                                                               \
00085     struct stat stats;                                                          \
00086     if (stat(filename, &stats)!=0)                                              \
00087         SG_ERROR("Could not get file statistics.\n");                           \
00088                                                                                 \
00089     char* data=SG_MALLOC(char, stats.st_size+1);                                        \
00090     memset(data, 0, sizeof(char)*(stats.st_size+1));                            \
00091     size_t nread=fread(data, sizeof(char), stats.st_size, file);                \
00092     if (nread<=0)                                                               \
00093         SG_ERROR("Could not read data from %s.\n", filename);                   \
00094                                                                                 \
00095     SG_DEBUG("data read from file:\n%s\n", data);                               \
00096                                                                                 \
00097     /* determine num_feat and num_vec, populate dynamic array */                \
00098     int32_t nf=0;                                                               \
00099     num_feat=0;                                                                 \
00100     num_vec=0;                                                                  \
00101     char* ptr_item=NULL;                                                        \
00102     char* ptr_data=data;                                                        \
00103     DynArray<char*>* items=new DynArray<char*>();                       \
00104                                                                                 \
00105     while (*ptr_data)                                                           \
00106     {                                                                           \
00107         if (*ptr_data=='\n')                                                    \
00108         {                                                                       \
00109             if (ptr_item)                                                       \
00110                 nf++;                                                           \
00111                                                                                 \
00112             if (num_feat!=0 && nf!=num_feat)                                    \
00113                 SG_ERROR("Number of features mismatches (%d != %d) in vector"   \
00114                         " %d in file %s.\n", num_feat, nf, num_vec, filename);  \
00115                                                                                 \
00116             append_item(items, ptr_data, ptr_item);                             \
00117             num_feat=nf;                                                        \
00118             num_vec++;                                                          \
00119             nf=0;                                                               \
00120             ptr_item=NULL;                                                      \
00121         }                                                                       \
00122         else if (!isblank(*ptr_data) && !ptr_item)                              \
00123         {                                                                       \
00124             ptr_item=ptr_data;                                                  \
00125         }                                                                       \
00126         else if (isblank(*ptr_data) && ptr_item)                                \
00127         {                                                                       \
00128             append_item(items, ptr_data, ptr_item);                             \
00129             ptr_item=NULL;                                                      \
00130             nf++;                                                               \
00131         }                                                                       \
00132                                                                                 \
00133         ptr_data++;                                                             \
00134     }                                                                           \
00135                                                                                 \
00136     SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec);                  \
00137     SG_FREE(data);                                                              \
00138                                                                                 \
00139     /* now copy data into matrix */                                             \
00140     matrix=SG_MALLOC(sg_type, num_vec*num_feat);                                        \
00141     for (int32_t i=0; i<num_vec; i++)                                           \
00142     {                                                                           \
00143         for (int32_t j=0; j<num_feat; j++)                                      \
00144         {                                                                       \
00145             char* item=items->get_element(i*num_feat+j);                        \
00146             matrix[i*num_feat+j]=conv(item);                                    \
00147             SG_FREE(item);                                                      \
00148         }                                                                       \
00149     }                                                                           \
00150     delete items;                                                               \
00151 }
00152 
00153 GET_MATRIX(get_matrix, atoi, uint8_t)
00154 GET_MATRIX(get_int8_matrix, atoi, int8_t)
00155 GET_MATRIX(get_matrix, atoi, char)
00156 GET_MATRIX(get_matrix, atoi, int32_t)
00157 GET_MATRIX(get_uint_matrix, atoi, uint32_t)
00158 GET_MATRIX(get_long_matrix, atoll, int64_t)
00159 GET_MATRIX(get_ulong_matrix, atoll, uint64_t)
00160 GET_MATRIX(get_matrix, atof, float32_t)
00161 GET_MATRIX(get_matrix, atof, float64_t)
00162 GET_MATRIX(get_longreal_matrix, atof, floatmax_t)
00163 GET_MATRIX(get_matrix, atoi, int16_t)
00164 GET_MATRIX(get_matrix, atoi, uint16_t)
00165 #undef GET_MATRIX
00166 
00167 #define GET_NDARRAY(fname, conv, sg_type)                           \
00168 void CAsciiFile::fname(sg_type*& array, int32_t *& dims, int32_t & num_dims)            \
00169 {                                               \
00170     struct stat stats;                                  \
00171     if (stat(filename, &stats)!=0)                              \
00172         SG_ERROR("Could not get file statistics.\n");                   \
00173                                                 \
00174     char* data=SG_MALLOC(char, stats.st_size+1);                            \
00175     memset(data, 0, sizeof(char)*(stats.st_size+1));                    \
00176     size_t nread=fread(data, sizeof(char), stats.st_size, file);                \
00177     if (nread<=0)                                       \
00178         SG_ERROR("Could not read data from %s.\n", filename);               \
00179                                                 \
00180     SG_DEBUG("data read from file:\n%s\n", data);                       \
00181                                                 \
00182     /* determine size of array */                               \
00183     int32_t length=0;                                   \
00184     int32_t counter=0;                                                                  \
00185     size_t total=0;                                                         \
00186         num_dims = -1;                                                  \
00187     char* ptr_item=NULL;                                    \
00188     char* ptr_data=data;                                    \
00189     DynArray<char*>* items=new DynArray<char*>();                       \
00190                                                                                                 \
00191         /* read line with sizes of array*/                                          \
00192         while(*ptr_data != '\n')                                                        \
00193         {                                                                                       \
00194             if(isblank(*ptr_data) && ptr_item)                                      \
00195             {                                                                                   \
00196                 append_item(items, ptr_data, ptr_item);                         \
00197                 num_dims++;                                                                 \
00198                 ptr_item = NULL;                                                            \
00199             }                                                                                   \
00200             else if(!isblank(*ptr_data) && !ptr_item)                               \
00201                 ptr_item = ptr_data;                                                            \
00202                                                                                                 \
00203             ptr_data++;                                                                         \
00204         }                                                                                       \
00205         ptr_item = NULL;                                                                        \
00206         ptr_data++;                                                                             \
00207                                                     \
00208     /* read array data*/                                                                    \
00209     while(*ptr_data)                                    \
00210     {                                           \
00211         if (*ptr_data=='\n')                                \
00212         {                                       \
00213             if (ptr_item)                               \
00214                 counter++;                          \
00215                                                 \
00216             if (length!=0 && counter!=length)                   \
00217                 SG_ERROR("Invalid number of data (%d != %d) in line"        \
00218                 " %d in file %s.\n", length, counter, total, filename);     \
00219                                                 \
00220             append_item(items, ptr_data, ptr_item);                 \
00221             length=counter;                             \
00222             total++;                                \
00223             counter=0;                              \
00224             ptr_item=NULL;                              \
00225         }                                       \
00226         else if (!isblank(*ptr_data) && !ptr_item)                  \
00227         {                                       \
00228             ptr_item=ptr_data;                          \
00229         }                                       \
00230         else if (isblank(*ptr_data) && ptr_item)                    \
00231         {                                       \
00232             append_item(items, ptr_data, ptr_item);                 \
00233             ptr_item=NULL;                              \
00234             counter++;                              \
00235         }                                       \
00236                                                 \
00237         ptr_data++;                                 \
00238     }                                           \
00239                                                 \
00240     SG_DEBUG("num of data in line: %d, num of lines %d\n", counter, total);         \
00241     SG_FREE(data);                                      \
00242                                                 \
00243     /* determining sizes of dimensions*/                                                    \
00244         char * item;                                                                            \
00245         item=items->get_element(0);                                                             \
00246         if(atoi(item) != num_dims)                                                              \
00247             SG_ERROR("Invalid number of dimensions!\n");                                    \
00248         SG_FREE(item);                                                                          \
00249         dims = SG_MALLOC(int32_t, num_dims);                                                           \
00250         for(int32_t i =0;i < num_dims;i++)                                                  \
00251         {                                                                                       \
00252             item = items->get_element(i+1);                                             \
00253             dims[i] = atoi(item);                                                               \
00254             SG_FREE(item);                                                                      \
00255         }                                                                                       \
00256         if (dims[num_dims-1] != length)                                                     \
00257             SG_ERROR("Invalid number of lines in file!\n");                             \
00258                                                                                             \
00259         /* converting array data */                             \
00260         total *= length;                                    \
00261     array=SG_MALLOC(sg_type, total);                                    \
00262     for (size_t i=0; i<total; i++)                              \
00263     {                                           \
00264             item=items->get_element(i+(num_dims+1));                \
00265             array[i]=conv(item);                            \
00266             SG_FREE(item);                              \
00267     }                                           \
00268     delete items;                                       \
00269 }
00270 
00271 GET_NDARRAY(get_ndarray, atoi, uint8_t)
00272 GET_NDARRAY(get_int8_ndarray, atoi, int8_t)
00273 GET_NDARRAY(get_ndarray, atoi, char)
00274 GET_NDARRAY(get_ndarray, atoi, int32_t)
00275 GET_NDARRAY(get_uint_ndarray, atoi, uint32_t)
00276 GET_NDARRAY(get_long_ndarray, atoll, int64_t)
00277 GET_NDARRAY(get_ulong_ndarray, atoll, uint64_t)
00278 GET_NDARRAY(get_ndarray, atof, float32_t)
00279 GET_NDARRAY(get_ndarray, atof, float64_t)
00280 GET_NDARRAY(get_longreal_ndarray, atof, floatmax_t)
00281 GET_NDARRAY(get_ndarray, atoi, int16_t)
00282 GET_NDARRAY(get_ndarray, atoi, uint16_t)
00283 #undef GET_NDARRAY
00284 
00285 #define GET_SPARSEMATRIX(fname, conv, sg_type)                                      \
00286 void CAsciiFile::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec)   \
00287 {   \
00288     size_t blocksize=1024*1024; \
00289     size_t required_blocksize=blocksize;    \
00290     uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);   \
00291     \
00292     if (file)   \
00293     {   \
00294         num_vec=0;  \
00295         num_feat=0; \
00296     \
00297         SG_INFO("counting line numbers in file %s\n", filename);    \
00298         size_t sz=blocksize;    \
00299         size_t block_offs=0;    \
00300         size_t old_block_offs=0;    \
00301         fseek(file, 0, SEEK_END);   \
00302         size_t fsize=ftell(file);   \
00303         rewind(file);   \
00304     \
00305         while (sz == blocksize) \
00306         {   \
00307             sz=fread(dummy, sizeof(uint8_t), blocksize, file);  \
00308             for (size_t i=0; i<sz; i++) \
00309             {   \
00310                 block_offs++;   \
00311                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))    \
00312                 {   \
00313                     num_vec++;  \
00314                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); \
00315                     old_block_offs=block_offs;  \
00316                 }   \
00317             }   \
00318             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");    \
00319         }   \
00320     \
00321         SG_INFO("found %d feature vectors\n", num_vec); \
00322         SG_FREE(dummy); \
00323         blocksize=required_blocksize;   \
00324         dummy = SG_MALLOC(uint8_t, blocksize+1); /*allow setting of '\0' at EOL*/   \
00325         matrix=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \
00326         for (int i=0; i<num_vec; i++)   \
00327             new (&matrix[i]) SGSparseVector<sg_type>(); \
00328         rewind(file);   \
00329         sz=blocksize;   \
00330         int32_t lines=0;    \
00331         while (sz == blocksize) \
00332         {   \
00333             sz=fread(dummy, sizeof(uint8_t), blocksize, file);  \
00334     \
00335             size_t old_sz=0;    \
00336             for (size_t i=0; i<sz; i++) \
00337             {   \
00338                 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) \
00339                 {   \
00340                     size_t len=i-old_sz+1;  \
00341                     uint8_t* data=&dummy[old_sz];   \
00342     \
00343                     for (size_t j=0; j<len; j++)    \
00344                         dummy[j]=data[j];   \
00345     \
00346                     sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file);  \
00347                     i=0;    \
00348                     old_sz=0;   \
00349                     sz+=len;    \
00350                 }   \
00351     \
00352                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))    \
00353                 {   \
00354     \
00355                     size_t len=i-old_sz;    \
00356                     uint8_t* data=&dummy[old_sz];   \
00357     \
00358                     int32_t dims=0; \
00359                     for (size_t j=0; j<len; j++)    \
00360                     {   \
00361                         if (data[j]==':')   \
00362                             dims++; \
00363                     }   \
00364     \
00365                     if (dims<=0)    \
00366                     {   \
00367                         SG_ERROR("Error in line %d - number of" \
00368                                 " dimensions is %d line is %d characters"   \
00369                                 " long\n line_content:'%.*s'\n", lines, \
00370                                 dims, len, len, (const char*) data);    \
00371                     }   \
00372     \
00373                     SGSparseVectorEntry<sg_type>* feat=SG_MALLOC(SGSparseVectorEntry<sg_type>, dims);   \
00374     \
00375                     /* skip label part */   \
00376                     size_t j=0; \
00377                     for (; j<len; j++)  \
00378                     {   \
00379                         if (data[j]==':')   \
00380                         {   \
00381                             j=-1; /* file without label*/   \
00382                             break;  \
00383                         }   \
00384     \
00385                         if (data[j]==' ')   \
00386                         {   \
00387                             data[j]='\0';   \
00388     \
00389                             /* skip label part */   \
00390                             break;  \
00391                         }   \
00392                     }   \
00393     \
00394                     int32_t d=0;    \
00395                     j++;    \
00396                     uint8_t* start=&data[j];    \
00397                     for (; j<len; j++)  \
00398                     {   \
00399                         if (data[j]==':')   \
00400                         {   \
00401                             data[j]='\0';   \
00402     \
00403                             feat[d].feat_index=(int32_t) atoi((const char*) start)-1;   \
00404                             num_feat=CMath::max(num_feat, feat[d].feat_index+1);    \
00405     \
00406                             j++;    \
00407                             start=&data[j]; \
00408                             for (; j<len; j++)  \
00409                             {   \
00410                                 if (data[j]==' ' || data[j]=='\n')  \
00411                                 {   \
00412                                     data[j]='\0';   \
00413                                     feat[d].entry=(sg_type) conv((const char*) start);  \
00414                                     d++;    \
00415                                     break;  \
00416                                 }   \
00417                             }   \
00418     \
00419                             if (j==len) \
00420                             {   \
00421                                 data[j]='\0';   \
00422                                 feat[dims-1].entry=(sg_type) conv((const char*) start); \
00423                             }   \
00424     \
00425                             j++;    \
00426                             start=&data[j]; \
00427                         }   \
00428                     }   \
00429     \
00430                     matrix[lines].num_feat_entries=dims;    \
00431                     matrix[lines].features=feat;    \
00432     \
00433                     old_sz=i+1; \
00434                     lines++;    \
00435                     SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t");    \
00436                 }   \
00437             }   \
00438         }   \
00439     \
00440         SG_INFO("file successfully read\n");    \
00441     }   \
00442     \
00443     SG_FREE(dummy); \
00444 }
00445 
00446 GET_SPARSEMATRIX(get_sparse_matrix, atoi, bool)
00447 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint8_t)
00448 GET_SPARSEMATRIX(get_int8_sparsematrix, atoi, int8_t)
00449 GET_SPARSEMATRIX(get_sparse_matrix, atoi, char)
00450 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int32_t)
00451 GET_SPARSEMATRIX(get_uint_sparsematrix, atoi, uint32_t)
00452 GET_SPARSEMATRIX(get_long_sparsematrix, atoll, int64_t)
00453 GET_SPARSEMATRIX(get_ulong_sparsematrix, atoll, uint64_t)
00454 GET_SPARSEMATRIX(get_sparse_matrix, atof, float32_t)
00455 GET_SPARSEMATRIX(get_sparse_matrix, atof, float64_t)
00456 GET_SPARSEMATRIX(get_longreal_sparsematrix, atof, floatmax_t)
00457 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int16_t)
00458 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint16_t)
00459 #undef GET_SPARSEMATRIX
00460 
00461 
00462 void CAsciiFile::get_string_list(SGString<uint8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00463 {
00464     size_t blocksize=1024*1024;
00465     size_t required_blocksize=0;
00466     uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00467     uint8_t* overflow=NULL;
00468     int32_t overflow_len=0;
00469 
00470     if (file)
00471     {
00472         num_str=0;
00473         max_string_len=0;
00474 
00475         SG_INFO("counting line numbers in file %s\n", filename);
00476         size_t sz=blocksize;
00477         size_t block_offs=0;
00478         size_t old_block_offs=0;
00479         fseek(file, 0, SEEK_END);
00480         size_t fsize=ftell(file);
00481         rewind(file);
00482 
00483         while (sz == blocksize)
00484         {
00485             sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00486             for (size_t i=0; i<sz; i++)
00487             {
00488                 block_offs++;
00489                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00490                 {
00491                     num_str++;
00492                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00493                     old_block_offs=block_offs;
00494                 }
00495             }
00496             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00497         }
00498 
00499         SG_INFO("found %d strings\n", num_str);
00500         SG_DEBUG("block_size=%d\n", required_blocksize);
00501         SG_FREE(dummy);
00502         blocksize=required_blocksize;
00503         dummy=SG_MALLOC(uint8_t, blocksize);
00504         overflow=SG_MALLOC(uint8_t, blocksize);
00505         strings=SG_MALLOC(SGString<uint8_t>, num_str);
00506 
00507         rewind(file);
00508         sz=blocksize;
00509         int32_t lines=0;
00510         size_t old_sz=0;
00511         while (sz == blocksize)
00512         {
00513             sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00514 
00515             old_sz=0;
00516             for (size_t i=0; i<sz; i++)
00517             {
00518                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00519                 {
00520                     int32_t len=i-old_sz;
00521                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00522 
00523                     strings[lines].slen=len+overflow_len;
00524                     strings[lines].string=SG_MALLOC(uint8_t, len+overflow_len);
00525 
00526                     for (int32_t j=0; j<overflow_len; j++)
00527                         strings[lines].string[j]=overflow[j];
00528                     for (int32_t j=0; j<len; j++)
00529                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00530 
00531                     // clear overflow
00532                     overflow_len=0;
00533 
00534                     //CMath::display_vector(strings[lines].string, len);
00535                     old_sz=i+1;
00536                     lines++;
00537                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00538                 }
00539             }
00540 
00541             for (size_t i=old_sz; i<sz; i++)
00542                 overflow[i-old_sz]=dummy[i];
00543 
00544             overflow_len=sz-old_sz;
00545         }
00546         SG_INFO("file successfully read\n");
00547         SG_INFO("max_string_length=%d\n", max_string_len);
00548         SG_INFO("num_strings=%d\n", num_str);
00549     }
00550 
00551     SG_FREE(dummy);
00552     SG_FREE(overflow);
00553 }
00554 
00555 void CAsciiFile::get_int8_string_list(SGString<int8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00556 {
00557     size_t blocksize=1024*1024;
00558     size_t required_blocksize=0;
00559     int8_t* dummy=SG_MALLOC(int8_t, blocksize);
00560     int8_t* overflow=NULL;
00561     int32_t overflow_len=0;
00562 
00563     if (file)
00564     {
00565         num_str=0;
00566         max_string_len=0;
00567 
00568         SG_INFO("counting line numbers in file %s\n", filename);
00569         size_t sz=blocksize;
00570         size_t block_offs=0;
00571         size_t old_block_offs=0;
00572         fseek(file, 0, SEEK_END);
00573         size_t fsize=ftell(file);
00574         rewind(file);
00575 
00576         while (sz == blocksize)
00577         {
00578             sz=fread(dummy, sizeof(int8_t), blocksize, file);
00579             for (size_t i=0; i<sz; i++)
00580             {
00581                 block_offs++;
00582                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00583                 {
00584                     num_str++;
00585                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00586                     old_block_offs=block_offs;
00587                 }
00588             }
00589             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00590         }
00591 
00592         SG_INFO("found %d strings\n", num_str);
00593         SG_DEBUG("block_size=%d\n", required_blocksize);
00594         SG_FREE(dummy);
00595         blocksize=required_blocksize;
00596         dummy=SG_MALLOC(int8_t, blocksize);
00597         overflow=SG_MALLOC(int8_t, blocksize);
00598         strings=SG_MALLOC(SGString<int8_t>, num_str);
00599 
00600         rewind(file);
00601         sz=blocksize;
00602         int32_t lines=0;
00603         size_t old_sz=0;
00604         while (sz == blocksize)
00605         {
00606             sz=fread(dummy, sizeof(int8_t), blocksize, file);
00607 
00608             old_sz=0;
00609             for (size_t i=0; i<sz; i++)
00610             {
00611                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00612                 {
00613                     int32_t len=i-old_sz;
00614                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00615 
00616                     strings[lines].slen=len+overflow_len;
00617                     strings[lines].string=SG_MALLOC(int8_t, len+overflow_len);
00618 
00619                     for (int32_t j=0; j<overflow_len; j++)
00620                         strings[lines].string[j]=overflow[j];
00621                     for (int32_t j=0; j<len; j++)
00622                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00623 
00624                     // clear overflow
00625                     overflow_len=0;
00626 
00627                     //CMath::display_vector(strings[lines].string, len);
00628                     old_sz=i+1;
00629                     lines++;
00630                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00631                 }
00632             }
00633 
00634             for (size_t i=old_sz; i<sz; i++)
00635                 overflow[i-old_sz]=dummy[i];
00636 
00637             overflow_len=sz-old_sz;
00638         }
00639         SG_INFO("file successfully read\n");
00640         SG_INFO("max_string_length=%d\n", max_string_len);
00641         SG_INFO("num_strings=%d\n", num_str);
00642     }
00643 
00644     SG_FREE(dummy);
00645     SG_FREE(overflow);
00646 }
00647 
00648 void CAsciiFile::get_string_list(SGString<char>*& strings, int32_t& num_str, int32_t& max_string_len)
00649 {
00650     size_t blocksize=1024*1024;
00651     size_t required_blocksize=0;
00652     char* dummy=SG_MALLOC(char, blocksize);
00653     char* overflow=NULL;
00654     int32_t overflow_len=0;
00655 
00656     if (file)
00657     {
00658         num_str=0;
00659         max_string_len=0;
00660 
00661         SG_INFO("counting line numbers in file %s\n", filename);
00662         size_t sz=blocksize;
00663         size_t block_offs=0;
00664         size_t old_block_offs=0;
00665         fseek(file, 0, SEEK_END);
00666         size_t fsize=ftell(file);
00667         rewind(file);
00668 
00669         while (sz == blocksize)
00670         {
00671             sz=fread(dummy, sizeof(char), blocksize, file);
00672             for (size_t i=0; i<sz; i++)
00673             {
00674                 block_offs++;
00675                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00676                 {
00677                     num_str++;
00678                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00679                     old_block_offs=block_offs;
00680                 }
00681             }
00682             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00683         }
00684 
00685         SG_INFO("found %d strings\n", num_str);
00686         SG_DEBUG("block_size=%d\n", required_blocksize);
00687         SG_FREE(dummy);
00688         blocksize=required_blocksize;
00689         dummy=SG_MALLOC(char, blocksize);
00690         overflow=SG_MALLOC(char, blocksize);
00691         strings=SG_MALLOC(SGString<char>, num_str);
00692 
00693         rewind(file);
00694         sz=blocksize;
00695         int32_t lines=0;
00696         size_t old_sz=0;
00697         while (sz == blocksize)
00698         {
00699             sz=fread(dummy, sizeof(char), blocksize, file);
00700 
00701             old_sz=0;
00702             for (size_t i=0; i<sz; i++)
00703             {
00704                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00705                 {
00706                     int32_t len=i-old_sz;
00707                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00708 
00709                     strings[lines].slen=len+overflow_len;
00710                     strings[lines].string=SG_MALLOC(char, len+overflow_len);
00711 
00712                     for (int32_t j=0; j<overflow_len; j++)
00713                         strings[lines].string[j]=overflow[j];
00714                     for (int32_t j=0; j<len; j++)
00715                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00716 
00717                     // clear overflow
00718                     overflow_len=0;
00719 
00720                     //CMath::display_vector(strings[lines].string, len);
00721                     old_sz=i+1;
00722                     lines++;
00723                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00724                 }
00725             }
00726 
00727             for (size_t i=old_sz; i<sz; i++)
00728                 overflow[i-old_sz]=dummy[i];
00729 
00730             overflow_len=sz-old_sz;
00731         }
00732         SG_INFO("file successfully read\n");
00733         SG_INFO("max_string_length=%d\n", max_string_len);
00734         SG_INFO("num_strings=%d\n", num_str);
00735     }
00736 
00737     SG_FREE(dummy);
00738     SG_FREE(overflow);
00739 }
00740 
00741 void CAsciiFile::get_string_list(SGString<int32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00742 {
00743     strings=NULL;
00744     num_str=0;
00745     max_string_len=0;
00746 }
00747 
00748 void CAsciiFile::get_uint_string_list(SGString<uint32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00749 {
00750     strings=NULL;
00751     num_str=0;
00752     max_string_len=0;
00753 }
00754 
00755 void CAsciiFile::get_string_list(SGString<int16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00756 {
00757     strings=NULL;
00758     num_str=0;
00759     max_string_len=0;
00760 }
00761 
00762 void CAsciiFile::get_string_list(SGString<uint16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00763 {
00764     strings=NULL;
00765     num_str=0;
00766     max_string_len=0;
00767 }
00768 
00769 void CAsciiFile::get_long_string_list(SGString<int64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00770 {
00771     strings=NULL;
00772     num_str=0;
00773     max_string_len=0;
00774 }
00775 
00776 void CAsciiFile::get_ulong_string_list(SGString<uint64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00777 {
00778     strings=NULL;
00779     num_str=0;
00780     max_string_len=0;
00781 }
00782 
00783 void CAsciiFile::get_string_list(SGString<float32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00784 {
00785     strings=NULL;
00786     num_str=0;
00787     max_string_len=0;
00788 }
00789 
00790 void CAsciiFile::get_string_list(SGString<float64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00791 {
00792     strings=NULL;
00793     num_str=0;
00794     max_string_len=0;
00795 }
00796 
00797 void CAsciiFile::get_longreal_string_list(SGString<floatmax_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00798 {
00799     strings=NULL;
00800     num_str=0;
00801     max_string_len=0;
00802 }
00803 
00804 
00807 #define SET_VECTOR(fname, mfname, sg_type)  \
00808 void CAsciiFile::fname(const sg_type* vec, int32_t len) \
00809 {                                                           \
00810     mfname(vec, len, 1);                                    \
00811 }
00812 SET_VECTOR(set_vector, set_int8_matrix, int8_t)
00813 SET_VECTOR(set_vector, set_matrix, uint8_t)
00814 SET_VECTOR(set_vector, set_matrix, char)
00815 SET_VECTOR(set_vector, set_matrix, int32_t)
00816 SET_VECTOR(set_vector, set_uint_matrix, uint32_t)
00817 SET_VECTOR(set_vector, set_matrix, float32_t)
00818 SET_VECTOR(set_vector, set_matrix, float64_t)
00819 SET_VECTOR(set_vector, set_longreal_matrix, floatmax_t)
00820 SET_VECTOR(set_vector, set_matrix, int16_t)
00821 SET_VECTOR(set_vector, set_matrix, uint16_t)
00822 SET_VECTOR(set_vector, set_long_matrix, int64_t)
00823 SET_VECTOR(set_vector, set_ulong_matrix, uint64_t)
00824 #undef SET_VECTOR
00825 
00826 #define SET_MATRIX(fname, sg_type, fprt_type, type_str) \
00827 void CAsciiFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec)    \
00828 {                                                                                   \
00829     if (!(file && matrix))                                                          \
00830         SG_ERROR("File or matrix invalid.\n");                                      \
00831                                                                                     \
00832     for (int32_t i=0; i<num_vec; i++)                                               \
00833     {                                                                               \
00834         for (int32_t j=0; j<num_feat; j++)                                          \
00835         {                                                                           \
00836             sg_type v=matrix[num_feat*i+j];                                         \
00837             if (j==num_feat-1)                                                      \
00838                 fprintf(file, type_str "\n", (fprt_type) v);                        \
00839             else                                                                    \
00840                 fprintf(file, type_str " ", (fprt_type) v);                         \
00841         }                                                                           \
00842     }                                                                               \
00843 }
00844 SET_MATRIX(set_matrix, char, char, "%c")
00845 SET_MATRIX(set_matrix, uint8_t, uint8_t, "%u")
00846 SET_MATRIX(set_int8_matrix, int8_t, int8_t, "%d")
00847 SET_MATRIX(set_matrix, int32_t, int32_t, "%i")
00848 SET_MATRIX(set_uint_matrix, uint32_t, uint32_t, "%u")
00849 SET_MATRIX(set_long_matrix, int64_t, long long int, "%lli")
00850 SET_MATRIX(set_ulong_matrix, uint64_t, long long unsigned int, "%llu")
00851 SET_MATRIX(set_matrix, int16_t, int16_t, "%i")
00852 SET_MATRIX(set_matrix, uint16_t, uint16_t, "%u")
00853 SET_MATRIX(set_matrix, float32_t, float32_t, "%.16g")
00854 SET_MATRIX(set_matrix, float64_t, float64_t, "%.16lg")
00855 SET_MATRIX(set_longreal_matrix, floatmax_t, floatmax_t, "%.16Lg")
00856 #undef SET_MATRIX
00857 
00858 #define SET_NDARRAY(fname, sg_type, fprt_type, type_str) \
00859 void CAsciiFile::fname(const sg_type* array, int32_t * dims, int32_t num_dims)  \
00860 {                                       \
00861     if (!(file && array))                           \
00862         SG_ERROR("File or data invalid.\n");                \
00863                                         \
00864         size_t total = 1;                               \
00865         for(int i = 0;i < num_dims;i++)                         \
00866             total *= dims[i];                                           \
00867         int32_t block_size = dims[num_dims-1];                                  \
00868                                                                         \
00869         fprintf(file,"%d ",num_dims);                       \
00870         for(int i = 0;i < num_dims;i++)                     \
00871             fprintf(file,"%d ",dims[i]);                        \
00872         fprintf(file,"\n");                             \
00873                                                                                 \
00874         for (size_t i=0; i < total; i++)                    \
00875     {                                   \
00876         sg_type v= array[i];                        \
00877         if ( ((i+1) % block_size) == 0)                 \
00878             fprintf(file, type_str "\n", (fprt_type) v);        \
00879         else                                \
00880             fprintf(file, type_str " ", (fprt_type) v);     \
00881     }                                   \
00882 }
00883 
00884 SET_NDARRAY(set_ndarray, char, char, "%c")
00885 SET_NDARRAY(set_ndarray, uint8_t, uint8_t, "%u")
00886 SET_NDARRAY(set_int8_ndarray, int8_t, int8_t, "%d")
00887 SET_NDARRAY(set_ndarray, int32_t, int32_t, "%i")
00888 SET_NDARRAY(set_uint_ndarray, uint32_t, uint32_t, "%u")
00889 SET_NDARRAY(set_long_ndarray, int64_t, long long int, "%lli")
00890 SET_NDARRAY(set_ulong_ndarray, uint64_t, long long unsigned int, "%llu")
00891 SET_NDARRAY(set_ndarray, int16_t, int16_t, "%i")
00892 SET_NDARRAY(set_ndarray, uint16_t, uint16_t, "%u")
00893 SET_NDARRAY(set_ndarray, float32_t, float32_t, "%f")
00894 SET_NDARRAY(set_ndarray, float64_t, float64_t, "%f")
00895 SET_NDARRAY(set_longreal_ndarray, floatmax_t, floatmax_t, "%Lf")
00896 #undef SET_NDARRAY
00897 
00898 #define SET_SPARSEMATRIX(fname, sg_type, fprt_type, type_str) \
00899 void CAsciiFile::fname(const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec)    \
00900 {                                                                                           \
00901     if (!(file && matrix))                                                                  \
00902         SG_ERROR("File or matrix invalid.\n");                                              \
00903                                                                                             \
00904     for (int32_t i=0; i<num_vec; i++)                                                       \
00905     {                                                                                       \
00906         SGSparseVectorEntry<sg_type>* vec = matrix[i].features;                                 \
00907         int32_t len=matrix[i].num_feat_entries;                                             \
00908                                                                                             \
00909         for (int32_t j=0; j<len; j++)                                                       \
00910         {                                                                                   \
00911             if (j<len-1)                                                                    \
00912             {                                                                               \
00913                 fprintf(file, "%d:" type_str " ",                                           \
00914                         (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry);           \
00915             }                                                                               \
00916             else                                                                            \
00917             {                                                                               \
00918                 fprintf(file, "%d:" type_str "\n",                                          \
00919                         (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry);           \
00920             }                                                                               \
00921         }                                                                                   \
00922     }                                                                                       \
00923 }
00924 SET_SPARSEMATRIX(set_sparse_matrix, bool, uint8_t, "%u")
00925 SET_SPARSEMATRIX(set_sparse_matrix, char, char, "%c")
00926 SET_SPARSEMATRIX(set_sparse_matrix, uint8_t, uint8_t, "%u")
00927 SET_SPARSEMATRIX(set_int8_sparsematrix, int8_t, int8_t, "%d")
00928 SET_SPARSEMATRIX(set_sparse_matrix, int32_t, int32_t, "%i")
00929 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, uint32_t, "%u")
00930 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, long long int, "%lli")
00931 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, long long unsigned int, "%llu")
00932 SET_SPARSEMATRIX(set_sparse_matrix, int16_t, int16_t, "%i")
00933 SET_SPARSEMATRIX(set_sparse_matrix, uint16_t, uint16_t, "%u")
00934 SET_SPARSEMATRIX(set_sparse_matrix, float32_t, float32_t, "%f")
00935 SET_SPARSEMATRIX(set_sparse_matrix, float64_t, float64_t, "%f")
00936 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, floatmax_t, "%Lf")
00937 #undef SET_SPARSEMATRIX
00938 
00939 void CAsciiFile::set_string_list(const SGString<uint8_t>* strings, int32_t num_str)
00940 {
00941     if (!(file && strings))
00942         SG_ERROR("File or strings invalid.\n");
00943 
00944     for (int32_t i=0; i<num_str; i++)
00945     {
00946         int32_t len = strings[i].slen;
00947         fwrite(strings[i].string, sizeof(uint8_t), len, file);
00948         fprintf(file, "\n");
00949     }
00950 }
00951 
00952 void CAsciiFile::set_int8_string_list(const SGString<int8_t>* strings, int32_t num_str)
00953 {
00954     if (!(file && strings))
00955         SG_ERROR("File or strings invalid.\n");
00956 
00957     for (int32_t i=0; i<num_str; i++)
00958     {
00959         int32_t len = strings[i].slen;
00960         fwrite(strings[i].string, sizeof(int8_t), len, file);
00961         fprintf(file, "\n");
00962     }
00963 }
00964 
00965 void CAsciiFile::set_string_list(const SGString<char>* strings, int32_t num_str)
00966 {
00967     if (!(file && strings))
00968         SG_ERROR("File or strings invalid.\n");
00969 
00970     for (int32_t i=0; i<num_str; i++)
00971     {
00972         int32_t len = strings[i].slen;
00973         fwrite(strings[i].string, sizeof(char), len, file);
00974         fprintf(file, "\n");
00975     }
00976 }
00977 
00978 void CAsciiFile::set_string_list(const SGString<int32_t>* strings, int32_t num_str)
00979 {
00980 }
00981 
00982 void CAsciiFile::set_uint_string_list(const SGString<uint32_t>* strings, int32_t num_str)
00983 {
00984 }
00985 
00986 void CAsciiFile::set_string_list(const SGString<int16_t>* strings, int32_t num_str)
00987 {
00988 }
00989 
00990 void CAsciiFile::set_string_list(const SGString<uint16_t>* strings, int32_t num_str)
00991 {
00992 }
00993 
00994 void CAsciiFile::set_long_string_list(const SGString<int64_t>* strings, int32_t num_str)
00995 {
00996 }
00997 
00998 void CAsciiFile::set_ulong_string_list(const SGString<uint64_t>* strings, int32_t num_str)
00999 {
01000 }
01001 
01002 void CAsciiFile::set_string_list(const SGString<float32_t>* strings, int32_t num_str)
01003 {
01004 }
01005 
01006 void CAsciiFile::set_string_list(const SGString<float64_t>* strings, int32_t num_str)
01007 {
01008 }
01009 
01010 void CAsciiFile::set_longreal_string_list(const SGString<floatmax_t>* strings, int32_t num_str)
01011 {
01012 }
01013 
01014 template <class T> void CAsciiFile::append_item(
01015     DynArray<T>* items, char* ptr_data, char* ptr_item)
01016 {
01017     size_t len=(ptr_data-ptr_item)/sizeof(char);
01018     char* item=SG_MALLOC(char, len+1);
01019     memset(item, 0, sizeof(char)*(len+1));
01020     item=strncpy(item, ptr_item, len);
01021 
01022     SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
01023     items->append_element(item);
01024 }
01025 
01026 #if defined(__MACH__) || defined(FREEBSD)
01027 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
01028 {
01029     int32_t total_bytes_read=0;
01030     int32_t default_size=10;
01031 
01032     if ((lineptr == NULL) || (n == NULL) || (stream == NULL))
01033         return -1;
01034 
01035     if ((*lineptr == NULL) && (*n == 0))
01036     {
01037         *lineptr=SG_MALLOC(char, default_size);
01038         *n=default_size;
01039     }
01040 
01041     int32_t bytes_read, pos=-1;
01042     size_t threshold_size=100000;
01043 
01044     while (1)
01045     {
01046         // We need some limit in case file does not contain '\n'
01047         if (*n > threshold_size)
01048             return -1;
01049 
01050         // Read from file and append to buffer
01051         bytes_read=fread(*lineptr+total_bytes_read, sizeof(char), *n-total_bytes_read, stream);
01052 
01053         for (int i=0; i<bytes_read; i++)
01054         {
01055             if ((*lineptr)[total_bytes_read+i] == delimiter)
01056             {
01057                 pos=i;
01058                 break;
01059             }
01060         }
01061 
01062         if (pos==-1)
01063         {
01064             if (feof(stream))
01065                 return -1;
01066             total_bytes_read+=bytes_read;
01067             *lineptr=SG_REALLOC(char, *lineptr, (*n)*2);
01068             *n=(*n)*2;
01069             // A better reallocated size should be used
01070         }
01071         else
01072         {
01073             total_bytes_read+=pos+1;
01074             (*lineptr)[total_bytes_read]='\0';
01075             // Seek back to position after \n
01076             fseek(stream, (bytes_read-pos-1) * -1, SEEK_CUR);
01077             return total_bytes_read;
01078         }
01079     }
01080 }
01081 
01082 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
01083 {
01084     return getdelim(lineptr, n, '\n', stream);
01085 }
01086 
01087 #else
01088 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
01089 {
01090     return ::getdelim(lineptr, n, delimiter, stream);
01091 }
01092 
01093 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
01094 {
01095     return ::getline(lineptr, n, stream);
01096 }
01097 #endif
01098 
01099 void CAsciiFile::tokenize(char delim, substring s, v_array<substring>& ret)
01100 {
01101     ret.erase();
01102     char *last = s.start;
01103     for (; s.start != s.end; s.start++)
01104     {
01105         if (*s.start == delim)
01106         {
01107             if (s.start != last)
01108             {
01109                 substring temp = {last,s.start};
01110                 ret.push(temp);
01111             }
01112             last = s.start+1;
01113         }
01114     }
01115     if (s.start != last)
01116     {
01117         substring final = {last, s.start};
01118         ret.push(final);
01119     }
01120 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation