AsciiFile.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2010 Soeren Sonnenburg
00008  * Copyright (C) 2010 Berlin Institute of Technology
00009  */
00010 
00011 #include "features/SparseFeatures.h"
00012 #include "lib/File.h"
00013 #include "lib/AsciiFile.h"
00014 #include "lib/Mathematics.h"
00015 #include <ctype.h>
00016 
00017 using namespace shogun;
00018 
00019 CAsciiFile::CAsciiFile(void)
00020 {
00021     SG_UNSTABLE("CAsciiFile::CAsciiFile(void)", "\n");
00022 }
00023 
00024 CAsciiFile::CAsciiFile(FILE* f, const char* name) : CFile(f, name)
00025 {
00026 }
00027 
00028 CAsciiFile::CAsciiFile(char* fname, char rw, const char* name) : CFile(fname, rw, name)
00029 {
00030 }
00031 
00032 CAsciiFile::~CAsciiFile()
00033 {
00034 }
00035 
00036 #define GET_VECTOR(fname, mfname, sg_type) \
00037 void CAsciiFile::fname(sg_type*& vec, int32_t& len) \
00038 {                                                   \
00039     vec=NULL;                                       \
00040     len=0;                                          \
00041     int32_t num_feat=0;                             \
00042     int32_t num_vec=0;                              \
00043     mfname(vec, num_feat, num_vec);                 \
00044     if ((num_feat==1) || (num_vec==1))              \
00045     {                                               \
00046         if (num_feat==1)                            \
00047             len=num_vec;                            \
00048         else                                        \
00049             len=num_feat;                           \
00050     }                                               \
00051     else                                            \
00052     {                                               \
00053         delete[] vec;                               \
00054         vec=NULL;                                   \
00055         len=0;                                      \
00056         SG_ERROR("Could not read vector from"       \
00057                 " file %s (shape %dx%d found but "  \
00058                 "vector expected).\n", filename,    \
00059                 num_vec, num_feat);                 \
00060     }                                               \
00061 }
00062 
00063 GET_VECTOR(get_byte_vector, get_byte_matrix, uint8_t)
00064 GET_VECTOR(get_char_vector, get_char_matrix, char)
00065 GET_VECTOR(get_int_vector, get_int_matrix, int32_t)
00066 GET_VECTOR(get_shortreal_vector, get_shortreal_matrix, float32_t)
00067 GET_VECTOR(get_real_vector, get_real_matrix, float64_t)
00068 GET_VECTOR(get_short_vector, get_short_matrix, int16_t)
00069 GET_VECTOR(get_word_vector, get_word_matrix, uint16_t)
00070 #undef GET_VECTOR
00071 
00072 #define GET_MATRIX(fname, conv, sg_type)                                        \
00073 void CAsciiFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec)   \
00074 {                                                                               \
00075     struct stat stats;                                                          \
00076     if (stat(filename, &stats)!=0)                                              \
00077         SG_ERROR("Could not get file statistics.\n");                           \
00078                                                                                 \
00079     char* data=new char[stats.st_size+1];                                       \
00080     memset(data, 0, sizeof(char)*(stats.st_size+1));                            \
00081     size_t nread=fread(data, sizeof(char), stats.st_size, file);                \
00082     if (nread<=0)                                                               \
00083         SG_ERROR("Could not read data from %s.\n", filename);                   \
00084                                                                                 \
00085     SG_DEBUG("data read from file:\n%s\n", data);                               \
00086                                                                                 \
00087     /* determine num_feat and num_vec, populate dynamic array */                \
00088     int32_t nf=0;                                                               \
00089     num_feat=0;                                                                 \
00090     num_vec=0;                                                                  \
00091     char* ptr_item=NULL;                                                        \
00092     char* ptr_data=data;                                                        \
00093     DynArray<char*>* items=new DynArray<char*>();                       \
00094                                                                                 \
00095     while (*ptr_data)                                                           \
00096     {                                                                           \
00097         if (*ptr_data=='\n')                                                    \
00098         {                                                                       \
00099             if (ptr_item)                                                       \
00100                 nf++;                                                           \
00101                                                                                 \
00102             if (num_feat!=0 && nf!=num_feat)                                    \
00103                 SG_ERROR("Number of features mismatches (%d != %d) in vector"   \
00104                         " %d in file %s.\n", num_feat, nf, num_vec, filename);  \
00105                                                                                 \
00106             append_item(items, ptr_data, ptr_item);                             \
00107             num_feat=nf;                                                        \
00108             num_vec++;                                                          \
00109             nf=0;                                                               \
00110             ptr_item=NULL;                                                      \
00111         }                                                                       \
00112         else if (!isblank(*ptr_data) && !ptr_item)                              \
00113         {                                                                       \
00114             ptr_item=ptr_data;                                                  \
00115         }                                                                       \
00116         else if (isblank(*ptr_data) && ptr_item)                                \
00117         {                                                                       \
00118             append_item(items, ptr_data, ptr_item);                             \
00119             ptr_item=NULL;                                                      \
00120             nf++;                                                               \
00121         }                                                                       \
00122                                                                                 \
00123         ptr_data++;                                                             \
00124     }                                                                           \
00125                                                                                 \
00126     SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec);                  \
00127     delete[] data;                                                              \
00128                                                                                 \
00129     /* now copy data into matrix */                                             \
00130     matrix=new sg_type[num_vec*num_feat];                                       \
00131     for (int32_t i=0; i<num_vec; i++)                                           \
00132     {                                                                           \
00133         for (int32_t j=0; j<num_feat; j++)                                      \
00134         {                                                                       \
00135             char* item=items->get_element(i*num_feat+j);                        \
00136             matrix[i*num_feat+j]=conv(item);                                    \
00137             delete[] item;                                                      \
00138         }                                                                       \
00139     }                                                                           \
00140     delete items;                                                               \
00141 }
00142 
00143 GET_MATRIX(get_byte_matrix, atoi, uint8_t)
00144 GET_MATRIX(get_int8_matrix, atoi, int8_t)
00145 GET_MATRIX(get_char_matrix, atoi, char)
00146 GET_MATRIX(get_int_matrix, atoi, int32_t)
00147 GET_MATRIX(get_uint_matrix, atoi, uint32_t)
00148 GET_MATRIX(get_long_matrix, atoll, int64_t)
00149 GET_MATRIX(get_ulong_matrix, atoll, uint64_t)
00150 GET_MATRIX(get_shortreal_matrix, atof, float32_t)
00151 GET_MATRIX(get_real_matrix, atof, float64_t)
00152 GET_MATRIX(get_longreal_matrix, atof, floatmax_t)
00153 GET_MATRIX(get_short_matrix, atoi, int16_t)
00154 GET_MATRIX(get_word_matrix, atoi, uint16_t)
00155 #undef GET_MATRIX
00156 
00157 void CAsciiFile::get_byte_ndarray(uint8_t*& array, int32_t*& dims, int32_t& num_dims)
00158 {
00159 }
00160 
00161 void CAsciiFile::get_char_ndarray(char*& array, int32_t*& dims, int32_t& num_dims)
00162 {
00163 }
00164 
00165 void CAsciiFile::get_int_ndarray(int32_t*& array, int32_t*& dims, int32_t& num_dims)
00166 {
00167 }
00168 
00169 void CAsciiFile::get_shortreal_ndarray(float32_t*& array, int32_t*& dims, int32_t& num_dims)
00170 {
00171 }
00172 
00173 void CAsciiFile::get_real_ndarray(float64_t*& array, int32_t*& dims, int32_t& num_dims)
00174 {
00175 }
00176 
00177 void CAsciiFile::get_short_ndarray(int16_t*& array, int32_t*& dims, int32_t& num_dims)
00178 {
00179 }
00180 
00181 void CAsciiFile::get_word_ndarray(uint16_t*& array, int32_t*& dims, int32_t& num_dims)
00182 {
00183 }
00184 
00185 #define GET_SPARSEMATRIX(fname, conv, sg_type)                                      \
00186 void CAsciiFile::fname(TSparse<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec)  \
00187 {   \
00188     size_t blocksize=1024*1024; \
00189     size_t required_blocksize=blocksize;    \
00190     uint8_t* dummy=new uint8_t[blocksize];  \
00191     \
00192     if (file)   \
00193     {   \
00194         num_vec=0;  \
00195         num_feat=0; \
00196     \
00197         SG_INFO("counting line numbers in file %s\n", filename);    \
00198         size_t sz=blocksize;    \
00199         size_t block_offs=0;    \
00200         size_t old_block_offs=0;    \
00201         fseek(file, 0, SEEK_END);   \
00202         size_t fsize=ftell(file);   \
00203         rewind(file);   \
00204     \
00205         while (sz == blocksize) \
00206         {   \
00207             sz=fread(dummy, sizeof(uint8_t), blocksize, file);  \
00208             bool contains_cr=false; \
00209             for (size_t i=0; i<sz; i++) \
00210             {   \
00211                 block_offs++;   \
00212                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))    \
00213                 {   \
00214                     num_vec++;  \
00215                     contains_cr=true;   \
00216                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); \
00217                     old_block_offs=block_offs;  \
00218                 }   \
00219             }   \
00220             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");    \
00221         }   \
00222     \
00223         SG_INFO("found %d feature vectors\n", num_vec); \
00224         delete[] dummy; \
00225         blocksize=required_blocksize;   \
00226         dummy = new uint8_t[blocksize+1]; /*allow setting of '\0' at EOL*/  \
00227         matrix=new TSparse<sg_type>[num_vec];   \
00228     \
00229         rewind(file);   \
00230         sz=blocksize;   \
00231         int32_t lines=0;    \
00232         while (sz == blocksize) \
00233         {   \
00234             sz=fread(dummy, sizeof(uint8_t), blocksize, file);  \
00235     \
00236             size_t old_sz=0;    \
00237             for (size_t i=0; i<sz; i++) \
00238             {   \
00239                 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) \
00240                 {   \
00241                     size_t len=i-old_sz+1;  \
00242                     uint8_t* data=&dummy[old_sz];   \
00243     \
00244                     for (size_t j=0; j<len; j++)    \
00245                         dummy[j]=data[j];   \
00246     \
00247                     sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file);  \
00248                     i=0;    \
00249                     old_sz=0;   \
00250                     sz+=len;    \
00251                 }   \
00252     \
00253                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))    \
00254                 {   \
00255     \
00256                     size_t len=i-old_sz;    \
00257                     uint8_t* data=&dummy[old_sz];   \
00258     \
00259                     int32_t dims=0; \
00260                     for (size_t j=0; j<len; j++)    \
00261                     {   \
00262                         if (data[j]==':')   \
00263                             dims++; \
00264                     }   \
00265     \
00266                     if (dims<=0)    \
00267                     {   \
00268                         SG_ERROR("Error in line %d - number of" \
00269                                 " dimensions is %d line is %d characters"   \
00270                                 " long\n line_content:'%.*s'\n", lines, \
00271                                 dims, len, len, (const char*) data);    \
00272                     }   \
00273     \
00274                     TSparseEntry<sg_type>* feat=new TSparseEntry<sg_type>[dims];    \
00275     \
00276                     /* skip label part */   \
00277                     size_t j=0; \
00278                     for (; j<len; j++)  \
00279                     {   \
00280                         if (data[j]==':')   \
00281                         {   \
00282                             j=-1; /* file without label*/   \
00283                             break;  \
00284                         }   \
00285     \
00286                         if (data[j]==' ')   \
00287                         {   \
00288                             data[j]='\0';   \
00289     \
00290                             /* skip label part */   \
00291                             break;  \
00292                         }   \
00293                     }   \
00294     \
00295                     int32_t d=0;    \
00296                     j++;    \
00297                     uint8_t* start=&data[j];    \
00298                     for (; j<len; j++)  \
00299                     {   \
00300                         if (data[j]==':')   \
00301                         {   \
00302                             data[j]='\0';   \
00303     \
00304                             feat[d].feat_index=(int32_t) atoi((const char*) start)-1;   \
00305                             num_feat=CMath::max(num_feat, feat[d].feat_index+1);    \
00306     \
00307                             j++;    \
00308                             start=&data[j]; \
00309                             for (; j<len; j++)  \
00310                             {   \
00311                                 if (data[j]==' ' || data[j]=='\n')  \
00312                                 {   \
00313                                     data[j]='\0';   \
00314                                     feat[d].entry=(sg_type) conv((const char*) start);  \
00315                                     d++;    \
00316                                     break;  \
00317                                 }   \
00318                             }   \
00319     \
00320                             if (j==len) \
00321                             {   \
00322                                 data[j]='\0';   \
00323                                 feat[dims-1].entry=(sg_type) conv((const char*) start); \
00324                             }   \
00325     \
00326                             j++;    \
00327                             start=&data[j]; \
00328                         }   \
00329                     }   \
00330     \
00331                     matrix[lines].vec_index=lines;  \
00332                     matrix[lines].num_feat_entries=dims;    \
00333                     matrix[lines].features=feat;    \
00334     \
00335                     old_sz=i+1; \
00336                     lines++;    \
00337                     SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t");    \
00338                 }   \
00339             }   \
00340         }   \
00341     \
00342         SG_INFO("file successfully read\n");    \
00343     }   \
00344     \
00345     delete[] dummy; \
00346 }
00347 
00348 GET_SPARSEMATRIX(get_bool_sparsematrix, atoi, bool)
00349 GET_SPARSEMATRIX(get_byte_sparsematrix, atoi, uint8_t)
00350 GET_SPARSEMATRIX(get_int8_sparsematrix, atoi, int8_t)
00351 GET_SPARSEMATRIX(get_char_sparsematrix, atoi, char)
00352 GET_SPARSEMATRIX(get_int_sparsematrix, atoi, int32_t)
00353 GET_SPARSEMATRIX(get_uint_sparsematrix, atoi, uint32_t)
00354 GET_SPARSEMATRIX(get_long_sparsematrix, atoll, int64_t)
00355 GET_SPARSEMATRIX(get_ulong_sparsematrix, atoll, uint64_t)
00356 GET_SPARSEMATRIX(get_shortreal_sparsematrix, atof, float32_t)
00357 GET_SPARSEMATRIX(get_real_sparsematrix, atof, float64_t)
00358 GET_SPARSEMATRIX(get_longreal_sparsematrix, atof, floatmax_t)
00359 GET_SPARSEMATRIX(get_short_sparsematrix, atoi, int16_t)
00360 GET_SPARSEMATRIX(get_word_sparsematrix, atoi, uint16_t)
00361 #undef GET_SPARSEMATRIX
00362 
00363 
00364 void CAsciiFile::get_byte_string_list(TString<uint8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00365 {
00366     size_t blocksize=1024*1024;
00367     size_t required_blocksize=0;
00368     uint8_t* dummy=new uint8_t[blocksize];
00369     uint8_t* overflow=NULL;
00370     int32_t overflow_len=0;
00371 
00372     if (file)
00373     {
00374         num_str=0;
00375         max_string_len=0;
00376 
00377         SG_INFO("counting line numbers in file %s\n", filename);
00378         size_t sz=blocksize;
00379         size_t block_offs=0;
00380         size_t old_block_offs=0;
00381         fseek(file, 0, SEEK_END);
00382         size_t fsize=ftell(file);
00383         rewind(file);
00384 
00385         while (sz == blocksize)
00386         {
00387             sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00388             bool contains_cr=false;
00389             for (size_t i=0; i<sz; i++)
00390             {
00391                 block_offs++;
00392                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00393                 {
00394                     num_str++;
00395                     contains_cr=true;
00396                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00397                     old_block_offs=block_offs;
00398                 }
00399             }
00400             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00401         }
00402 
00403         SG_INFO("found %d strings\n", num_str);
00404         SG_DEBUG("block_size=%d\n", required_blocksize);
00405         delete[] dummy;
00406         blocksize=required_blocksize;
00407         dummy=new uint8_t[blocksize];
00408         overflow=new uint8_t[blocksize];
00409         strings=new TString<uint8_t>[num_str];
00410 
00411         rewind(file);
00412         sz=blocksize;
00413         int32_t lines=0;
00414         size_t old_sz=0;
00415         while (sz == blocksize)
00416         {
00417             sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00418 
00419             old_sz=0;
00420             for (size_t i=0; i<sz; i++)
00421             {
00422                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00423                 {
00424                     int32_t len=i-old_sz;
00425                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00426 
00427                     strings[lines].length=len+overflow_len;
00428                     strings[lines].string=new uint8_t[len+overflow_len];
00429 
00430                     for (int32_t j=0; j<overflow_len; j++)
00431                         strings[lines].string[j]=overflow[j];
00432                     for (int32_t j=0; j<len; j++)
00433                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00434 
00435                     // clear overflow
00436                     overflow_len=0;
00437 
00438                     //CMath::display_vector(strings[lines].string, len);
00439                     old_sz=i+1;
00440                     lines++;
00441                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00442                 }
00443             }
00444 
00445             for (size_t i=old_sz; i<sz; i++)
00446                 overflow[i-old_sz]=dummy[i];
00447 
00448             overflow_len=sz-old_sz;
00449         }
00450         SG_INFO("file successfully read\n");
00451         SG_INFO("max_string_length=%d\n", max_string_len);
00452         SG_INFO("num_strings=%d\n", num_str);
00453     }
00454 
00455     delete[] dummy;
00456     delete[] overflow;
00457 }
00458 
00459 void CAsciiFile::get_int8_string_list(TString<int8_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00460 {
00461     size_t blocksize=1024*1024;
00462     size_t required_blocksize=0;
00463     int8_t* dummy=new int8_t[blocksize];
00464     int8_t* overflow=NULL;
00465     int32_t overflow_len=0;
00466 
00467     if (file)
00468     {
00469         num_str=0;
00470         max_string_len=0;
00471 
00472         SG_INFO("counting line numbers in file %s\n", filename);
00473         size_t sz=blocksize;
00474         size_t block_offs=0;
00475         size_t old_block_offs=0;
00476         fseek(file, 0, SEEK_END);
00477         size_t fsize=ftell(file);
00478         rewind(file);
00479 
00480         while (sz == blocksize)
00481         {
00482             sz=fread(dummy, sizeof(int8_t), blocksize, file);
00483             bool contains_cr=false;
00484             for (size_t i=0; i<sz; i++)
00485             {
00486                 block_offs++;
00487                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00488                 {
00489                     num_str++;
00490                     contains_cr=true;
00491                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00492                     old_block_offs=block_offs;
00493                 }
00494             }
00495             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00496         }
00497 
00498         SG_INFO("found %d strings\n", num_str);
00499         SG_DEBUG("block_size=%d\n", required_blocksize);
00500         delete[] dummy;
00501         blocksize=required_blocksize;
00502         dummy=new int8_t[blocksize];
00503         overflow=new int8_t[blocksize];
00504         strings=new TString<int8_t>[num_str];
00505 
00506         rewind(file);
00507         sz=blocksize;
00508         int32_t lines=0;
00509         size_t old_sz=0;
00510         while (sz == blocksize)
00511         {
00512             sz=fread(dummy, sizeof(int8_t), blocksize, file);
00513 
00514             old_sz=0;
00515             for (size_t i=0; i<sz; i++)
00516             {
00517                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00518                 {
00519                     int32_t len=i-old_sz;
00520                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00521 
00522                     strings[lines].length=len+overflow_len;
00523                     strings[lines].string=new int8_t[len+overflow_len];
00524 
00525                     for (int32_t j=0; j<overflow_len; j++)
00526                         strings[lines].string[j]=overflow[j];
00527                     for (int32_t j=0; j<len; j++)
00528                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00529 
00530                     // clear overflow
00531                     overflow_len=0;
00532 
00533                     //CMath::display_vector(strings[lines].string, len);
00534                     old_sz=i+1;
00535                     lines++;
00536                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00537                 }
00538             }
00539 
00540             for (size_t i=old_sz; i<sz; i++)
00541                 overflow[i-old_sz]=dummy[i];
00542 
00543             overflow_len=sz-old_sz;
00544         }
00545         SG_INFO("file successfully read\n");
00546         SG_INFO("max_string_length=%d\n", max_string_len);
00547         SG_INFO("num_strings=%d\n", num_str);
00548     }
00549 
00550     delete[] dummy;
00551     delete[] overflow;
00552 }
00553 
00554 void CAsciiFile::get_char_string_list(TString<char>*& strings, int32_t& num_str, int32_t& max_string_len)
00555 {
00556     size_t blocksize=1024*1024;
00557     size_t required_blocksize=0;
00558     char* dummy=new char[blocksize];
00559     char* overflow=NULL;
00560     int32_t overflow_len=0;
00561 
00562     if (file)
00563     {
00564         num_str=0;
00565         max_string_len=0;
00566 
00567         SG_INFO("counting line numbers in file %s\n", filename);
00568         size_t sz=blocksize;
00569         size_t block_offs=0;
00570         size_t old_block_offs=0;
00571         fseek(file, 0, SEEK_END);
00572         size_t fsize=ftell(file);
00573         rewind(file);
00574 
00575         while (sz == blocksize)
00576         {
00577             sz=fread(dummy, sizeof(char), blocksize, file);
00578             bool contains_cr=false;
00579             for (size_t i=0; i<sz; i++)
00580             {
00581                 block_offs++;
00582                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00583                 {
00584                     num_str++;
00585                     contains_cr=true;
00586                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00587                     old_block_offs=block_offs;
00588                 }
00589             }
00590             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00591         }
00592 
00593         SG_INFO("found %d strings\n", num_str);
00594         SG_DEBUG("block_size=%d\n", required_blocksize);
00595         delete[] dummy;
00596         blocksize=required_blocksize;
00597         dummy=new char[blocksize];
00598         overflow=new char[blocksize];
00599         strings=new TString<char>[num_str];
00600 
00601         rewind(file);
00602         sz=blocksize;
00603         int32_t lines=0;
00604         size_t old_sz=0;
00605         while (sz == blocksize)
00606         {
00607             sz=fread(dummy, sizeof(char), blocksize, file);
00608 
00609             old_sz=0;
00610             for (size_t i=0; i<sz; i++)
00611             {
00612                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00613                 {
00614                     int32_t len=i-old_sz;
00615                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00616 
00617                     strings[lines].length=len+overflow_len;
00618                     strings[lines].string=new char[len+overflow_len];
00619 
00620                     for (int32_t j=0; j<overflow_len; j++)
00621                         strings[lines].string[j]=overflow[j];
00622                     for (int32_t j=0; j<len; j++)
00623                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00624 
00625                     // clear overflow
00626                     overflow_len=0;
00627 
00628                     //CMath::display_vector(strings[lines].string, len);
00629                     old_sz=i+1;
00630                     lines++;
00631                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00632                 }
00633             }
00634 
00635             for (size_t i=old_sz; i<sz; i++)
00636                 overflow[i-old_sz]=dummy[i];
00637 
00638             overflow_len=sz-old_sz;
00639         }
00640         SG_INFO("file successfully read\n");
00641         SG_INFO("max_string_length=%d\n", max_string_len);
00642         SG_INFO("num_strings=%d\n", num_str);
00643     }
00644 
00645     delete[] dummy;
00646     delete[] overflow;
00647 }
00648 
00649 void CAsciiFile::get_int_string_list(TString<int32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00650 {
00651     strings=NULL;
00652     num_str=0;
00653     max_string_len=0;
00654 }
00655 
00656 void CAsciiFile::get_uint_string_list(TString<uint32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00657 {
00658     strings=NULL;
00659     num_str=0;
00660     max_string_len=0;
00661 }
00662 
00663 void CAsciiFile::get_short_string_list(TString<int16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00664 {
00665     strings=NULL;
00666     num_str=0;
00667     max_string_len=0;
00668 }
00669 
00670 void CAsciiFile::get_word_string_list(TString<uint16_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00671 {
00672     strings=NULL;
00673     num_str=0;
00674     max_string_len=0;
00675 }
00676 
00677 void CAsciiFile::get_long_string_list(TString<int64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00678 {
00679     strings=NULL;
00680     num_str=0;
00681     max_string_len=0;
00682 }
00683 
00684 void CAsciiFile::get_ulong_string_list(TString<uint64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00685 {
00686     strings=NULL;
00687     num_str=0;
00688     max_string_len=0;
00689 }
00690 
00691 void CAsciiFile::get_shortreal_string_list(TString<float32_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00692 {
00693     strings=NULL;
00694     num_str=0;
00695     max_string_len=0;
00696 }
00697 
00698 void CAsciiFile::get_real_string_list(TString<float64_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00699 {
00700     strings=NULL;
00701     num_str=0;
00702     max_string_len=0;
00703 }
00704 
00705 void CAsciiFile::get_longreal_string_list(TString<floatmax_t>*& strings, int32_t& num_str, int32_t& max_string_len)
00706 {
00707     strings=NULL;
00708     num_str=0;
00709     max_string_len=0;
00710 }
00711 
00712 
00715 #define SET_VECTOR(fname, mfname, sg_type)  \
00716 void CAsciiFile::fname(const sg_type* vec, int32_t len) \
00717 {                                                           \
00718     mfname(vec, len, 1);                                    \
00719 }
00720 SET_VECTOR(set_byte_vector, set_byte_matrix, uint8_t)
00721 SET_VECTOR(set_char_vector, set_char_matrix, char)
00722 SET_VECTOR(set_int_vector, set_int_matrix, int32_t)
00723 SET_VECTOR(set_shortreal_vector, set_shortreal_matrix, float32_t)
00724 SET_VECTOR(set_real_vector, set_real_matrix, float64_t)
00725 SET_VECTOR(set_short_vector, set_short_matrix, int16_t)
00726 SET_VECTOR(set_word_vector, set_word_matrix, uint16_t)
00727 #undef SET_VECTOR
00728 
00729 #define SET_MATRIX(fname, sg_type, fprt_type, type_str) \
00730 void CAsciiFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec)    \
00731 {                                                                                   \
00732     if (!(file && matrix))                                                          \
00733         SG_ERROR("File or matrix invalid.\n");                                      \
00734                                                                                     \
00735     for (int32_t i=0; i<num_vec; i++)                                               \
00736     {                                                                               \
00737         for (int32_t j=0; j<num_feat; j++)                                          \
00738         {                                                                           \
00739             sg_type v=matrix[num_feat*i+j];                                         \
00740             if (j==num_feat-1)                                                      \
00741                 fprintf(file, type_str "\n", (fprt_type) v);                        \
00742             else                                                                    \
00743                 fprintf(file, type_str " ", (fprt_type) v);                         \
00744         }                                                                           \
00745     }                                                                               \
00746 }
00747 SET_MATRIX(set_char_matrix, char, char, "%c")
00748 SET_MATRIX(set_byte_matrix, uint8_t, uint8_t, "%u")
00749 SET_MATRIX(set_int8_matrix, int8_t, int8_t, "%d")
00750 SET_MATRIX(set_int_matrix, int32_t, int32_t, "%i")
00751 SET_MATRIX(set_uint_matrix, uint32_t, uint32_t, "%u")
00752 SET_MATRIX(set_long_matrix, int64_t, long long int, "%lli")
00753 SET_MATRIX(set_ulong_matrix, uint64_t, long long unsigned int, "%llu")
00754 SET_MATRIX(set_short_matrix, int16_t, int16_t, "%i")
00755 SET_MATRIX(set_word_matrix, uint16_t, uint16_t, "%u")
00756 SET_MATRIX(set_shortreal_matrix, float32_t, float32_t, "%f")
00757 SET_MATRIX(set_real_matrix, float64_t, float64_t, "%f")
00758 SET_MATRIX(set_longreal_matrix, floatmax_t, floatmax_t, "%Lf")
00759 #undef SET_MATRIX
00760 
00761 #define SET_SPARSEMATRIX(fname, sg_type, fprt_type, type_str) \
00762 void CAsciiFile::fname(const TSparse<sg_type>* matrix, int32_t num_feat, int32_t num_vec)   \
00763 {                                                                                           \
00764     if (!(file && matrix))                                                                  \
00765         SG_ERROR("File or matrix invalid.\n");                                              \
00766                                                                                             \
00767     for (int32_t i=0; i<num_vec; i++)                                                       \
00768     {                                                                                       \
00769         TSparseEntry<sg_type>* vec = matrix[i].features;                                    \
00770         int32_t len=matrix[i].num_feat_entries;                                             \
00771                                                                                             \
00772         for (int32_t j=0; j<len; j++)                                                       \
00773         {                                                                                   \
00774             if (j<len-1)                                                                    \
00775             {                                                                               \
00776                 fprintf(file, "%d:" type_str " ",                                           \
00777                         (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry);           \
00778             }                                                                               \
00779             else                                                                            \
00780             {                                                                               \
00781                 fprintf(file, "%d:" type_str "\n",                                          \
00782                         (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry);           \
00783             }                                                                               \
00784         }                                                                                   \
00785     }                                                                                       \
00786 }
00787 SET_SPARSEMATRIX(set_bool_sparsematrix, bool, uint8_t, "%u")
00788 SET_SPARSEMATRIX(set_char_sparsematrix, char, char, "%c")
00789 SET_SPARSEMATRIX(set_byte_sparsematrix, uint8_t, uint8_t, "%u")
00790 SET_SPARSEMATRIX(set_int8_sparsematrix, int8_t, int8_t, "%d")
00791 SET_SPARSEMATRIX(set_int_sparsematrix, int32_t, int32_t, "%i")
00792 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, uint32_t, "%u")
00793 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, long long int, "%lli")
00794 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, long long unsigned int, "%llu")
00795 SET_SPARSEMATRIX(set_short_sparsematrix, int16_t, int16_t, "%i")
00796 SET_SPARSEMATRIX(set_word_sparsematrix, uint16_t, uint16_t, "%u")
00797 SET_SPARSEMATRIX(set_shortreal_sparsematrix, float32_t, float32_t, "%f")
00798 SET_SPARSEMATRIX(set_real_sparsematrix, float64_t, float64_t, "%f")
00799 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, floatmax_t, "%Lf")
00800 #undef SET_SPARSEMATRIX
00801 
00802 void CAsciiFile::set_byte_string_list(const TString<uint8_t>* strings, int32_t num_str)
00803 {
00804     if (!(file && strings))
00805         SG_ERROR("File or strings invalid.\n");
00806 
00807     for (int32_t i=0; i<num_str; i++)
00808     {
00809         int32_t len = strings[i].length;
00810         fwrite(strings[i].string, sizeof(uint8_t), len, file);
00811         fprintf(file, "\n");
00812     }
00813 }
00814 
00815 void CAsciiFile::set_int8_string_list(const TString<int8_t>* strings, int32_t num_str)
00816 {
00817     if (!(file && strings))
00818         SG_ERROR("File or strings invalid.\n");
00819 
00820     for (int32_t i=0; i<num_str; i++)
00821     {
00822         int32_t len = strings[i].length;
00823         fwrite(strings[i].string, sizeof(int8_t), len, file);
00824         fprintf(file, "\n");
00825     }
00826 }
00827 
00828 void CAsciiFile::set_char_string_list(const TString<char>* strings, int32_t num_str)
00829 {
00830     if (!(file && strings))
00831         SG_ERROR("File or strings invalid.\n");
00832 
00833     for (int32_t i=0; i<num_str; i++)
00834     {
00835         int32_t len = strings[i].length;
00836         fwrite(strings[i].string, sizeof(char), len, file);
00837         fprintf(file, "\n");
00838     }
00839 }
00840 
00841 void CAsciiFile::set_int_string_list(const TString<int32_t>* strings, int32_t num_str)
00842 {
00843 }
00844 
00845 void CAsciiFile::set_uint_string_list(const TString<uint32_t>* strings, int32_t num_str)
00846 {
00847 }
00848 
00849 void CAsciiFile::set_short_string_list(const TString<int16_t>* strings, int32_t num_str)
00850 {
00851 }
00852 
00853 void CAsciiFile::set_word_string_list(const TString<uint16_t>* strings, int32_t num_str)
00854 {
00855 }
00856 
00857 void CAsciiFile::set_long_string_list(const TString<int64_t>* strings, int32_t num_str)
00858 {
00859 }
00860 
00861 void CAsciiFile::set_ulong_string_list(const TString<uint64_t>* strings, int32_t num_str)
00862 {
00863 }
00864 
00865 void CAsciiFile::set_shortreal_string_list(const TString<float32_t>* strings, int32_t num_str)
00866 {
00867 }
00868 
00869 void CAsciiFile::set_real_string_list(const TString<float64_t>* strings, int32_t num_str)
00870 {
00871 }
00872 
00873 void CAsciiFile::set_longreal_string_list(const TString<floatmax_t>* strings, int32_t num_str)
00874 {
00875 }
00876 
00877 template <class T> void CAsciiFile::append_item(
00878     DynArray<T>* items, char* ptr_data, char* ptr_item)
00879 {
00880     size_t len=(ptr_data-ptr_item)/sizeof(char);
00881     char* item=new char[len+1];
00882     memset(item, 0, sizeof(char)*(len+1));
00883     item=strncpy(item, ptr_item, len);
00884 
00885     SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
00886     items->append_element(item);
00887 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation