00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "lib/File.h"
00012 #include "features/SparseFeatures.h"
00013 #include "lib/BinaryFile.h"
00014
00015 using namespace shogun;
00016
00017 CBinaryFile::CBinaryFile(void)
00018 {
00019 SG_UNSTABLE("CBinaryFile::CBinaryFile(void)", "\n");
00020 }
00021
00022 CBinaryFile::CBinaryFile(FILE* f, const char* name) : CFile(f, name)
00023 {
00024 }
00025
00026 CBinaryFile::CBinaryFile(char* fname, char rw, const char* name) : CFile(fname, rw, name)
00027 {
00028 }
00029
00030 CBinaryFile::~CBinaryFile()
00031 {
00032 }
00033
00034 #define GET_VECTOR(fname, sg_type, datatype) \
00035 void CBinaryFile::fname(sg_type*& vec, int32_t& len) \
00036 { \
00037 if (!file) \
00038 SG_ERROR("File invalid.\n"); \
00039 TSGDataType dtype(CT_SCALAR, ST_NONE, PT_BOOL); read_header(&dtype); \
00040 if (dtype!=datatype) \
00041 SG_ERROR("Datatype mismatch\n"); \
00042 \
00043 if (fread(&len, sizeof(int32_t), 1, file)!=1) \
00044 SG_ERROR("Failed to read vector length\n"); \
00045 vec=new sg_type[len]; \
00046 if (fread(vec, sizeof(sg_type), len, file)!=(size_t) len) \
00047 SG_ERROR("Failed to read Matrix\n"); \
00048 }
00049
00050 GET_VECTOR(get_byte_vector, uint8_t, TSGDataType(CT_VECTOR, ST_NONE, PT_UINT8))
00051 GET_VECTOR(get_char_vector, char, TSGDataType(CT_VECTOR, ST_NONE, PT_CHAR))
00052 GET_VECTOR(get_int_vector, int32_t, TSGDataType(CT_VECTOR, ST_NONE, PT_INT32))
00053 GET_VECTOR(get_shortreal_vector, float32_t, TSGDataType(CT_VECTOR, ST_NONE, PT_FLOAT32))
00054 GET_VECTOR(get_real_vector, float64_t, TSGDataType(CT_VECTOR, ST_NONE, PT_FLOAT64))
00055 GET_VECTOR(get_short_vector, int16_t, TSGDataType(CT_VECTOR, ST_NONE, PT_INT16))
00056 GET_VECTOR(get_word_vector, uint16_t, TSGDataType(CT_VECTOR, ST_NONE, PT_INT16))
00057 #undef GET_VECTOR
00058
00059 #define GET_MATRIX(fname, sg_type, datatype) \
00060 void CBinaryFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
00061 { \
00062 if (!file) \
00063 SG_ERROR("File invalid.\n"); \
00064 TSGDataType dtype(CT_SCALAR, ST_NONE, PT_BOOL); read_header(&dtype); \
00065 if (dtype!=datatype) \
00066 SG_ERROR("Datatype mismatch\n"); \
00067 \
00068 if (fread(&num_feat, sizeof(int32_t), 1, file)!=1 || \
00069 fread(&num_vec, sizeof(int32_t), 1, file)!=1) \
00070 SG_ERROR("Failed to read Matrix dimensions\n"); \
00071 matrix=new sg_type[int64_t(num_feat)*num_vec]; \
00072 if (fread(matrix, sizeof(sg_type)*num_feat, num_vec, file)!=(size_t) num_vec) \
00073 SG_ERROR("Failed to read Matrix\n"); \
00074 }
00075
00076 GET_MATRIX(get_char_matrix, char, TSGDataType(CT_MATRIX, ST_NONE, PT_CHAR))
00077 GET_MATRIX(get_byte_matrix, uint8_t, TSGDataType(CT_MATRIX, ST_NONE, PT_UINT8))
00078 GET_MATRIX(get_int8_matrix, int8_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT8))
00079 GET_MATRIX(get_int_matrix, int32_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT32))
00080 GET_MATRIX(get_uint_matrix, uint32_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT32))
00081 GET_MATRIX(get_long_matrix, int64_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT64))
00082 GET_MATRIX(get_ulong_matrix, uint64_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT64))
00083 GET_MATRIX(get_short_matrix, int16_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT16))
00084 GET_MATRIX(get_word_matrix, uint16_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT16))
00085 GET_MATRIX(get_shortreal_matrix, float32_t, TSGDataType(CT_MATRIX, ST_NONE, PT_FLOAT32))
00086 GET_MATRIX(get_real_matrix, float64_t, TSGDataType(CT_MATRIX, ST_NONE, PT_FLOAT64))
00087 GET_MATRIX(get_longreal_matrix, floatmax_t, TSGDataType(CT_MATRIX, ST_NONE, PT_FLOATMAX))
00088 #undef GET_MATRIX
00089
00090 void CBinaryFile::get_byte_ndarray(uint8_t*& array, int32_t*& dims, int32_t& num_dims)
00091 {
00092 }
00093
00094 void CBinaryFile::get_char_ndarray(char*& array, int32_t*& dims, int32_t& num_dims)
00095 {
00096 }
00097
00098 void CBinaryFile::get_int_ndarray(int32_t*& array, int32_t*& dims, int32_t& num_dims)
00099 {
00100 }
00101
00102 void CBinaryFile::get_shortreal_ndarray(float32_t*& array, int32_t*& dims, int32_t& num_dims)
00103 {
00104 }
00105
00106 void CBinaryFile::get_real_ndarray(float64_t*& array, int32_t*& dims, int32_t& num_dims)
00107 {
00108 }
00109
00110 void CBinaryFile::get_short_ndarray(int16_t*& array, int32_t*& dims, int32_t& num_dims)
00111 {
00112 }
00113
00114 void CBinaryFile::get_word_ndarray(uint16_t*& array, int32_t*& dims, int32_t& num_dims)
00115 {
00116 }
00117
00118 #define GET_SPARSEMATRIX(fname, sg_type, datatype) \
00119 void CBinaryFile::fname(TSparse<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
00120 { \
00121 if (!(file)) \
00122 SG_ERROR("File invalid.\n"); \
00123 \
00124 TSGDataType dtype(CT_SCALAR, ST_NONE, PT_BOOL); read_header(&dtype); \
00125 if (dtype!=datatype) \
00126 SG_ERROR("Datatype mismatch\n"); \
00127 \
00128 if (fread(&num_vec, sizeof(int32_t), 1, file)!=1) \
00129 SG_ERROR("Failed to read number of vectors\n"); \
00130 \
00131 matrix=new TSparse<sg_type>[num_vec]; \
00132 \
00133 for (int32_t i=0; i<num_vec; i++) \
00134 { \
00135 int32_t len=0; \
00136 if (fread(&len, sizeof(int32_t), 1, file)!=1) \
00137 SG_ERROR("Failed to read sparse vector length of vector idx=%d\n", i); \
00138 matrix[i].num_feat_entries=len; \
00139 TSparseEntry<sg_type>* vec = new TSparseEntry<sg_type>[len]; \
00140 if (fread(vec, sizeof(TSparseEntry<sg_type>), len, file)!= (size_t) len) \
00141 SG_ERROR("Failed to read sparse vector %d\n", i); \
00142 matrix[i].features=vec; \
00143 } \
00144 }
00145 GET_SPARSEMATRIX(get_bool_sparsematrix, bool, TSGDataType(CT_MATRIX, ST_NONE, PT_BOOL))
00146 GET_SPARSEMATRIX(get_char_sparsematrix, char, TSGDataType(CT_MATRIX, ST_NONE, PT_CHAR))
00147 GET_SPARSEMATRIX(get_byte_sparsematrix, uint8_t, TSGDataType(CT_MATRIX, ST_NONE, PT_UINT8))
00148 GET_SPARSEMATRIX(get_int8_sparsematrix, int8_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT8))
00149 GET_SPARSEMATRIX(get_int_sparsematrix, int32_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT32))
00150 GET_SPARSEMATRIX(get_uint_sparsematrix, uint32_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT32))
00151 GET_SPARSEMATRIX(get_long_sparsematrix, int64_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT64))
00152 GET_SPARSEMATRIX(get_ulong_sparsematrix, uint64_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT64))
00153 GET_SPARSEMATRIX(get_short_sparsematrix, int16_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT16))
00154 GET_SPARSEMATRIX(get_word_sparsematrix, uint16_t, TSGDataType(CT_MATRIX, ST_NONE, PT_INT16))
00155 GET_SPARSEMATRIX(get_shortreal_sparsematrix, float32_t, TSGDataType(CT_MATRIX, ST_NONE, PT_FLOAT32))
00156 GET_SPARSEMATRIX(get_real_sparsematrix, float64_t, TSGDataType(CT_MATRIX, ST_NONE, PT_FLOAT64))
00157 GET_SPARSEMATRIX(get_longreal_sparsematrix, floatmax_t, TSGDataType(CT_MATRIX, ST_NONE, PT_FLOATMAX))
00158 #undef GET_SPARSEMATRIX
00159
00160
00161 #define GET_STRING_LIST(fname, sg_type, datatype) \
00162 void CBinaryFile::fname(TString<sg_type>*& strings, int32_t& num_str, int32_t& max_string_len) \
00163 { \
00164 strings=NULL; \
00165 num_str=0; \
00166 max_string_len=0; \
00167 \
00168 if (!file) \
00169 SG_ERROR("File invalid.\n"); \
00170 \
00171 TSGDataType dtype(CT_SCALAR, ST_NONE, PT_BOOL); read_header(&dtype); \
00172 if (dtype!=datatype) \
00173 SG_ERROR("Datatype mismatch\n"); \
00174 \
00175 if (fread(&num_str, sizeof(int32_t), 1, file)!=1) \
00176 SG_ERROR("Failed to read number of strings\n"); \
00177 \
00178 strings=new TString<sg_type>[num_str]; \
00179 \
00180 for (int32_t i=0; i<num_str; i++) \
00181 { \
00182 int32_t len=0; \
00183 if (fread(&len, sizeof(int32_t), 1, file)!=1) \
00184 SG_ERROR("Failed to read string length of string with idx=%d\n", i); \
00185 strings[i].length=len; \
00186 sg_type* str = new sg_type[len]; \
00187 if (fread(str, sizeof(sg_type), len, file)!= (size_t) len) \
00188 SG_ERROR("Failed to read string %d\n", i); \
00189 strings[i].string=str; \
00190 } \
00191 }
00192
00193 GET_STRING_LIST(get_char_string_list, char, TSGDataType(CT_VECTOR, ST_NONE, PT_CHAR))
00194 GET_STRING_LIST(get_byte_string_list, uint8_t, TSGDataType(CT_VECTOR, ST_NONE, PT_UINT8))
00195 GET_STRING_LIST(get_int8_string_list, int8_t, TSGDataType(CT_VECTOR, ST_NONE, PT_INT8))
00196 GET_STRING_LIST(get_int_string_list, int32_t, TSGDataType(CT_VECTOR, ST_NONE, PT_INT32))
00197 GET_STRING_LIST(get_uint_string_list, uint32_t, TSGDataType(CT_VECTOR, ST_NONE, PT_INT32))
00198 GET_STRING_LIST(get_long_string_list, int64_t, TSGDataType(CT_VECTOR, ST_NONE, PT_INT64))
00199 GET_STRING_LIST(get_ulong_string_list, uint64_t, TSGDataType(CT_VECTOR, ST_NONE, PT_INT64))
00200 GET_STRING_LIST(get_short_string_list, int16_t, TSGDataType(CT_VECTOR, ST_NONE, PT_INT16))
00201 GET_STRING_LIST(get_word_string_list, uint16_t, TSGDataType(CT_VECTOR, ST_NONE, PT_INT16))
00202 GET_STRING_LIST(get_shortreal_string_list, float32_t, TSGDataType(CT_VECTOR, ST_NONE, PT_FLOAT32))
00203 GET_STRING_LIST(get_real_string_list, float64_t, TSGDataType(CT_VECTOR, ST_NONE, PT_FLOAT64))
00204 GET_STRING_LIST(get_longreal_string_list, floatmax_t, TSGDataType(CT_VECTOR, ST_NONE, PT_FLOATMAX))
00205 #undef GET_STRING_LIST
00206
00209 #define SET_VECTOR(fname, sg_type, dtype) \
00210 void CBinaryFile::fname(const sg_type* vec, int32_t len) \
00211 { \
00212 if (!(file && vec)) \
00213 SG_ERROR("File or vector invalid.\n"); \
00214 \
00215 TSGDataType t dtype; write_header(&t); \
00216 \
00217 if (fwrite(&len, sizeof(int32_t), 1, file)!=1 || \
00218 fwrite(vec, sizeof(sg_type), len, file)!=(size_t) len) \
00219 SG_ERROR("Failed to write vector\n"); \
00220 }
00221 SET_VECTOR(set_byte_vector, uint8_t, (CT_VECTOR, ST_NONE, PT_UINT8))
00222 SET_VECTOR(set_char_vector, char, (CT_VECTOR, ST_NONE, PT_CHAR))
00223 SET_VECTOR(set_int_vector, int32_t, (CT_VECTOR, ST_NONE, PT_INT32))
00224 SET_VECTOR(set_shortreal_vector, float32_t, (CT_VECTOR, ST_NONE, PT_FLOAT32))
00225 SET_VECTOR(set_real_vector, float64_t, (CT_VECTOR, ST_NONE, PT_FLOAT64))
00226 SET_VECTOR(set_short_vector, int16_t, (CT_VECTOR, ST_NONE, PT_INT16))
00227 SET_VECTOR(set_word_vector, uint16_t, (CT_VECTOR, ST_NONE, PT_INT16))
00228 #undef SET_VECTOR
00229
00230 #define SET_MATRIX(fname, sg_type, dtype) \
00231 void CBinaryFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
00232 { \
00233 if (!(file && matrix)) \
00234 SG_ERROR("File or matrix invalid.\n"); \
00235 \
00236 TSGDataType t dtype; write_header(&t); \
00237 \
00238 if (fwrite(&num_feat, sizeof(int32_t), 1, file)!=1 || \
00239 fwrite(&num_vec, sizeof(int32_t), 1, file)!=1 || \
00240 fwrite(matrix, sizeof(sg_type)*num_feat, num_vec, file)!=(size_t) num_vec) \
00241 SG_ERROR("Failed to write Matrix\n"); \
00242 }
00243 SET_MATRIX(set_char_matrix, char, (CT_MATRIX, ST_NONE, PT_CHAR))
00244 SET_MATRIX(set_byte_matrix, uint8_t, (CT_MATRIX, ST_NONE, PT_UINT8))
00245 SET_MATRIX(set_int8_matrix, int8_t, (CT_MATRIX, ST_NONE, PT_INT8))
00246 SET_MATRIX(set_int_matrix, int32_t, (CT_MATRIX, ST_NONE, PT_INT32))
00247 SET_MATRIX(set_uint_matrix, uint32_t, (CT_MATRIX, ST_NONE, PT_INT32))
00248 SET_MATRIX(set_long_matrix, int64_t, (CT_MATRIX, ST_NONE, PT_INT64))
00249 SET_MATRIX(set_ulong_matrix, uint64_t, (CT_MATRIX, ST_NONE, PT_INT64))
00250 SET_MATRIX(set_short_matrix, int16_t, (CT_MATRIX, ST_NONE, PT_INT16))
00251 SET_MATRIX(set_word_matrix, uint16_t, (CT_MATRIX, ST_NONE, PT_INT16))
00252 SET_MATRIX(set_shortreal_matrix, float32_t, (CT_MATRIX, ST_NONE, PT_FLOAT32))
00253 SET_MATRIX(set_real_matrix, float64_t, (CT_MATRIX, ST_NONE, PT_FLOAT64))
00254 SET_MATRIX(set_longreal_matrix, floatmax_t, (CT_MATRIX, ST_NONE, PT_FLOATMAX))
00255 #undef SET_MATRIX
00256
00257 #define SET_SPARSEMATRIX(fname, sg_type, dtype) \
00258 void CBinaryFile::fname(const TSparse<sg_type>* matrix, \
00259 int32_t num_feat, int32_t num_vec) \
00260 { \
00261 if (!(file && matrix)) \
00262 SG_ERROR("File or matrix invalid.\n"); \
00263 \
00264 TSGDataType t dtype; write_header(&t); \
00265 \
00266 if (fwrite(&num_vec, sizeof(int32_t), 1, file)!=1) \
00267 SG_ERROR("Failed to write Sparse Matrix\n"); \
00268 \
00269 for (int32_t i=0; i<num_vec; i++) \
00270 { \
00271 TSparseEntry<sg_type>* vec = matrix[i].features; \
00272 int32_t len=matrix[i].num_feat_entries; \
00273 if ((fwrite(&len, sizeof(int32_t), 1, file)!=1) || \
00274 (fwrite(vec, sizeof(TSparseEntry<sg_type>), len, file)!= (size_t) len)) \
00275 SG_ERROR("Failed to write Sparse Matrix\n"); \
00276 } \
00277 }
00278 SET_SPARSEMATRIX(set_bool_sparsematrix, bool, (CT_MATRIX, ST_NONE, PT_BOOL))
00279 SET_SPARSEMATRIX(set_char_sparsematrix, char, (CT_MATRIX, ST_NONE, PT_CHAR))
00280 SET_SPARSEMATRIX(set_byte_sparsematrix, uint8_t, (CT_MATRIX, ST_NONE, PT_UINT8))
00281 SET_SPARSEMATRIX(set_int8_sparsematrix, int8_t, (CT_MATRIX, ST_NONE, PT_INT8))
00282 SET_SPARSEMATRIX(set_int_sparsematrix, int32_t, (CT_MATRIX, ST_NONE, PT_INT32))
00283 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, (CT_MATRIX, ST_NONE, PT_INT32))
00284 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, (CT_MATRIX, ST_NONE, PT_INT64))
00285 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, (CT_MATRIX, ST_NONE, PT_INT64))
00286 SET_SPARSEMATRIX(set_short_sparsematrix, int16_t, (CT_MATRIX, ST_NONE, PT_INT16))
00287 SET_SPARSEMATRIX(set_word_sparsematrix, uint16_t, (CT_MATRIX, ST_NONE, PT_INT16))
00288 SET_SPARSEMATRIX(set_shortreal_sparsematrix, float32_t, (CT_MATRIX, ST_NONE, PT_FLOAT32))
00289 SET_SPARSEMATRIX(set_real_sparsematrix, float64_t, (CT_MATRIX, ST_NONE, PT_FLOAT64))
00290 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, (CT_MATRIX, ST_NONE, PT_FLOATMAX))
00291 #undef SET_SPARSEMATRIX
00292
00293 #define SET_STRING_LIST(fname, sg_type, dtype) \
00294 void CBinaryFile::fname(const TString<sg_type>* strings, int32_t num_str) \
00295 { \
00296 if (!(file && strings)) \
00297 SG_ERROR("File or strings invalid.\n"); \
00298 \
00299 TSGDataType t dtype; write_header(&t); \
00300 for (int32_t i=0; i<num_str; i++) \
00301 { \
00302 int32_t len = strings[i].length; \
00303 if ((fwrite(&len, sizeof(int32_t), 1, file)!=1) || \
00304 (fwrite(strings[i].string, sizeof(sg_type), len, file)!= (size_t) len)) \
00305 SG_ERROR("Failed to write Sparse Matrix\n"); \
00306 } \
00307 }
00308 SET_STRING_LIST(set_char_string_list, char, (CT_VECTOR, ST_NONE, PT_CHAR))
00309 SET_STRING_LIST(set_byte_string_list, uint8_t, (CT_VECTOR, ST_NONE, PT_UINT8))
00310 SET_STRING_LIST(set_int8_string_list, int8_t, (CT_VECTOR, ST_NONE, PT_INT8))
00311 SET_STRING_LIST(set_int_string_list, int32_t, (CT_VECTOR, ST_NONE, PT_INT32))
00312 SET_STRING_LIST(set_uint_string_list, uint32_t, (CT_VECTOR, ST_NONE, PT_INT32))
00313 SET_STRING_LIST(set_long_string_list, int64_t, (CT_VECTOR, ST_NONE, PT_INT64))
00314 SET_STRING_LIST(set_ulong_string_list, uint64_t, (CT_VECTOR, ST_NONE, PT_INT64))
00315 SET_STRING_LIST(set_short_string_list, int16_t, (CT_VECTOR, ST_NONE, PT_INT16))
00316 SET_STRING_LIST(set_word_string_list, uint16_t, (CT_VECTOR, ST_NONE, PT_INT16))
00317 SET_STRING_LIST(set_shortreal_string_list, float32_t, (CT_VECTOR, ST_NONE, PT_FLOAT32))
00318 SET_STRING_LIST(set_real_string_list, float64_t, (CT_VECTOR, ST_NONE, PT_FLOAT64))
00319 SET_STRING_LIST(set_longreal_string_list, floatmax_t, (CT_VECTOR, ST_NONE, PT_FLOATMAX))
00320 #undef SET_STRING_LIST
00321
00322
00323 int32_t CBinaryFile::parse_first_header(TSGDataType& type)
00324 {
00325 return -1;
00326 }
00327
00328 int32_t CBinaryFile::parse_next_header(TSGDataType& type)
00329 {
00330 return -1;
00331 }
00332
00333 void
00334 CBinaryFile::read_header(TSGDataType* dest)
00335 {
00336 ASSERT(file);
00337
00338 char fourcc[4];
00339 uint16_t endian=0;
00340
00341 if (!((fread(&fourcc, sizeof(char), 4, file)==4) &&
00342 (fread(&endian, sizeof(uint16_t), 1, file)== 1) &&
00343 (fread(&dest->m_ctype, sizeof(dest->m_ctype), 1, file)== 1)
00344 && (fread(&dest->m_ptype, sizeof(dest->m_ptype), 1, file)== 1)
00345 ))
00346 SG_ERROR("Error reading header\n");
00347
00348 if (strncmp(fourcc, "SG01", 4))
00349 SG_ERROR("Header mismatch, expected SG01\n");
00350 }
00351
00352 void
00353 CBinaryFile::write_header(const TSGDataType* datatype)
00354 {
00355 ASSERT(file);
00356
00357 const char* fourcc="SG01";
00358 uint16_t endian=0x1234;
00359
00360 if (!((fwrite(fourcc, sizeof(char), 4, file)==4) &&
00361 (fwrite(&endian, sizeof(uint16_t), 1, file)==1) &&
00362 (fwrite(&datatype->m_ctype, sizeof(datatype->m_ctype), 1,
00363 file)==1)
00364 && (fwrite(&datatype->m_ptype, sizeof(datatype->m_ptype), 1,
00365 file)==1)
00366 ))
00367 SG_ERROR("Error writing header\n");
00368 }