SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HDF5File.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2010 Soeren Sonnenburg
8  * Copyright (C) 2010 Berlin Institute of Technology
9  */
10 
11 #include <shogun/lib/config.h>
12 
13 #ifdef HAVE_HDF5
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <hdf5.h>
18 
19 #include <shogun/io/HDF5File.h>
20 
23 
24 using namespace shogun;
25 
26 CHDF5File::CHDF5File()
27 {
28  SG_UNSTABLE("CHDF5File::CHDF5File()", "\n");
29 
30  get_boolean_type();
31  h5file = -1;
32 }
33 
34 CHDF5File::CHDF5File(char* fname, char rw, const char* name) : CFile()
35 {
36  get_boolean_type();
37  H5Eset_auto2(H5E_DEFAULT, NULL, NULL);
38 
39  if (name)
40  set_variable_name(name);
41 
42  switch (rw)
43  {
44  case 'r':
45  h5file = H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
46  break;
47  case 'w':
48  h5file = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
49  break;
50  case 'a':
51  h5file = H5Fopen(fname, H5F_ACC_RDWR, H5P_DEFAULT);
52  if (h5file <0)
53  h5file = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
54  break;
55  default:
56  SG_ERROR("unknown mode '%c'\n", rw);
57  };
58 
59  if (h5file<0)
60  SG_ERROR("Could not open file '%s'\n", fname);
61 }
62 
63 CHDF5File::~CHDF5File()
64 {
65  H5Fclose(h5file);
66 }
67 
68 #define GET_VECTOR(fname, sg_type, datatype) \
69 void CHDF5File::fname(sg_type*& vec, int32_t& len) \
70 { \
71  if (!h5file) \
72  SG_ERROR("File invalid.\n"); \
73  \
74  int32_t* dims; \
75  int32_t ndims; \
76  int64_t nelements; \
77  hid_t dataset = H5Dopen2(h5file, variable_name, H5P_DEFAULT); \
78  if (dataset<0) \
79  SG_ERROR("Error opening data set\n"); \
80  hid_t dtype = H5Dget_type(dataset); \
81  H5T_class_t t_class=H5Tget_class(dtype); \
82  TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t); \
83  if (h5_type==-1) \
84  { \
85  H5Dclose(dataset); \
86  SG_INFO("No compatible datatype found\n"); \
87  } \
88  get_dims(dataset, dims, ndims, nelements); \
89  if (!((ndims==2 && dims[0]==nelements && dims[1]==1) || \
90  (ndims==2 && dims[0]==1 && dims[1]==nelements) || \
91  (ndims==1 && dims[0]==nelements))) \
92  SG_ERROR("Error not a 1-dimensional vector (ndims=%d, dims[0]=%d)\n", ndims, dims[0]); \
93  vec=SG_MALLOC(sg_type, nelements); \
94  len=nelements; \
95  herr_t status = H5Dread(dataset, h5_type, H5S_ALL, \
96  H5S_ALL, H5P_DEFAULT, vec); \
97  H5Dclose(dataset); \
98  H5Tclose(dtype); \
99  SG_FREE(dims); \
100  if (status<0) \
101  { \
102  SG_FREE(vec); \
103  SG_ERROR("Error reading dataset\n"); \
104  } \
105 }
106 
107 GET_VECTOR(get_vector, bool, (CT_VECTOR, ST_NONE, PT_BOOL))
108 GET_VECTOR(get_vector, uint8_t, (CT_VECTOR, ST_NONE, PT_UINT8))
109 GET_VECTOR(get_vector, char, (CT_VECTOR, ST_NONE, PT_CHAR))
110 GET_VECTOR(get_vector, int32_t, (CT_VECTOR, ST_NONE, PT_INT32))
111 GET_VECTOR(get_vector, float32_t, (CT_VECTOR, ST_NONE, PT_FLOAT32))
112 GET_VECTOR(get_vector, float64_t, (CT_VECTOR, ST_NONE, PT_FLOAT64))
113 GET_VECTOR(get_vector, int16_t, (CT_VECTOR, ST_NONE, PT_INT16))
114 GET_VECTOR(get_vector, uint16_t, (CT_VECTOR, ST_NONE, PT_INT16))
115 #undef GET_VECTOR
116 
117 #define GET_MATRIX(fname, sg_type, datatype) \
118 void CHDF5File::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
119 { \
120  if (!h5file) \
121  SG_ERROR("File invalid.\n"); \
122  \
123  int32_t* dims; \
124  int32_t ndims; \
125  int64_t nelements; \
126  hid_t dataset = H5Dopen2(h5file, variable_name, H5P_DEFAULT); \
127  if (dataset<0) \
128  SG_ERROR("Error opening data set\n"); \
129  hid_t dtype = H5Dget_type(dataset); \
130  H5T_class_t t_class=H5Tget_class(dtype); \
131  TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t); \
132  if (h5_type==-1) \
133  { \
134  H5Dclose(dataset); \
135  SG_INFO("No compatible datatype found\n"); \
136  } \
137  get_dims(dataset, dims, ndims, nelements); \
138  if (ndims!=2) \
139  SG_ERROR("Error not a 2-dimensional matrix\n"); \
140  matrix=SG_MALLOC(sg_type, nelements); \
141  num_feat=dims[0]; \
142  num_vec=dims[1]; \
143  herr_t status = H5Dread(dataset, h5_type, H5S_ALL, \
144  H5S_ALL, H5P_DEFAULT, matrix); \
145  H5Dclose(dataset); \
146  H5Tclose(dtype); \
147  SG_FREE(dims); \
148  if (status<0) \
149  { \
150  SG_FREE(matrix); \
151  SG_ERROR("Error reading dataset\n"); \
152  } \
153 }
154 
155 GET_MATRIX(get_matrix, bool, (CT_MATRIX, ST_NONE, PT_BOOL))
156 GET_MATRIX(get_matrix, char, (CT_MATRIX, ST_NONE, PT_CHAR))
157 GET_MATRIX(get_matrix, uint8_t, (CT_MATRIX, ST_NONE, PT_UINT8))
158 GET_MATRIX(get_matrix, int32_t, (CT_MATRIX, ST_NONE, PT_INT32))
159 GET_MATRIX(get_uint_matrix, uint32_t, (CT_MATRIX, ST_NONE, PT_INT32))
160 GET_MATRIX(get_long_matrix, int64_t, (CT_MATRIX, ST_NONE, PT_INT64))
161 GET_MATRIX(get_ulong_matrix, uint64_t, (CT_MATRIX, ST_NONE, PT_INT64))
162 GET_MATRIX(get_matrix, int16_t, (CT_MATRIX, ST_NONE, PT_INT16))
163 GET_MATRIX(get_matrix, uint16_t, (CT_MATRIX, ST_NONE, PT_INT16))
164 GET_MATRIX(get_matrix, float32_t, (CT_MATRIX, ST_NONE, PT_FLOAT32))
165 GET_MATRIX(get_matrix, float64_t, (CT_MATRIX, ST_NONE, PT_FLOAT64))
166 GET_MATRIX(get_longreal_matrix, floatmax_t, (CT_MATRIX, ST_NONE, PT_FLOATMAX))
167 #undef GET_MATRIX
168 
169 void CHDF5File::get_ndarray(uint8_t*& array, int32_t*& dims, int32_t& num_dims)
170 {
171 }
172 
173 void CHDF5File::get_ndarray(char*& array, int32_t*& dims, int32_t& num_dims)
174 {
175 }
176 
177 void CHDF5File::get_ndarray(int32_t*& array, int32_t*& dims, int32_t& num_dims)
178 {
179 }
180 
181 void CHDF5File::get_ndarray(float32_t*& array, int32_t*& dims, int32_t& num_dims)
182 {
183 }
184 
185 void CHDF5File::get_ndarray(float64_t*& array, int32_t*& dims, int32_t& num_dims)
186 {
187 }
188 
189 void CHDF5File::get_ndarray(int16_t*& array, int32_t*& dims, int32_t& num_dims)
190 {
191 }
192 
193 void CHDF5File::get_ndarray(uint16_t*& array, int32_t*& dims, int32_t& num_dims)
194 {
195 }
196 
197 #define GET_SPARSEMATRIX(fname, sg_type, datatype) \
198 void CHDF5File::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
199 { \
200  if (!(file)) \
201  SG_ERROR("File invalid.\n"); \
202 }
203 GET_SPARSEMATRIX(get_sparse_matrix, bool, DT_SPARSE_BOOL)
204 GET_SPARSEMATRIX(get_sparse_matrix, char, DT_SPARSE_CHAR)
205 GET_SPARSEMATRIX(get_sparse_matrix, uint8_t, DT_SPARSE_BYTE)
206 GET_SPARSEMATRIX(get_sparse_matrix, int32_t, DT_SPARSE_INT)
207 GET_SPARSEMATRIX(get_uint_sparsematrix, uint32_t, DT_SPARSE_UINT)
208 GET_SPARSEMATRIX(get_long_sparsematrix, int64_t, DT_SPARSE_LONG)
209 GET_SPARSEMATRIX(get_ulong_sparsematrix, uint64_t, DT_SPARSE_ULONG)
210 GET_SPARSEMATRIX(get_sparse_matrix, int16_t, DT_SPARSE_SHORT)
211 GET_SPARSEMATRIX(get_sparse_matrix, uint16_t, DT_SPARSE_WORD)
212 GET_SPARSEMATRIX(get_sparse_matrix, float32_t, DT_SPARSE_SHORTREAL)
213 GET_SPARSEMATRIX(get_sparse_matrix, float64_t, DT_SPARSE_REAL)
214 GET_SPARSEMATRIX(get_longreal_sparsematrix, floatmax_t, DT_SPARSE_LONGREAL)
215 #undef GET_SPARSEMATRIX
216 
217 
218 #define GET_STRING_LIST(fname, sg_type, datatype) \
219 void CHDF5File::fname(SGString<sg_type>*& strings, int32_t& num_str, int32_t& max_string_len) \
220 { \
221 }
222 
223 GET_STRING_LIST(get_string_list, bool, DT_STRING_BOOL)
224 GET_STRING_LIST(get_string_list, char, DT_STRING_CHAR)
225 GET_STRING_LIST(get_string_list, uint8_t, DT_STRING_BYTE)
226 GET_STRING_LIST(get_string_list, int32_t, DT_STRING_INT)
227 GET_STRING_LIST(get_uint_string_list, uint32_t, DT_STRING_UINT)
228 GET_STRING_LIST(get_long_string_list, int64_t, DT_STRING_LONG)
229 GET_STRING_LIST(get_ulong_string_list, uint64_t, DT_STRING_ULONG)
230 GET_STRING_LIST(get_string_list, int16_t, DT_STRING_SHORT)
231 GET_STRING_LIST(get_string_list, uint16_t, DT_STRING_WORD)
232 GET_STRING_LIST(get_string_list, float32_t, DT_STRING_SHORTREAL)
233 GET_STRING_LIST(get_string_list, float64_t, DT_STRING_REAL)
234 GET_STRING_LIST(get_longreal_string_list, floatmax_t, DT_STRING_LONGREAL)
235 #undef GET_STRING_LIST
236 
239 #define SET_VECTOR(fname, sg_type, dtype, h5type) \
240 void CHDF5File::fname(const sg_type* vec, int32_t len) \
241 { \
242  if (h5file<0 || !vec) \
243  SG_ERROR("File or vector invalid.\n"); \
244  \
245  create_group_hierarchy(); \
246  \
247  hsize_t dims=(hsize_t) len; \
248  hid_t dataspace, dataset, status; \
249  dataspace=H5Screate_simple(1, &dims, NULL); \
250  if (dataspace<0) \
251  SG_ERROR("Could not create hdf5 dataspace\n"); \
252  dataset=H5Dcreate2(h5file, variable_name, h5type, dataspace, H5P_DEFAULT,\
253  H5P_DEFAULT, H5P_DEFAULT); \
254  if (dataset<0) \
255  { \
256  SG_ERROR("Could not create hdf5 dataset - does" \
257  " dataset '%s' already exist?\n", variable_name); \
258  } \
259  status=H5Dwrite(dataset, h5type, H5S_ALL, H5S_ALL, H5P_DEFAULT, vec); \
260  if (status<0) \
261  SG_ERROR("Failed to write hdf5 dataset\n"); \
262  H5Dclose(dataset); \
263  H5Sclose(dataspace); \
264 }
265 SET_VECTOR(set_vector, bool, DT_VECTOR_BOOL, boolean_type)
266 SET_VECTOR(set_vector, uint8_t, DT_VECTOR_BYTE, H5T_NATIVE_UINT8)
267 SET_VECTOR(set_vector, char, DT_VECTOR_CHAR, H5T_NATIVE_CHAR)
268 SET_VECTOR(set_vector, int32_t, DT_VECTOR_INT, H5T_NATIVE_INT32)
269 SET_VECTOR(set_vector, float32_t, DT_VECTOR_SHORTREAL, H5T_NATIVE_FLOAT)
270 SET_VECTOR(set_vector, float64_t, DT_VECTOR_REAL, H5T_NATIVE_DOUBLE)
271 SET_VECTOR(set_vector, int16_t, DT_VECTOR_SHORT, H5T_NATIVE_INT16)
272 SET_VECTOR(set_vector, uint16_t, DT_VECTOR_WORD, H5T_NATIVE_UINT16)
273 #undef SET_VECTOR
274 
275 #define SET_MATRIX(fname, sg_type, dtype, h5type) \
276 void CHDF5File::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
277 { \
278  if (h5file<0 || !matrix) \
279  SG_ERROR("File or matrix invalid.\n"); \
280  \
281  create_group_hierarchy(); \
282  \
283  hsize_t dims[2]={(hsize_t) num_feat, (hsize_t) num_vec}; \
284  hid_t dataspace, dataset, status; \
285  dataspace=H5Screate_simple(2, dims, NULL); \
286  if (dataspace<0) \
287  SG_ERROR("Could not create hdf5 dataspace\n"); \
288  dataset=H5Dcreate2(h5file, variable_name, h5type, dataspace, H5P_DEFAULT, \
289  H5P_DEFAULT, H5P_DEFAULT); \
290  if (dataset<0) \
291  { \
292  SG_ERROR("Could not create hdf5 dataset - does" \
293  " dataset '%s' already exist?\n", variable_name); \
294  } \
295  status=H5Dwrite(dataset, h5type, H5S_ALL, H5S_ALL, H5P_DEFAULT, matrix); \
296  if (status<0) \
297  SG_ERROR("Failed to write hdf5 dataset\n"); \
298  H5Dclose(dataset); \
299  H5Sclose(dataspace); \
300 }
301 SET_MATRIX(set_matrix, bool, DT_DENSE_BOOL, boolean_type)
302 SET_MATRIX(set_matrix, char, DT_DENSE_CHAR, H5T_NATIVE_CHAR)
303 SET_MATRIX(set_matrix, uint8_t, DT_DENSE_BYTE, H5T_NATIVE_UINT8)
304 SET_MATRIX(set_matrix, int32_t, DT_DENSE_INT, H5T_NATIVE_INT32)
305 SET_MATRIX(set_uint_matrix, uint32_t, DT_DENSE_UINT, H5T_NATIVE_UINT32)
306 SET_MATRIX(set_long_matrix, int64_t, DT_DENSE_LONG, H5T_NATIVE_INT64)
307 SET_MATRIX(set_ulong_matrix, uint64_t, DT_DENSE_ULONG, H5T_NATIVE_UINT64)
308 SET_MATRIX(set_matrix, int16_t, DT_DENSE_SHORT, H5T_NATIVE_INT16)
309 SET_MATRIX(set_matrix, uint16_t, DT_DENSE_WORD, H5T_NATIVE_UINT16)
310 SET_MATRIX(set_matrix, float32_t, DT_DENSE_SHORTREAL, H5T_NATIVE_FLOAT)
311 SET_MATRIX(set_matrix, float64_t, DT_DENSE_REAL, H5T_NATIVE_DOUBLE)
312 SET_MATRIX(set_longreal_matrix, floatmax_t, DT_DENSE_LONGREAL, H5T_NATIVE_LDOUBLE)
313 #undef SET_MATRIX
314 
315 #define SET_SPARSEMATRIX(fname, sg_type, dtype) \
316 void CHDF5File::fname(const SGSparseVector<sg_type>* matrix, \
317  int32_t num_feat, int32_t num_vec) \
318 { \
319  if (!(file && matrix)) \
320  SG_ERROR("File or matrix invalid.\n"); \
321  \
322 }
323 SET_SPARSEMATRIX(set_sparse_matrix, bool, DT_SPARSE_BOOL)
324 SET_SPARSEMATRIX(set_sparse_matrix, char, DT_SPARSE_CHAR)
325 SET_SPARSEMATRIX(set_sparse_matrix, uint8_t, DT_SPARSE_BYTE)
326 SET_SPARSEMATRIX(set_sparse_matrix, int32_t, DT_SPARSE_INT)
327 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, DT_SPARSE_UINT)
328 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, DT_SPARSE_LONG)
329 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, DT_SPARSE_ULONG)
330 SET_SPARSEMATRIX(set_sparse_matrix, int16_t, DT_SPARSE_SHORT)
331 SET_SPARSEMATRIX(set_sparse_matrix, uint16_t, DT_SPARSE_WORD)
332 SET_SPARSEMATRIX(set_sparse_matrix, float32_t, DT_SPARSE_SHORTREAL)
333 SET_SPARSEMATRIX(set_sparse_matrix, float64_t, DT_SPARSE_REAL)
334 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, DT_SPARSE_LONGREAL)
335 #undef SET_SPARSEMATRIX
336 
337 #define SET_STRING_LIST(fname, sg_type, dtype) \
338 void CHDF5File::fname(const SGString<sg_type>* strings, int32_t num_str) \
339 { \
340  if (!(file && strings)) \
341  SG_ERROR("File or strings invalid.\n"); \
342  \
343 }
344 SET_STRING_LIST(set_string_list, bool, DT_STRING_BOOL)
345 SET_STRING_LIST(set_string_list, char, DT_STRING_CHAR)
346 SET_STRING_LIST(set_string_list, uint8_t, DT_STRING_BYTE)
347 SET_STRING_LIST(set_string_list, int32_t, DT_STRING_INT)
348 SET_STRING_LIST(set_uint_string_list, uint32_t, DT_STRING_UINT)
349 SET_STRING_LIST(set_long_string_list, int64_t, DT_STRING_LONG)
350 SET_STRING_LIST(set_ulong_string_list, uint64_t, DT_STRING_ULONG)
351 SET_STRING_LIST(set_string_list, int16_t, DT_STRING_SHORT)
352 SET_STRING_LIST(set_string_list, uint16_t, DT_STRING_WORD)
353 SET_STRING_LIST(set_string_list, float32_t, DT_STRING_SHORTREAL)
354 SET_STRING_LIST(set_string_list, float64_t, DT_STRING_REAL)
355 SET_STRING_LIST(set_longreal_string_list, floatmax_t, DT_STRING_LONGREAL)
356 #undef SET_STRING_LIST
357 
358 void CHDF5File::get_boolean_type()
359 {
360  boolean_type=H5T_NATIVE_UCHAR;
361  switch (sizeof(bool))
362  {
363  case 1:
364  boolean_type = H5T_NATIVE_UCHAR;
365  break;
366  case 2:
367  boolean_type = H5T_NATIVE_UINT16;
368  break;
369  case 4:
370  boolean_type = H5T_NATIVE_UINT32;
371  break;
372  case 8:
373  boolean_type = H5T_NATIVE_UINT64;
374  break;
375  default:
376  SG_ERROR("Boolean type not supported on this platform\n");
377  }
378 }
379 
380 hid_t CHDF5File::get_compatible_type(H5T_class_t t_class,
381  const TSGDataType* datatype)
382 {
383  switch (t_class)
384  {
385  case H5T_FLOAT:
386  case H5T_INTEGER:
387  switch (datatype->m_ptype)
388  {
389  case PT_BOOL: return boolean_type;
390  case PT_CHAR: return H5T_NATIVE_CHAR;
391  case PT_INT8: return H5T_NATIVE_INT8;
392  case PT_UINT8: return H5T_NATIVE_UINT8;
393  case PT_INT16: return H5T_NATIVE_INT16;
394  case PT_UINT16: return H5T_NATIVE_UINT16;
395  case PT_INT32: return H5T_NATIVE_INT32;
396  case PT_UINT32: return H5T_NATIVE_UINT32;
397  case PT_INT64: return H5T_NATIVE_INT64;
398  case PT_UINT64: return H5T_NATIVE_UINT64;
399  case PT_FLOAT32: return H5T_NATIVE_FLOAT;
400  case PT_FLOAT64: return H5T_NATIVE_DOUBLE;
401  case PT_FLOATMAX: return H5T_NATIVE_LDOUBLE;
402  case PT_SGOBJECT:
403  SG_ERROR("Implementation error during writing "
404  "HDF5File!");
405  return -1;
406  }
407  case H5T_STRING:
408  SG_ERROR("Strings not supported");
409  return -1;
410  case H5T_VLEN:
411  SG_ERROR("Variable length containers currently not supported");
412  return -1;
413  case H5T_ARRAY:
414  SG_ERROR("Array containers currently not supported");
415  return -1;
416  default:
417  SG_ERROR("Datatype mismatchn");
418  return -1;
419  }
420 }
421 
422 void CHDF5File::get_dims(hid_t dataset, int32_t*& dims, int32_t& ndims, int64_t& total_elements)
423 {
424  hid_t dataspace = H5Dget_space(dataset);
425  if (dataspace<0)
426  SG_ERROR("Error obtaining hdf5 dataspace\n");
427 
428  ndims = H5Sget_simple_extent_ndims(dataspace);
429  total_elements=H5Sget_simple_extent_npoints(dataspace);
430  hsize_t* dims_out=SG_MALLOC(hsize_t, ndims);
431  dims=SG_MALLOC(int32_t, ndims);
432  H5Sget_simple_extent_dims(dataspace, dims_out, NULL);
433  for (int32_t i=0; i<ndims; i++)
434  dims[i]=dims_out[i];
435  SG_FREE(dims_out);
436  H5Sclose(dataspace);
437 }
438 
439 void CHDF5File::create_group_hierarchy()
440 {
441  char* vname=strdup(variable_name);
442  int32_t vlen=strlen(vname);
443  for (int32_t i=0; i<vlen; i++)
444  {
445  if (i!=0 && vname[i]=='/')
446  {
447  vname[i]='\0';
448  hid_t g = H5Gopen2(h5file, vname, H5P_DEFAULT);
449  if (g<0)
450  {
451  g=H5Gcreate2(h5file, vname, H5P_DEFAULT, H5P_DEFAULT,
452  H5P_DEFAULT);
453  if (g<0)
454  SG_ERROR("Error creating group '%s'\n", vname);
455  vname[i]='/';
456  }
457  H5Gclose(g);
458  }
459  }
460  SG_FREE(vname);
461 }
462 #endif // HDF5

SHOGUN Machine Learning Toolbox - Documentation