SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLDataHDF5File.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Copyright (C) 2013 Zhengyang Liu (zhengyangl)
8  */
9 
10 #include <shogun/lib/config.h>
11 
12 #if defined(HAVE_HDF5) && defined( HAVE_CURL)
13 
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <hdf5.h>
18 #include <curl/curl.h>
19 #include <shogun/lib/memory.h>
21 
24 
25 using namespace shogun;
26 
27 CMLDataHDF5File::CMLDataHDF5File()
28 {
29  SG_UNSTABLE("CMLDataHDF5File::CMLDataHDF5File()", "\n")
30 
31  get_boolean_type();
32  h5file = -1;
33 }
34 
/**
 * Write callback registered with libcurl: forwards one received chunk to a
 * stdio stream.
 *
 * @param ptr    first byte of the chunk
 * @param size   size in bytes of one element
 * @param nmemb  number of elements in the chunk
 * @param stream stream the chunk is written to
 * @return element count reported by fwrite()
 */
size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream)
{
	return fwrite(ptr, size, nmemb, stream);
}
39 
40 CMLDataHDF5File::CMLDataHDF5File(char* data_name,
41  const char* name,
42  const char* url_prefix) : CFile()
43 {
44  get_boolean_type();
45  H5Eset_auto2(H5E_DEFAULT, NULL, NULL);
46 
47  if (name)
48  set_variable_name(name);
49 
50  CURL *curl;
51  FILE *fp=NULL;
52 
53  mldata_url = SG_CALLOC(char, strlen(url_prefix)+strlen(data_name)+1);
54  strcat(mldata_url, url_prefix);
55  strcat(mldata_url, data_name);
56 
57  fname = SG_CALLOC(char, strlen((char*)"/tmp/")+strlen(data_name)+strlen((char*)".h5")+1);
58  strcat(fname, (char*) "/tmp/");
59  strcat(fname, data_name);
60  strcat(fname, (char*) ".h5");
61 
62  curl = curl_easy_init();
63  fp = fopen(fname,"wb");
64 
65  if (!fp)
66  {
67  SG_ERROR("Could not open file '%s'\n", fname)
68  return;
69  }
70 
71  if (curl) {
72  curl_easy_setopt(curl, CURLOPT_URL, mldata_url);
73  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &write_data);
74  curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
75  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
76  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
77  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
78  curl_easy_perform(curl);
79  curl_easy_cleanup(curl);
80  }
81 
82  if(fp)
83  fclose(fp);
84 
85  h5file = H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
86 
87  if (h5file<0)
88  SG_ERROR("Could not open data repository '%s'\n", data_name)
89 }
90 
91 CMLDataHDF5File::~CMLDataHDF5File()
92 {
93  H5Fclose(h5file);
94  remove(fname);
95  SG_FREE(fname);
96  SG_FREE(mldata_url);
97 }
98 
99 #define GET_VECTOR(fname, sg_type, datatype) \
100 void CMLDataHDF5File::fname(sg_type*& vec, int32_t& len) \
101 { \
102  if (!h5file) \
103  SG_ERROR("File invalid.\n") \
104  \
105  int32_t* dims; \
106  int32_t ndims; \
107  int64_t nelements; \
108  hid_t dataset=H5Dopen2(h5file, variable_name, H5P_DEFAULT); \
109  if (dataset<0) \
110  SG_ERROR("Error opening data set\n") \
111  hid_t dtype=H5Dget_type(dataset); \
112  H5T_class_t t_class=H5Tget_class(dtype); \
113  TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t); \
114  if (h5_type==-1) \
115  { \
116  H5Dclose(dataset); \
117  SG_INFO("No compatible datatype found\n") \
118  } \
119  get_dims(dataset, dims, ndims, nelements); \
120  if (!((ndims==2 && dims[0]==nelements && dims[1]==1) || \
121  (ndims==2 && dims[0]==1 && dims[1]==nelements) || \
122  (ndims==1 && dims[0]==nelements))) \
123  SG_ERROR("Error not a 1-dimensional vector (ndims=%d, dims[0]=%d)\n", ndims, dims[0]) \
124  vec=SG_MALLOC(sg_type, nelements); \
125  len=nelements; \
126  herr_t status = H5Dread(dataset, h5_type, H5S_ALL, \
127  H5S_ALL, H5P_DEFAULT, vec); \
128  H5Dclose(dataset); \
129  H5Tclose(dtype); \
130  SG_FREE(dims); \
131  if (status<0) \
132  { \
133  SG_FREE(vec); \
134  SG_ERROR("Error reading dataset\n") \
135  } \
136 }
137 
138 GET_VECTOR(get_vector, bool, (CT_VECTOR, ST_NONE, PT_BOOL))
139 GET_VECTOR(get_vector, int8_t, (CT_VECTOR, ST_NONE, PT_INT8))
140 GET_VECTOR(get_vector, uint8_t, (CT_VECTOR, ST_NONE, PT_UINT8))
141 GET_VECTOR(get_vector, char, (CT_VECTOR, ST_NONE, PT_CHAR))
142 GET_VECTOR(get_vector, int32_t, (CT_VECTOR, ST_NONE, PT_INT32))
143 GET_VECTOR(get_vector, uint32_t, (CT_VECTOR, ST_NONE, PT_UINT32))
144 GET_VECTOR(get_vector, float32_t, (CT_VECTOR, ST_NONE, PT_FLOAT32))
145 GET_VECTOR(get_vector, float64_t, (CT_VECTOR, ST_NONE, PT_FLOAT64))
146 GET_VECTOR(get_vector, floatmax_t, (CT_VECTOR, ST_NONE, PT_FLOATMAX))
147 GET_VECTOR(get_vector, int16_t, (CT_VECTOR, ST_NONE, PT_INT16))
148 GET_VECTOR(get_vector, uint16_t, (CT_VECTOR, ST_NONE, PT_INT16))
149 GET_VECTOR(get_vector, int64_t, (CT_VECTOR, ST_NONE, PT_INT64))
150 GET_VECTOR(get_vector, uint64_t, (CT_VECTOR, ST_NONE, PT_UINT64))
151 #undef GET_VECTOR
152 
153 #define GET_MATRIX(fname, sg_type, datatype) \
154 void CMLDataHDF5File::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
155 { \
156  if (!h5file) \
157  SG_ERROR("File invalid.\n") \
158  \
159  int32_t* dims; \
160  int32_t ndims; \
161  int64_t nelements; \
162  hid_t dataset = H5Dopen2(h5file, variable_name, H5P_DEFAULT); \
163  if (dataset<0) \
164  SG_ERROR("Error opening data set\n") \
165  hid_t dtype = H5Dget_type(dataset); \
166  H5T_class_t t_class=H5Tget_class(dtype); \
167  TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t); \
168  if (h5_type==-1) \
169  { \
170  H5Dclose(dataset); \
171  SG_INFO("No compatible datatype found\n") \
172  } \
173  get_dims(dataset, dims, ndims, nelements); \
174  if (ndims!=2) \
175  SG_ERROR("Error not a 2-dimensional matrix\n") \
176  matrix=SG_MALLOC(sg_type, nelements); \
177  num_feat=dims[0]; \
178  num_vec=dims[1]; \
179  herr_t status = H5Dread(dataset, h5_type, H5S_ALL, \
180  H5S_ALL, H5P_DEFAULT, matrix); \
181  H5Dclose(dataset); \
182  H5Tclose(dtype); \
183  SG_FREE(dims); \
184  if (status<0) \
185  { \
186  SG_FREE(matrix); \
187  SG_ERROR("Error reading dataset\n") \
188  } \
189 }
190 
191 GET_MATRIX(get_matrix, bool, (CT_MATRIX, ST_NONE, PT_BOOL))
192 GET_MATRIX(get_matrix, char, (CT_MATRIX, ST_NONE, PT_CHAR))
193 GET_MATRIX(get_matrix, uint8_t, (CT_MATRIX, ST_NONE, PT_UINT8))
194 GET_MATRIX(get_matrix, int32_t, (CT_MATRIX, ST_NONE, PT_INT32))
195 GET_MATRIX(get_matrix, uint32_t, (CT_MATRIX, ST_NONE, PT_INT32))
196 GET_MATRIX(get_matrix, int64_t, (CT_MATRIX, ST_NONE, PT_INT64))
197 GET_MATRIX(get_matrix, uint64_t, (CT_MATRIX, ST_NONE, PT_INT64))
198 GET_MATRIX(get_matrix, int16_t, (CT_MATRIX, ST_NONE, PT_INT16))
199 GET_MATRIX(get_matrix, uint16_t, (CT_MATRIX, ST_NONE, PT_INT16))
200 GET_MATRIX(get_matrix, float32_t, (CT_MATRIX, ST_NONE, PT_FLOAT32))
201 GET_MATRIX(get_matrix, float64_t, (CT_MATRIX, ST_NONE, PT_FLOAT64))
202 GET_MATRIX(get_matrix, floatmax_t, (CT_MATRIX, ST_NONE, PT_FLOATMAX))
203 #undef GET_MATRIX
204 
205 void CMLDataHDF5File::get_ndarray(uint8_t*& array, int32_t*& dims, int32_t& num_dims)
206 {
207 }
208 
209 void CMLDataHDF5File::get_ndarray(char*& array, int32_t*& dims, int32_t& num_dims)
210 {
211 }
212 
213 void CMLDataHDF5File::get_ndarray(int32_t*& array, int32_t*& dims, int32_t& num_dims)
214 {
215 }
216 
217 void CMLDataHDF5File::get_ndarray(float32_t*& array, int32_t*& dims, int32_t& num_dims)
218 {
219 }
220 
221 void CMLDataHDF5File::get_ndarray(float64_t*& array, int32_t*& dims, int32_t& num_dims)
222 {
223 }
224 
225 void CMLDataHDF5File::get_ndarray(int16_t*& array, int32_t*& dims, int32_t& num_dims)
226 {
227 }
228 
229 void CMLDataHDF5File::get_ndarray(uint16_t*& array, int32_t*& dims, int32_t& num_dims)
230 {
231 }
232 
233 #define GET_SPARSEMATRIX(fname, sg_type, datatype) \
234 void CMLDataHDF5File::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
235 { \
236  if (!(file)) \
237  SG_ERROR("File invalid.\n") \
238 }
239 GET_SPARSEMATRIX(get_sparse_matrix, bool, DT_SPARSE_BOOL)
240 GET_SPARSEMATRIX(get_sparse_matrix, char, DT_SPARSE_CHAR)
241 GET_SPARSEMATRIX(get_sparse_matrix, int8_t, DT_SPARSE_INT8)
242 GET_SPARSEMATRIX(get_sparse_matrix, uint8_t, DT_SPARSE_BYTE)
243 GET_SPARSEMATRIX(get_sparse_matrix, int32_t, DT_SPARSE_INT)
244 GET_SPARSEMATRIX(get_sparse_matrix, uint32_t, DT_SPARSE_UINT)
245 GET_SPARSEMATRIX(get_sparse_matrix, int64_t, DT_SPARSE_LONG)
246 GET_SPARSEMATRIX(get_sparse_matrix, uint64_t, DT_SPARSE_ULONG)
247 GET_SPARSEMATRIX(get_sparse_matrix, int16_t, DT_SPARSE_SHORT)
248 GET_SPARSEMATRIX(get_sparse_matrix, uint16_t, DT_SPARSE_WORD)
249 GET_SPARSEMATRIX(get_sparse_matrix, float32_t, DT_SPARSE_SHORTREAL)
250 GET_SPARSEMATRIX(get_sparse_matrix, float64_t, DT_SPARSE_REAL)
251 GET_SPARSEMATRIX(get_sparse_matrix, floatmax_t, DT_SPARSE_LONGREAL)
252 #undef GET_SPARSEMATRIX
253 
254 
255 #define GET_STRING_LIST(fname, sg_type, datatype) \
256 void CMLDataHDF5File::fname(SGString<sg_type>*& strings, int32_t& num_str, int32_t& max_string_len) \
257 { \
258 }
259 
260 GET_STRING_LIST(get_string_list, bool, DT_STRING_BOOL)
261 GET_STRING_LIST(get_string_list, char, DT_STRING_CHAR)
262 GET_STRING_LIST(get_string_list, int8_t, DT_STRING_INT8)
263 GET_STRING_LIST(get_string_list, uint8_t, DT_STRING_BYTE)
264 GET_STRING_LIST(get_string_list, int32_t, DT_STRING_INT)
265 GET_STRING_LIST(get_string_list, uint32_t, DT_STRING_UINT)
266 GET_STRING_LIST(get_string_list, int64_t, DT_STRING_LONG)
267 GET_STRING_LIST(get_string_list, uint64_t, DT_STRING_ULONG)
268 GET_STRING_LIST(get_string_list, int16_t, DT_STRING_SHORT)
269 GET_STRING_LIST(get_string_list, uint16_t, DT_STRING_WORD)
270 GET_STRING_LIST(get_string_list, float32_t, DT_STRING_SHORTREAL)
271 GET_STRING_LIST(get_string_list, float64_t, DT_STRING_REAL)
272 GET_STRING_LIST(get_string_list, floatmax_t, DT_STRING_LONGREAL)
273 #undef GET_STRING_LIST
274 
275 void CMLDataHDF5File::get_boolean_type()
276 {
277  boolean_type=H5T_NATIVE_UCHAR;
278  switch (sizeof(bool))
279  {
280  case 1:
281  boolean_type = H5T_NATIVE_UCHAR;
282  break;
283  case 2:
284  boolean_type = H5T_NATIVE_UINT16;
285  break;
286  case 4:
287  boolean_type = H5T_NATIVE_UINT32;
288  break;
289  case 8:
290  boolean_type = H5T_NATIVE_UINT64;
291  break;
292  default:
293  SG_ERROR("Boolean type not supported on this platform\n")
294  }
295 }
296 
297 hid_t CMLDataHDF5File::get_compatible_type(H5T_class_t t_class,
298  const TSGDataType* datatype)
299 {
300  switch (t_class)
301  {
302  case H5T_FLOAT:
303  case H5T_INTEGER:
304  switch (datatype->m_ptype)
305  {
306  case PT_BOOL: return boolean_type;
307  case PT_CHAR: return H5T_NATIVE_CHAR;
308  case PT_INT8: return H5T_NATIVE_INT8;
309  case PT_UINT8: return H5T_NATIVE_UINT8;
310  case PT_INT16: return H5T_NATIVE_INT16;
311  case PT_UINT16: return H5T_NATIVE_UINT16;
312  case PT_INT32: return H5T_NATIVE_INT32;
313  case PT_UINT32: return H5T_NATIVE_UINT32;
314  case PT_INT64: return H5T_NATIVE_INT64;
315  case PT_UINT64: return H5T_NATIVE_UINT64;
316  case PT_FLOAT32: return H5T_NATIVE_FLOAT;
317  case PT_FLOAT64: return H5T_NATIVE_DOUBLE;
318  case PT_FLOATMAX: return H5T_NATIVE_LDOUBLE;
319  case PT_COMPLEX128:
320  SG_ERROR("complex128_t not compatible with HDF5File!");
321  return -1;
322  case PT_SGOBJECT:
323  SG_ERROR("Implementation error during writing "
324  "HDF5File!");
325  return -1;
326  }
327  case H5T_STRING:
328  SG_ERROR("Strings not supported")
329  return -1;
330  case H5T_VLEN:
331  SG_ERROR("Variable length containers currently not supported")
332  return -1;
333  case H5T_ARRAY:
334  SG_ERROR("Array containers currently not supported")
335  return -1;
336  default:
337  SG_ERROR("Datatype mismatchn")
338  return -1;
339  }
340 }
341 
342 void CMLDataHDF5File::get_dims(hid_t dataset, int32_t*& dims, int32_t& ndims, int64_t& total_elements)
343 {
344  hid_t dataspace = H5Dget_space(dataset);
345  if (dataspace<0)
346  SG_ERROR("Error obtaining hdf5 dataspace\n")
347 
348  ndims = H5Sget_simple_extent_ndims(dataspace);
349  total_elements=H5Sget_simple_extent_npoints(dataspace);
350  hsize_t* dims_out=SG_MALLOC(hsize_t, ndims);
351  dims=SG_MALLOC(int32_t, ndims);
352  H5Sget_simple_extent_dims(dataspace, dims_out, NULL);
353  for (int32_t i=0; i<ndims; i++)
354  dims[i]=dims_out[i];
355  SG_FREE(dims_out);
356  H5Sclose(dataspace);
357 }
358 
359 void CMLDataHDF5File::create_group_hierarchy()
360 {
361  char* vname=get_strdup(variable_name);
362  int32_t vlen=strlen(vname);
363  for (int32_t i=0; i<vlen; i++)
364  {
365  if (i!=0 && vname[i]=='/')
366  {
367  vname[i]='\0';
368  hid_t g = H5Gopen2(h5file, vname, H5P_DEFAULT);
369  if (g<0)
370  {
371  g=H5Gcreate2(h5file, vname, H5P_DEFAULT, H5P_DEFAULT,
372  H5P_DEFAULT);
373  if (g<0)
374  SG_ERROR("Error creating group '%s'\n", vname)
375  vname[i]='/';
376  }
377  H5Gclose(g);
378  }
379  }
380  SG_FREE(vname);
381 }
382 #endif // HAVE_CURL && HAVE_HDF5

SHOGUN Machine Learning Toolbox - Documentation