SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLDataHDF5File.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Copyright (C) 2013 Zhengyang Liu (zhengyangl)
8  */
9 
10 #include <shogun/lib/config.h>
11 
12 #if defined(HAVE_HDF5) && defined( HAVE_CURL)
13 
14 #include <stdio.h>
15 #include <string.h>
16 #include <hdf5.h>
17 #include <curl/curl.h>
18 #include <shogun/lib/memory.h>
20 
21 using namespace shogun;
22 
23 CMLDataHDF5File::CMLDataHDF5File()
24 {
25  SG_UNSTABLE("CMLDataHDF5File::CMLDataHDF5File()", "\n")
26 
27  get_boolean_type();
28  h5file = -1;
29 }
30 
/** libcurl write callback: appends one received chunk to a stdio stream.
 *
 * @param ptr    start of the received data
 * @param size   size of a single item in bytes
 * @param nmemb  number of items in the chunk
 * @param stream destination stream
 * @return number of bytes written; 0 if stream is NULL
 */
size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) {
    if (!stream)
        return 0;

    /* fwrite() reports complete items, the callback contract is in bytes */
    return fwrite(ptr, size, nmemb, stream) * size;
}
35 
36 CMLDataHDF5File::CMLDataHDF5File(char* data_name,
37  const char* name,
38  const char* url_prefix) : CFile()
39 {
40  get_boolean_type();
41  H5Eset_auto2(H5E_DEFAULT, NULL, NULL);
42 
43  if (name)
44  set_variable_name(name);
45 
46  CURL *curl;
47  FILE *fp=NULL;
48 
49  mldata_url = SG_CALLOC(char, strlen(url_prefix)+strlen(data_name)+1);
50  strcat(mldata_url, url_prefix);
51  strcat(mldata_url, data_name);
52 
53  fname = SG_CALLOC(char, strlen((char*)"/tmp/")+strlen(data_name)+strlen((char*)".h5")+1);
54  strcat(fname, (char*) "/tmp/");
55  strcat(fname, data_name);
56  strcat(fname, (char*) ".h5");
57 
58  curl = curl_easy_init();
59  fp = fopen(fname,"wb");
60 
61  if (!fp)
62  {
63  SG_ERROR("Could not open file '%s'\n", fname)
64  return;
65  }
66 
67  if (curl) {
68  curl_easy_setopt(curl, CURLOPT_URL, mldata_url);
69  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &write_data);
70  curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
71  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
72  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
73  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
74  curl_easy_perform(curl);
75  curl_easy_cleanup(curl);
76  }
77 
78  if(fp)
79  fclose(fp);
80 
81  h5file = H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
82 
83  if (h5file<0)
84  SG_ERROR("Could not open data repository '%s'\n", data_name)
85 }
86 
87 CMLDataHDF5File::~CMLDataHDF5File()
88 {
89  H5Fclose(h5file);
90  remove(fname);
91  SG_FREE(fname);
92  SG_FREE(mldata_url);
93 }
94 
95 #define GET_VECTOR(fname, sg_type, datatype) \
96 void CMLDataHDF5File::fname(sg_type*& vec, int32_t& len) \
97 { \
98  if (!h5file) \
99  SG_ERROR("File invalid.\n") \
100  \
101  int32_t* dims; \
102  int32_t ndims; \
103  int64_t nelements; \
104  hid_t dataset=H5Dopen2(h5file, variable_name, H5P_DEFAULT); \
105  if (dataset<0) \
106  SG_ERROR("Error opening data set\n") \
107  hid_t dtype=H5Dget_type(dataset); \
108  H5T_class_t t_class=H5Tget_class(dtype); \
109  TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t); \
110  if (h5_type==-1) \
111  { \
112  H5Dclose(dataset); \
113  SG_INFO("No compatible datatype found\n") \
114  } \
115  get_dims(dataset, dims, ndims, nelements); \
116  if (!((ndims==2 && dims[0]==nelements && dims[1]==1) || \
117  (ndims==2 && dims[0]==1 && dims[1]==nelements) || \
118  (ndims==1 && dims[0]==nelements))) \
119  SG_ERROR("Error not a 1-dimensional vector (ndims=%d, dims[0]=%d)\n", ndims, dims[0]) \
120  vec=SG_MALLOC(sg_type, nelements); \
121  len=nelements; \
122  herr_t status = H5Dread(dataset, h5_type, H5S_ALL, \
123  H5S_ALL, H5P_DEFAULT, vec); \
124  H5Dclose(dataset); \
125  H5Tclose(dtype); \
126  SG_FREE(dims); \
127  if (status<0) \
128  { \
129  SG_FREE(vec); \
130  SG_ERROR("Error reading dataset\n") \
131  } \
132 }
133 
134 GET_VECTOR(get_vector, bool, (CT_VECTOR, ST_NONE, PT_BOOL))
135 GET_VECTOR(get_vector, int8_t, (CT_VECTOR, ST_NONE, PT_INT8))
136 GET_VECTOR(get_vector, uint8_t, (CT_VECTOR, ST_NONE, PT_UINT8))
137 GET_VECTOR(get_vector, char, (CT_VECTOR, ST_NONE, PT_CHAR))
138 GET_VECTOR(get_vector, int32_t, (CT_VECTOR, ST_NONE, PT_INT32))
139 GET_VECTOR(get_vector, uint32_t, (CT_VECTOR, ST_NONE, PT_UINT32))
140 GET_VECTOR(get_vector, float32_t, (CT_VECTOR, ST_NONE, PT_FLOAT32))
141 GET_VECTOR(get_vector, float64_t, (CT_VECTOR, ST_NONE, PT_FLOAT64))
142 GET_VECTOR(get_vector, floatmax_t, (CT_VECTOR, ST_NONE, PT_FLOATMAX))
143 GET_VECTOR(get_vector, int16_t, (CT_VECTOR, ST_NONE, PT_INT16))
144 GET_VECTOR(get_vector, uint16_t, (CT_VECTOR, ST_NONE, PT_INT16))
145 GET_VECTOR(get_vector, int64_t, (CT_VECTOR, ST_NONE, PT_INT64))
146 GET_VECTOR(get_vector, uint64_t, (CT_VECTOR, ST_NONE, PT_UINT64))
147 #undef GET_VECTOR
148 
149 #define GET_MATRIX(fname, sg_type, datatype) \
150 void CMLDataHDF5File::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
151 { \
152  if (!h5file) \
153  SG_ERROR("File invalid.\n") \
154  \
155  int32_t* dims; \
156  int32_t ndims; \
157  int64_t nelements; \
158  hid_t dataset = H5Dopen2(h5file, variable_name, H5P_DEFAULT); \
159  if (dataset<0) \
160  SG_ERROR("Error opening data set\n") \
161  hid_t dtype = H5Dget_type(dataset); \
162  H5T_class_t t_class=H5Tget_class(dtype); \
163  TSGDataType t datatype; hid_t h5_type=get_compatible_type(t_class, &t); \
164  if (h5_type==-1) \
165  { \
166  H5Dclose(dataset); \
167  SG_INFO("No compatible datatype found\n") \
168  } \
169  get_dims(dataset, dims, ndims, nelements); \
170  if (ndims!=2) \
171  SG_ERROR("Error not a 2-dimensional matrix\n") \
172  matrix=SG_MALLOC(sg_type, nelements); \
173  num_feat=dims[0]; \
174  num_vec=dims[1]; \
175  herr_t status = H5Dread(dataset, h5_type, H5S_ALL, \
176  H5S_ALL, H5P_DEFAULT, matrix); \
177  H5Dclose(dataset); \
178  H5Tclose(dtype); \
179  SG_FREE(dims); \
180  if (status<0) \
181  { \
182  SG_FREE(matrix); \
183  SG_ERROR("Error reading dataset\n") \
184  } \
185 }
186 
187 GET_MATRIX(get_matrix, bool, (CT_MATRIX, ST_NONE, PT_BOOL))
188 GET_MATRIX(get_matrix, char, (CT_MATRIX, ST_NONE, PT_CHAR))
189 GET_MATRIX(get_matrix, uint8_t, (CT_MATRIX, ST_NONE, PT_UINT8))
190 GET_MATRIX(get_matrix, int32_t, (CT_MATRIX, ST_NONE, PT_INT32))
191 GET_MATRIX(get_matrix, uint32_t, (CT_MATRIX, ST_NONE, PT_INT32))
192 GET_MATRIX(get_matrix, int64_t, (CT_MATRIX, ST_NONE, PT_INT64))
193 GET_MATRIX(get_matrix, uint64_t, (CT_MATRIX, ST_NONE, PT_INT64))
194 GET_MATRIX(get_matrix, int16_t, (CT_MATRIX, ST_NONE, PT_INT16))
195 GET_MATRIX(get_matrix, uint16_t, (CT_MATRIX, ST_NONE, PT_INT16))
196 GET_MATRIX(get_matrix, float32_t, (CT_MATRIX, ST_NONE, PT_FLOAT32))
197 GET_MATRIX(get_matrix, float64_t, (CT_MATRIX, ST_NONE, PT_FLOAT64))
198 GET_MATRIX(get_matrix, floatmax_t, (CT_MATRIX, ST_NONE, PT_FLOATMAX))
199 #undef GET_MATRIX
200 
201 void CMLDataHDF5File::get_ndarray(uint8_t*& array, int32_t*& dims, int32_t& num_dims)
202 {
203 }
204 
205 void CMLDataHDF5File::get_ndarray(char*& array, int32_t*& dims, int32_t& num_dims)
206 {
207 }
208 
209 void CMLDataHDF5File::get_ndarray(int32_t*& array, int32_t*& dims, int32_t& num_dims)
210 {
211 }
212 
213 void CMLDataHDF5File::get_ndarray(float32_t*& array, int32_t*& dims, int32_t& num_dims)
214 {
215 }
216 
217 void CMLDataHDF5File::get_ndarray(float64_t*& array, int32_t*& dims, int32_t& num_dims)
218 {
219 }
220 
221 void CMLDataHDF5File::get_ndarray(int16_t*& array, int32_t*& dims, int32_t& num_dims)
222 {
223 }
224 
225 void CMLDataHDF5File::get_ndarray(uint16_t*& array, int32_t*& dims, int32_t& num_dims)
226 {
227 }
228 
229 #define GET_SPARSEMATRIX(fname, sg_type, datatype) \
230 void CMLDataHDF5File::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
231 { \
232  if (!(file)) \
233  SG_ERROR("File invalid.\n") \
234 }
235 GET_SPARSEMATRIX(get_sparse_matrix, bool, DT_SPARSE_BOOL)
236 GET_SPARSEMATRIX(get_sparse_matrix, char, DT_SPARSE_CHAR)
237 GET_SPARSEMATRIX(get_sparse_matrix, int8_t, DT_SPARSE_INT8)
238 GET_SPARSEMATRIX(get_sparse_matrix, uint8_t, DT_SPARSE_BYTE)
239 GET_SPARSEMATRIX(get_sparse_matrix, int32_t, DT_SPARSE_INT)
240 GET_SPARSEMATRIX(get_sparse_matrix, uint32_t, DT_SPARSE_UINT)
241 GET_SPARSEMATRIX(get_sparse_matrix, int64_t, DT_SPARSE_LONG)
242 GET_SPARSEMATRIX(get_sparse_matrix, uint64_t, DT_SPARSE_ULONG)
243 GET_SPARSEMATRIX(get_sparse_matrix, int16_t, DT_SPARSE_SHORT)
244 GET_SPARSEMATRIX(get_sparse_matrix, uint16_t, DT_SPARSE_WORD)
245 GET_SPARSEMATRIX(get_sparse_matrix, float32_t, DT_SPARSE_SHORTREAL)
246 GET_SPARSEMATRIX(get_sparse_matrix, float64_t, DT_SPARSE_REAL)
247 GET_SPARSEMATRIX(get_sparse_matrix, floatmax_t, DT_SPARSE_LONGREAL)
248 #undef GET_SPARSEMATRIX
249 
250 
251 #define GET_STRING_LIST(fname, sg_type, datatype) \
252 void CMLDataHDF5File::fname(SGString<sg_type>*& strings, int32_t& num_str, int32_t& max_string_len) \
253 { \
254 }
255 
256 GET_STRING_LIST(get_string_list, bool, DT_STRING_BOOL)
257 GET_STRING_LIST(get_string_list, char, DT_STRING_CHAR)
258 GET_STRING_LIST(get_string_list, int8_t, DT_STRING_INT8)
259 GET_STRING_LIST(get_string_list, uint8_t, DT_STRING_BYTE)
260 GET_STRING_LIST(get_string_list, int32_t, DT_STRING_INT)
261 GET_STRING_LIST(get_string_list, uint32_t, DT_STRING_UINT)
262 GET_STRING_LIST(get_string_list, int64_t, DT_STRING_LONG)
263 GET_STRING_LIST(get_string_list, uint64_t, DT_STRING_ULONG)
264 GET_STRING_LIST(get_string_list, int16_t, DT_STRING_SHORT)
265 GET_STRING_LIST(get_string_list, uint16_t, DT_STRING_WORD)
266 GET_STRING_LIST(get_string_list, float32_t, DT_STRING_SHORTREAL)
267 GET_STRING_LIST(get_string_list, float64_t, DT_STRING_REAL)
268 GET_STRING_LIST(get_string_list, floatmax_t, DT_STRING_LONGREAL)
269 #undef GET_STRING_LIST
270 
271 void CMLDataHDF5File::get_boolean_type()
272 {
273  boolean_type=H5T_NATIVE_UCHAR;
274  switch (sizeof(bool))
275  {
276  case 1:
277  boolean_type = H5T_NATIVE_UCHAR;
278  break;
279  case 2:
280  boolean_type = H5T_NATIVE_UINT16;
281  break;
282  case 4:
283  boolean_type = H5T_NATIVE_UINT32;
284  break;
285  case 8:
286  boolean_type = H5T_NATIVE_UINT64;
287  break;
288  default:
289  SG_ERROR("Boolean type not supported on this platform\n")
290  }
291 }
292 
293 hid_t CMLDataHDF5File::get_compatible_type(H5T_class_t t_class,
294  const TSGDataType* datatype)
295 {
296  switch (t_class)
297  {
298  case H5T_FLOAT:
299  case H5T_INTEGER:
300  switch (datatype->m_ptype)
301  {
302  case PT_BOOL: return boolean_type;
303  case PT_CHAR: return H5T_NATIVE_CHAR;
304  case PT_INT8: return H5T_NATIVE_INT8;
305  case PT_UINT8: return H5T_NATIVE_UINT8;
306  case PT_INT16: return H5T_NATIVE_INT16;
307  case PT_UINT16: return H5T_NATIVE_UINT16;
308  case PT_INT32: return H5T_NATIVE_INT32;
309  case PT_UINT32: return H5T_NATIVE_UINT32;
310  case PT_INT64: return H5T_NATIVE_INT64;
311  case PT_UINT64: return H5T_NATIVE_UINT64;
312  case PT_FLOAT32: return H5T_NATIVE_FLOAT;
313  case PT_FLOAT64: return H5T_NATIVE_DOUBLE;
314  case PT_FLOATMAX: return H5T_NATIVE_LDOUBLE;
315  case PT_COMPLEX128:
316  SG_ERROR("complex128_t not compatible with HDF5File!");
317  return -1;
318  case PT_UNDEFINED:
319  case PT_SGOBJECT:
320  SG_ERROR("Implementation error during writing "
321  "HDF5File!");
322  return -1;
323  }
324  case H5T_STRING:
325  SG_ERROR("Strings not supported")
326  return -1;
327  case H5T_VLEN:
328  SG_ERROR("Variable length containers currently not supported")
329  return -1;
330  case H5T_ARRAY:
331  SG_ERROR("Array containers currently not supported")
332  return -1;
333  default:
334  SG_ERROR("Datatype mismatchn")
335  return -1;
336  }
337 }
338 
339 void CMLDataHDF5File::get_dims(hid_t dataset, int32_t*& dims, int32_t& ndims, int64_t& total_elements)
340 {
341  hid_t dataspace = H5Dget_space(dataset);
342  if (dataspace<0)
343  SG_ERROR("Error obtaining hdf5 dataspace\n")
344 
345  ndims = H5Sget_simple_extent_ndims(dataspace);
346  total_elements=H5Sget_simple_extent_npoints(dataspace);
347  hsize_t* dims_out=SG_MALLOC(hsize_t, ndims);
348  dims=SG_MALLOC(int32_t, ndims);
349  H5Sget_simple_extent_dims(dataspace, dims_out, NULL);
350  for (int32_t i=0; i<ndims; i++)
351  dims[i]=dims_out[i];
352  SG_FREE(dims_out);
353  H5Sclose(dataspace);
354 }
355 
356 void CMLDataHDF5File::create_group_hierarchy()
357 {
358  char* vname=get_strdup(variable_name);
359  int32_t vlen=strlen(vname);
360  for (int32_t i=0; i<vlen; i++)
361  {
362  if (i!=0 && vname[i]=='/')
363  {
364  vname[i]='\0';
365  hid_t g = H5Gopen2(h5file, vname, H5P_DEFAULT);
366  if (g<0)
367  {
368  g=H5Gcreate2(h5file, vname, H5P_DEFAULT, H5P_DEFAULT,
369  H5P_DEFAULT);
370  if (g<0)
371  SG_ERROR("Error creating group '%s'\n", vname)
372  vname[i]='/';
373  }
374  H5Gclose(g);
375  }
376  }
377  SG_FREE(vname);
378 }
379 #endif // HAVE_CURL && HAVE_HDF5

SHOGUN Machine Learning Toolbox - Documentation