SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
Convolve.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2014 Khaled Nasr
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  */
30 
31 #ifndef CONVOLVE_H_
32 #define CONVOLVE_H_
33 
34 #include <shogun/lib/config.h>
35 #include <shogun/lib/SGMatrix.h>
37 
38 #include <shogun/io/SGIO.h>
39 
41 
42 #ifdef HAVE_VIENNACL
43 #include <shogun/lib/GPUMatrix.h>
45 #endif // HAVE_VIENNACL
46 
47 namespace shogun
48 {
49 
50 namespace linalg
51 {
52 
53 namespace implementation
54 {
55 
59 template <enum Backend, class Matrix>
60 struct convolve
61 {
63  typedef typename Matrix::Scalar T;
64 
81  static void compute(Matrix X, Matrix W, Matrix Y, bool flip ,
82  bool overwrite, int32_t stride_x, int32_t stride_y);
83 };
84 
85 
87 template <class Matrix>
88 struct convolve<Backend::EIGEN3, Matrix>
89 {
91  typedef typename Matrix::Scalar T;
92 
95 
98 
112  static void compute(SGMatrix<T> X, SGMatrix<T> W, SGMatrix<T> Y, bool flip ,
113  bool overwrite, int32_t stride_x, int32_t stride_y)
114  {
115  int32_t width = X.num_cols;
116  int32_t height = X.num_rows;
117 
118  int32_t kx = W.num_cols;
119  int32_t ky = W.num_rows;
120 
121  int32_t rx = (kx-1)/2;
122  int32_t ry = (ky-1)/2;
123 
124  for (int32_t x=0; x<width; x+=stride_x)
125  {
126  int32_t xout = x/stride_x;
127 
128  for (int32_t y=0; y<height; y+=stride_y)
129  {
130  int32_t yout = y/stride_y;
131 
132  T sum = overwrite ? 0 : Y(yout,xout);
133  for (int32_t x1=x-rx; x1<=x+rx; x1++)
134  {
135  int32_t wx = flip ? x1-x+rx : rx-x1+x;
136  for (int32_t y1=y-ry; y1<=y+ry; y1++)
137  {
138  if (x1>=0 && y1>=0 && x1<width && y1<height)
139  {
140  if (flip)
141  sum += W(y1-y+ry,wx)*X(y1,x1);
142  else
143  sum += W(ry-y1+y,wx)*X(y1,x1);
144  }
145  }
146  }
147  Y(yout,xout) = sum;
148  }
149  }
150  }
151 };
152 
153 #ifdef HAVE_VIENNACL
154 
156 template <class Matrix>
157 struct convolve<Backend::VIENNACL, Matrix>
158 {
160  typedef typename Matrix::Scalar T;
161 
163  template <class T>
164  static viennacl::ocl::kernel& generate_kernel_unity_stride(
165  int32_t radius_x, int32_t radius_y, bool flip, bool overwrite)
166  {
167  std::string kernel_name =
168  "convolve_unity_stride_" + ocl::get_type_string<T>() + "_" +
169  std::to_string(radius_x) + "_" + std::to_string(radius_y);
170 
171  if (flip) kernel_name.append("_flip");
172  if (overwrite) kernel_name.append("_overwrite");
173 
174  if (ocl::kernel_exists(kernel_name))
175  return ocl::get_kernel(kernel_name);
176 
177  std::string source = ocl::generate_kernel_preamble<T>(kernel_name);
178 
179  if (flip) source.append("#define FLIP\n");
180  if (overwrite) source.append("#define OVERWRITE\n");
181 
182  source.append("#define RADIUS_X " + std::to_string(radius_x) + "\n");
183  source.append("#define RADIUS_Y " + std::to_string(radius_y) + "\n");
184 
185  source.append(
186  R"(
187  #define W_WIDTH (2*RADIUS_X+1)
188  #define W_HEIGHT (2*RADIUS_Y+1)
189 
190  #define X_LOCAL_WIDTH (WORK_GROUP_SIZE_2D+2*RADIUS_X)
191  #define X_LOCAL_HEIGHT (WORK_GROUP_SIZE_2D+2*RADIUS_Y)
192 
193  inline DATATYPE readX(read_only __global DATATYPE* X, int x, int y,
194  int X_width, int X_height, int X_offset)
195  {
196  if (x>=0 && y>=0 && x<X_width && y<X_height)
197  return X[y + x*X_height + X_offset];
198  else
199  return 0;
200  }
201 
202  __kernel void KERNEL_NAME(
203  read_only __global DATATYPE* X, int X_width, int X_height, int X_offset,
204  __constant DATATYPE* W, int W_offset,
205  __global DATATYPE* Y, int Y_offset)
206  {
207  __local DATATYPE X_local[X_LOCAL_WIDTH][X_LOCAL_HEIGHT];
208 
209  int x = get_global_id(0);
210  int y = get_global_id(1);
211 
212  int xl = get_local_id(0);
213  int yl = get_local_id(1);
214 
215  if (xl==WORK_GROUP_SIZE_2D-1 && yl == WORK_GROUP_SIZE_2D-1)
216  {
217  for (int rx=0; rx<=2*RADIUS_X; rx++)
218  for (int ry=0; ry<=2*RADIUS_Y; ry++)
219  X_local[xl+rx][yl+ry] = readX(X, x-RADIUS_X+rx, y-RADIUS_Y+ry, X_width, X_height, X_offset);
220  }
221  else if (xl==WORK_GROUP_SIZE_2D-1)
222  {
223  for (int rx=0; rx<=2*RADIUS_X; rx++)
224  X_local[xl+rx][yl] = readX(X, x-RADIUS_X+rx, y-RADIUS_Y, X_width, X_height, X_offset);
225  }
226  else if (yl == WORK_GROUP_SIZE_2D-1)
227  {
228  for (int ry=0; ry<=2*RADIUS_Y; ry++)
229  X_local[xl][yl+ry] = readX(X, x-RADIUS_X, y-RADIUS_Y+ry, X_width, X_height, X_offset);
230  }
231  else
232  X_local[xl][yl] = readX(X, x-RADIUS_X, y-RADIUS_Y, X_width, X_height, X_offset);
233 
234  barrier(CLK_LOCAL_MEM_FENCE);
235 
236  if (x>=X_width || y>=X_height)
237  return;
238 
239  DATATYPE sum = 0;
240  for (int x1=0; x1<W_WIDTH; x1++)
241  {
242  #ifdef FLIP
243  int wx = x1*W_HEIGHT+W_offset;
244  #else
245  int wx = (2*RADIUS_X-x1)*W_HEIGHT+W_offset;
246  #endif
247  int inx = x1+xl;
248  for (int y1=0; y1<W_HEIGHT; y1++)
249  {
250  int iny = y1+yl;
251  #ifdef FLIP
252  sum += W[y1+wx]*X_local[inx][iny];
253  #else
254  sum += W[2*RADIUS_Y-y1+wx]*X_local[inx][iny];
255  #endif
256  }
257  }
258  #ifdef OVERWRITE
259  Y[y+X_height*x + Y_offset] = sum;
260  #else
261  Y[y+X_height*x + Y_offset] += sum;
262  #endif
263  }
264  )"
265  );
266 
267  viennacl::ocl::kernel& kernel = ocl::compile_kernel(kernel_name, source);
268 
269  kernel.local_work_size(0, OCL_WORK_GROUP_SIZE_2D);
270  kernel.local_work_size(1, OCL_WORK_GROUP_SIZE_2D);
271 
272  return kernel;
273  }
274 
276  template <class T>
277  static viennacl::ocl::kernel& generate_kernel_arbitrary_stride(
278  int32_t radius_x, int32_t radius_y, bool flip, bool overwrite)
279  {
280  std::string kernel_name =
281  "convolve_arbitrary_stride_" + ocl::get_type_string<T>() + "_" +
282  std::to_string(radius_x) + "_" + std::to_string(radius_y);
283 
284  if (flip) kernel_name.append("_flip");
285  if (overwrite) kernel_name.append("_overwrite");
286 
287  if (ocl::kernel_exists(kernel_name))
288  return ocl::get_kernel(kernel_name);
289 
290  std::string source = ocl::generate_kernel_preamble<T>(kernel_name);
291 
292  if (flip) source.append("#define FLIP\n");
293  if (overwrite) source.append("#define OVERWRITE\n");
294 
295  source.append("#define RADIUS_X " + std::to_string(radius_x) + "\n");
296  source.append("#define RADIUS_Y " + std::to_string(radius_y) + "\n");
297 
298  source.append(
299  R"(
300  #define W_WIDTH (2*RADIUS_X+1)
301  #define W_HEIGHT (2*RADIUS_Y+1)
302 
303  #define X_LOCAL_WIDTH (WORK_GROUP_SIZE_2D+2*RADIUS_X)
304  #define X_LOCAL_HEIGHT (WORK_GROUP_SIZE_2D+2*RADIUS_Y)
305 
306  __kernel void KERNEL_NAME(
307  read_only __global DATATYPE* X, int X_width, int X_height, int X_offset,
308  __constant DATATYPE* W, int W_offset,
309  __global DATATYPE* Y, int Y_offset,
310  int stride_x, int stride_y)
311  {
312  __local DATATYPE X_local[WORK_GROUP_SIZE_2D][WORK_GROUP_SIZE_2D];
313 
314  int x = get_global_id(0)*stride_x;
315  int y = get_global_id(1)*stride_y;
316 
317  int Y_width = X_width/stride_x;
318  int Y_height = X_height/stride_y;
319 
320  if (get_global_id(0)>=Y_width || get_global_id(1)>=Y_height)
321  return;
322 
323  DATATYPE sum = 0;
324  for (int x1=0; x1<W_WIDTH; x1++)
325  {
326  #ifdef FLIP
327  int wx = x1*W_HEIGHT+W_offset;
328  #else
329  int wx = (2*RADIUS_X-x1)*W_HEIGHT+W_offset;
330  #endif
331  int inx = x1+x-RADIUS_X;
332  for (int y1=0; y1<W_HEIGHT; y1++)
333  {
334  int iny = y1+y-RADIUS_Y;
335  if (inx>=0 && iny>=0 && inx<X_width && iny<X_height)
336  {
337  #ifdef FLIP
338  sum += W[y1+wx]*X[iny+inx*X_height+X_offset];
339  #else
340  sum += W[2*RADIUS_Y-y1+wx]*X[iny+inx*X_height+X_offset];
341  #endif
342  }
343  }
344  }
345  #ifdef OVERWRITE
346  Y[get_global_id(1)+Y_height*get_global_id(0) + Y_offset] = sum;
347  #else
348  Y[get_global_id(1)+Y_height*get_global_id(0) + Y_offset] += sum;
349  #endif
350  }
351  )"
352  );
353 
354  viennacl::ocl::kernel& kernel = ocl::compile_kernel(kernel_name, source);
355 
356  kernel.local_work_size(0, OCL_WORK_GROUP_SIZE_2D);
357  kernel.local_work_size(1, OCL_WORK_GROUP_SIZE_2D);
358 
359  return kernel;
360  }
361 
378  static void compute(CGPUMatrix<T> X, CGPUMatrix<T> W, CGPUMatrix<T> Y, bool flip ,
379  bool overwrite, int32_t stride_x, int32_t stride_y)
380  {
381  if (stride_x==1 && stride_y==1)
382  {
383  viennacl::ocl::kernel& kernel = generate_kernel_unity_stride<T>(
384  (W.num_cols-1)/2, (W.num_rows-1)/2, flip, overwrite);
385 
386  kernel.global_work_size(0, ocl::align_to_multiple_2d(Y.num_cols));
387  kernel.global_work_size(1, ocl::align_to_multiple_2d(Y.num_rows));
388 
389  viennacl::ocl::enqueue(kernel(
390  X.vcl_matrix(), cl_int(X.num_cols), cl_int(X.num_rows), cl_int(X.offset),
391  W.vcl_matrix(), cl_int(W.offset),
392  Y.vcl_matrix(), cl_int(Y.offset)));
393  }
394  else
395  {
396  viennacl::ocl::kernel& kernel = generate_kernel_arbitrary_stride<T>(
397  (W.num_cols-1)/2, (W.num_rows-1)/2, flip, overwrite);
398 
399  kernel.global_work_size(0, ocl::align_to_multiple_2d(Y.num_cols));
400  kernel.global_work_size(1, ocl::align_to_multiple_2d(Y.num_rows));
401 
402  viennacl::ocl::enqueue(kernel(
403  X.vcl_matrix(), cl_int(X.num_cols), cl_int(X.num_rows), cl_int(X.offset),
404  W.vcl_matrix(), cl_int(W.offset),
405  Y.vcl_matrix(), cl_int(Y.offset),
406  cl_int(stride_x), cl_int(stride_y)));
407  }
408  }
409 };
410 
411 #endif // HAVE_VIENNACL
412 
413 }
414 
415 }
416 
417 }
418 #endif // CONVOLVE_H_
Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic > MatrixXt
Definition: Convolve.h:94
static void compute(SGMatrix< T > X, SGMatrix< T > W, SGMatrix< T > Y, bool flip, bool overwrite, int32_t stride_x, int32_t stride_y)
Definition: Convolve.h:112
index_t num_cols
Definition: SGMatrix.h:376
index_t num_rows
Definition: SGMatrix.h:374
Generic class sum which provides a static compute method. This class is specialized for different typ...
Definition: Sum.h:69
shogun matrix
static void compute(Matrix X, Matrix W, Matrix Y, bool flip, bool overwrite, int32_t stride_x, int32_t stride_y)
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18

SHOGUN Machine Learning Toolbox - Documentation