// Borrowed from Theano
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
template <typename Dtype>
-__global__ void im3d2col_kernel(const int64_t n, const Dtype* data_im,
- const int64_t height, const int64_t width, const int64_t depth,
- const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d,
- const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
- const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
- const int64_t height_col, const int64_t width_col, const int64_t depth_col,
- Dtype* data_col)
+__global__ void __launch_bounds__(CUDA_NUM_THREADS) // ensure that at least 1 block can be resident
+im3d2col_kernel(const int64_t n, const Dtype* data_im,
+ const int64_t height, const int64_t width, const int64_t depth,
+ const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d,
+ const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
+ const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
+ const int64_t height_col, const int64_t width_col, const int64_t depth_col,
+ Dtype* data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
}
template <typename Dtype, typename Acctype>
-__global__ void col2im3d_kernel(const int64_t n, const Dtype* data_col,
- const int64_t height, const int64_t width, const int64_t depth,
- const int64_t channels,
- const int64_t patch_h, const int64_t patch_w, const int64_t patch_d,
- const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
- const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
- const int64_t height_col, const int64_t width_col, const int64_t depth_col,
- Dtype* data_im)
+__global__ void __launch_bounds__(CUDA_NUM_THREADS) // ensure that at least 1 block can be resident
+col2im3d_kernel(const int64_t n, const Dtype* data_col,
+ const int64_t height, const int64_t width, const int64_t depth,
+ const int64_t channels,
+ const int64_t patch_h, const int64_t patch_w, const int64_t patch_d,
+ const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
+ const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
+ const int64_t height_col, const int64_t width_col, const int64_t depth_col,
+ Dtype* data_im)
{
CUDA_KERNEL_LOOP(index, n)
{