From: Alexander Alekhin Date: Thu, 23 Nov 2017 20:10:53 +0000 (+0000) Subject: dnn(ocl4dnn): refactor pooling OpenCL calls X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~375^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e34b64c979185bb76528719fb07530377d0479df;p=platform%2Fupstream%2Fopencv.git dnn(ocl4dnn): refactor pooling OpenCL calls --- diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp index 4a62546..9e202b4 100644 --- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp +++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp @@ -351,8 +351,6 @@ class OCL4DNNPool UMat& top_data, UMat& top_mask); private: - UMat mask_idx_; - // Pooling parameters std::vector pad_; std::vector stride_; diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp index fe8b84b..13434d9 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp @@ -88,7 +88,7 @@ OCL4DNNPool::OCL4DNNPool(OCL4DNNPoolConfig config) template OCL4DNNPool::~OCL4DNNPool() { - mask_idx_.release(); + // nothing } template @@ -99,99 +99,103 @@ bool OCL4DNNPool::Forward(const UMat& bottom, bool ret = true; size_t global[] = { 128 * 128 }; size_t local[] = { 128 }; - cl_uint argIdx = 0; // support 2D case switch (pool_method_) { case LIBDNN_POOLING_METHOD_MAX: { - if (top_mask.empty() && mask_idx_.empty()) - { - mask_idx_.create(1, count_, CV_32FC1); - } - ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"), - cv::ocl::dnn::ocl4dnn_pooling_oclsrc); + bool haveMask = !top_mask.empty(); + ocl::Kernel oclk_max_pool_forward( + haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"), + ocl::dnn::ocl4dnn_pooling_oclsrc, + format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" + " -D STRIDE_W=%d -D STRIDE_H=%d" + " -D PAD_W=%d -D PAD_H=%d%s", + kernel_w_, kernel_h_, + stride_w_, stride_h_, + pad_w_, pad_h_, + haveMask ? " -D HAVE_MASK=1" : "" + )); if (oclk_max_pool_forward.empty()) return false; - argIdx = 0; - oclk_max_pool_forward.set(argIdx++, count_); - oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); - oclk_max_pool_forward.set(argIdx++, batch_size_); - oclk_max_pool_forward.set(argIdx++, channels_); - oclk_max_pool_forward.set(argIdx++, height_); - oclk_max_pool_forward.set(argIdx++, width_); - oclk_max_pool_forward.set(argIdx++, pooled_height_); - oclk_max_pool_forward.set(argIdx++, pooled_width_); - oclk_max_pool_forward.set(argIdx++, kernel_h_); - oclk_max_pool_forward.set(argIdx++, kernel_w_); - oclk_max_pool_forward.set(argIdx++, stride_h_); - oclk_max_pool_forward.set(argIdx++, stride_w_); - oclk_max_pool_forward.set(argIdx++, pad_h_); - oclk_max_pool_forward.set(argIdx++, pad_w_); - oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); - oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1); - if (mask_idx_.empty()) - oclk_max_pool_forward.set(argIdx++, (void *)NULL); - else - oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_)); - oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask)); + oclk_max_pool_forward.args( + count_, + ocl::KernelArg::PtrReadOnly(bottom), + batch_size_, + channels_, + height_, + width_, + pooled_height_, + pooled_width_, + ocl::KernelArg::PtrWriteOnly(top), + ocl::KernelArg::PtrWriteOnly(top_mask) + ); ret = oclk_max_pool_forward.run(1, global, local, false); } break; case LIBDNN_POOLING_METHOD_AVE: { + CV_Assert(top_mask.empty()); + ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"), - cv::ocl::dnn::ocl4dnn_pooling_oclsrc); + ocl::dnn::ocl4dnn_pooling_oclsrc, + format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" + " -D STRIDE_W=%d -D STRIDE_H=%d" + " -D PAD_W=%d -D PAD_H=%d", + kernel_w_, kernel_h_, + stride_w_, stride_h_, + pad_w_, pad_h_ + )); if (oclk_ave_pool_forward.empty()) return false; - argIdx = 0; - oclk_ave_pool_forward.set(argIdx++, count_); - oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); - oclk_ave_pool_forward.set(argIdx++, batch_size_); - oclk_ave_pool_forward.set(argIdx++, channels_); - oclk_ave_pool_forward.set(argIdx++, height_); - oclk_ave_pool_forward.set(argIdx++, width_); - oclk_ave_pool_forward.set(argIdx++, pooled_height_); - oclk_ave_pool_forward.set(argIdx++, pooled_width_); - oclk_ave_pool_forward.set(argIdx++, kernel_h_); - oclk_ave_pool_forward.set(argIdx++, kernel_w_); - oclk_ave_pool_forward.set(argIdx++, stride_h_); - oclk_ave_pool_forward.set(argIdx++, stride_w_); - oclk_ave_pool_forward.set(argIdx++, pad_h_); - oclk_ave_pool_forward.set(argIdx++, pad_w_); - oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + oclk_ave_pool_forward.args( + count_, + ocl::KernelArg::PtrReadOnly(bottom), + batch_size_, + channels_, + height_, + width_, + pooled_height_, + pooled_width_, + ocl::KernelArg::PtrWriteOnly(top) + ); ret = oclk_ave_pool_forward.run(1, global, local, false); } break; case LIBDNN_POOLING_METHOD_STO: { + CV_Assert(top_mask.empty()); + ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"), - cv::ocl::dnn::ocl4dnn_pooling_oclsrc); + ocl::dnn::ocl4dnn_pooling_oclsrc, + format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" + " -D STRIDE_W=%d -D STRIDE_H=%d", + kernel_w_, kernel_h_, + stride_w_, stride_h_ + )); + if (oclk_sto_pool_forward.empty()) return false; - argIdx = 0; - oclk_sto_pool_forward.set(argIdx++, count_); - oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); - oclk_sto_pool_forward.set(argIdx++, batch_size_); - oclk_sto_pool_forward.set(argIdx++, channels_); - oclk_sto_pool_forward.set(argIdx++, height_); - oclk_sto_pool_forward.set(argIdx++, width_); - oclk_sto_pool_forward.set(argIdx++, pooled_height_); - oclk_sto_pool_forward.set(argIdx++, pooled_width_); - oclk_sto_pool_forward.set(argIdx++, kernel_h_); - oclk_sto_pool_forward.set(argIdx++, kernel_w_); - oclk_sto_pool_forward.set(argIdx++, stride_h_); - oclk_sto_pool_forward.set(argIdx++, stride_w_); - oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + oclk_sto_pool_forward.args( + count_, + ocl::KernelArg::PtrReadOnly(bottom), + batch_size_, + channels_, + height_, + width_, + pooled_height_, + pooled_width_, + ocl::KernelArg::PtrWriteOnly(top) + ); ret = oclk_sto_pool_forward.run(1, global, local, false); } diff --git a/modules/dnn/src/opencl/ocl4dnn_pooling.cl b/modules/dnn/src/opencl/ocl4dnn_pooling.cl index 326d5bc..218b6b4 100644 --- a/modules/dnn/src/opencl/ocl4dnn_pooling.cl +++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl @@ -44,14 +44,23 @@ #define TEMPLATE(name,type) CONCAT(name,type) #define Dtype float -void TEMPLATE(max_pool_forward_impl, Dtype)( +#if defined KERNEL_MAX_POOL + +__kernel void +#ifdef HAVE_MASK + TEMPLATE(max_pool_forward_mask, Dtype) +#else + TEMPLATE(max_pool_forward, Dtype) +#endif +( const int nthreads, __global const Dtype* bottom_data, const int num, const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, - __global Dtype* top_data, - const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask) + const int pooled_height, const int pooled_width, + __global Dtype* top_data +#ifdef HAVE_MASK + , __global Dtype* mask +#endif +) { for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) @@ -60,10 +69,10 @@ void TEMPLATE(max_pool_forward_impl, Dtype)( const int ph = (index / pooled_width) % pooled_height; const int c = (index / pooled_width / pooled_height) % channels; const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - const int hend = min(hstart + kernel_h, height); - const int wend = min(wstart + kernel_w, width); + int hstart = ph * STRIDE_H - PAD_H; + int wstart = pw * STRIDE_W - PAD_W; + const int hend = min(hstart + KERNEL_H, height); + const int wend = min(wstart + KERNEL_W, width); hstart = max(hstart, (int)0); wstart = max(wstart, (int)0); Dtype maxval = -FLT_MAX; @@ -79,38 +88,19 @@ void TEMPLATE(max_pool_forward_impl, Dtype)( } } top_data[index] = maxval; - if (!no_mask) { - if (use_mask == 1) { - mask[index] = maxidx; - } else { - top_mask[index] = maxidx; - } - } +#ifdef HAVE_MASK + mask[index] = maxidx; +#endif } } -__kernel void TEMPLATE(max_pool_forward, Dtype)( - const int nthreads, __global const Dtype* bottom_data, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, - __global Dtype* top_data, - const int use_mask, __global int* mask, __global Dtype* top_mask) -{ - TEMPLATE(max_pool_forward_impl, Dtype)( - nthreads, bottom_data, num, channels, height, width, - pooled_height, pooled_width, kernel_h, - kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false - ); -} +#elif defined KERNEL_AVE_POOL __kernel void TEMPLATE(ave_pool_forward, Dtype)( const int nthreads, __global const Dtype* const bottom_data, const int num, const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, __global Dtype* top_data) + const int pooled_height, const int pooled_width, + __global Dtype* top_data) { for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) @@ -120,10 +110,10 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)( const int ph = (index / pooled_width) % pooled_height; const int c = (index / pooled_width / pooled_height) % channels; const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); + int hstart = ph * STRIDE_H - PAD_H; + int wstart = pw * STRIDE_W - PAD_W; + int hend = min(hstart + KERNEL_H, height + PAD_H); + int wend = min(wstart + KERNEL_W, width + PAD_W); const int pool_size = (hend - hstart) * (wend - wstart); hstart = max(hstart, (int)0); wstart = max(wstart, (int)0); @@ -142,11 +132,12 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)( } } +#elif defined KERNEL_STO_POOL + __kernel void TEMPLATE(sto_pool_forward_test,Dtype)( const int nthreads, __global const Dtype* const bottom_data, const int num, const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, + const int pooled_height, const int pooled_width, __global Dtype* top_data) { for (int index = get_global_id(0); index < nthreads; @@ -156,10 +147,10 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)( const int ph = (index / pooled_width) % pooled_height; const int c = (index / pooled_width / pooled_height) % channels; const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); + const int hstart = ph * STRIDE_H; + const int hend = min(hstart + KERNEL_H, height); + const int wstart = pw * STRIDE_W; + const int wend = min(wstart + KERNEL_W, width); // We set cumsum to be 0 to avoid divide-by-zero problems Dtype cumsum = FLT_MIN; Dtype cumvalues = 0.; @@ -168,10 +159,13 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)( // First pass: get sum for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; + Dtype v = bottom_slice[h * width + w]; + cumsum += v; + cumvalues += v * v; } } top_data[index] = cumvalues / cumsum; } } + +#endif // KERNEL_*