From: Li Peng Date: Wed, 27 Jun 2018 07:15:56 +0000 (+0800) Subject: pooling ocl kernel optimization X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1^2~608^2~6^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=145eae321efe76ebf7a9c425368b48fafefbd1da;p=platform%2Fupstream%2Fopencv.git pooling ocl kernel optimization set global size with real output size, also optimize max pooling index computation if necessary. Signed-off-by: Li Peng --- diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 6a7c9d5..50948f5 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -1446,7 +1446,7 @@ struct Net::Impl // TODO: OpenCL target support more fusion styles. if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) && (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" && - ld.layerInstance->type != "MVN")) ) + ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling")) ) continue; Ptr& currLayer = ld.layerInstance; diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index eab1dca..775a044 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -165,6 +165,7 @@ public: (type == AVE ? LIBDNN_POOLING_METHOD_AVE : LIBDNN_POOLING_METHOD_STO); config.avePoolPaddedArea = avePoolPaddedArea; + config.computeMaxIdx = computeMaxIdx; config.use_half = use_half; poolOp = Ptr >(new OCL4DNNPool(config)); } diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp index f3a26a3..e0ca5ca 100644 --- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp +++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp @@ -352,6 +352,7 @@ struct OCL4DNNPoolConfig pool_method(LIBDNN_POOLING_METHOD_MAX), global_pooling(false), avePoolPaddedArea(true), + computeMaxIdx(true), use_half(false) {} MatShape in_shape; @@ -365,6 +366,7 @@ struct OCL4DNNPoolConfig ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX; bool global_pooling; // = false; bool avePoolPaddedArea; + bool computeMaxIdx; bool use_half; }; @@ -399,6 +401,7 @@ class OCL4DNNPool int32_t pooled_height_; int32_t pooled_width_; bool avePoolPaddedArea; + bool computeMaxIdx; bool use_half; }; diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp index 81238e9..b74bf4d 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp @@ -56,6 +56,7 @@ OCL4DNNPool::OCL4DNNPool(OCL4DNNPoolConfig config) channels_ = config.channels; pool_method_ = config.pool_method; avePoolPaddedArea = config.avePoolPaddedArea; + computeMaxIdx = config.computeMaxIdx; use_half = config.use_half; for (int i = 0; i < spatial_dims; ++i) @@ -97,7 +98,7 @@ bool OCL4DNNPool::Forward(const UMat& bottom, UMat& top_mask) { bool ret = true; - size_t global[] = { 128 * 128 }; + size_t global[] = { (size_t)count_ }; size_t local[] = { 128 }; // support 2D case @@ -105,8 +106,7 @@ bool OCL4DNNPool::Forward(const UMat& bottom, { case LIBDNN_POOLING_METHOD_MAX: { - bool haveMask = !top_mask.empty(); - String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward"; + String kname = computeMaxIdx ? "max_pool_forward_mask" : "max_pool_forward"; kname += (use_half) ? "_half" : "_float"; ocl::Kernel oclk_max_pool_forward( kname.c_str(), @@ -118,7 +118,7 @@ bool OCL4DNNPool::Forward(const UMat& bottom, kernel_w_, kernel_h_, stride_w_, stride_h_, pad_w_, pad_h_, - haveMask ? " -D HAVE_MASK=1" : "" + computeMaxIdx ? " -D HAVE_MASK=1" : "" )); if (oclk_max_pool_forward.empty()) diff --git a/modules/dnn/src/opencl/ocl4dnn_pooling.cl b/modules/dnn/src/opencl/ocl4dnn_pooling.cl index e9d1d26..501f5a5 100644 --- a/modules/dnn/src/opencl/ocl4dnn_pooling.cl +++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl @@ -65,36 +65,40 @@ __kernel void #endif ) { - for (int index = get_global_id(0); index < nthreads; - index += get_global_size(0)) + int index = get_global_id(0); + if (index >= nthreads) + return; + + const int pw = index % pooled_width; + const int xx = index / pooled_width; + const int ph = xx % pooled_height; + const int ch = xx / pooled_height; + int hstart = ph * STRIDE_H - PAD_H; + int wstart = pw * STRIDE_W - PAD_W; + Dtype maxval = -FLT_MAX; + int maxidx = -1; + int in_offset = ch * height * width; + for (int h = 0; h < KERNEL_H; ++h) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * STRIDE_H - PAD_H; - int wstart = pw * STRIDE_W - PAD_W; - const int hend = min(hstart + KERNEL_H, height); - const int wend = min(wstart + KERNEL_W, width); - hstart = max(hstart, (int)0); - wstart = max(wstart, (int)0); - Dtype maxval = -FLT_MAX; - int maxidx = -1; - __global const Dtype* bottom_slice = bottom_data - + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (bottom_slice[h * width + w] > maxval) { - maxidx = h * width + w; - maxval = bottom_slice[maxidx]; + int off_y = hstart + h; + if (off_y >= 0 && off_y < height) + { + for (int w = 0; w < KERNEL_W; ++w) + { + int off_x = wstart + w; + if (off_x >= 0 && off_x < width) + { + Dtype val = bottom_data[in_offset + off_y * width + off_x]; + maxidx = (val > maxval) ? (off_y * width + off_x) : maxidx; + maxval = fmax(val, maxval); } } } - top_data[index] = maxval; + } + top_data[index] = maxval; #ifdef HAVE_MASK - mask[index] = maxidx; + mask[index] = maxidx; #endif - } } #elif defined KERNEL_AVE_POOL @@ -105,43 +109,42 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)( const int pooled_height, const int pooled_width, __global Dtype* top_data) { - for (int index = get_global_id(0); index < nthreads; - index += get_global_size(0)) - { - { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * STRIDE_H - PAD_H; - int wstart = pw * STRIDE_W - PAD_W; - int hend = min(hstart + KERNEL_H, height + PAD_H); - int wend = min(wstart + KERNEL_W, width + PAD_W); - int pool_size; + int index = get_global_id(0); + if (index >= nthreads) + return; + + const int pw = index % pooled_width; + const int xx = index / pooled_width; + const int ph = xx % pooled_height; + const int ch = xx / pooled_height; + int hstart = ph * STRIDE_H - PAD_H; + int wstart = pw * STRIDE_W - PAD_W; + int hend = min(hstart + KERNEL_H, height + PAD_H); + int wend = min(wstart + KERNEL_W, width + PAD_W); + int pool_size; #ifdef AVE_POOL_PADDING_AREA - pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, (int)0); - wstart = max(wstart, (int)0); - hend = min(hend, height); - wend = min(wend, width); + pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, (int)0); + wstart = max(wstart, (int)0); + hend = min(hend, height); + wend = min(wend, width); #else - hstart = max(hstart, (int)0); - wstart = max(wstart, (int)0); - hend = min(hend, height); - wend = min(wend, width); - pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, (int)0); + wstart = max(wstart, (int)0); + hend = min(hend, height); + wend = min(wend, width); + pool_size = (hend - hstart) * (wend - wstart); #endif - Dtype aveval = 0; - __global const Dtype* bottom_slice = bottom_data - + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - aveval += bottom_slice[h * width + w]; - } - } - top_data[index] = aveval / pool_size; + Dtype aveval = 0; + int in_offset = ch * height * width; + for (int h = hstart; h < hend; ++h) + { + for (int w = wstart; w < wend; ++w) + { + aveval += bottom_data[in_offset + h * width + w]; } } + top_data[index] = aveval / pool_size; } #elif defined KERNEL_STO_POOL