From 396921dd239a05738308f07426c81a312e83f28c Mon Sep 17 00:00:00 2001 From: Li Peng Date: Wed, 30 Nov 2016 17:06:05 +0800 Subject: [PATCH] 5x5 gaussian blur optimization Add new 5x5 gaussian blur kernel for CV_8UC1 format, it is 50% ~ 70% faster than current ocl kernel in the perf test. Signed-off-by: Li Peng --- modules/imgproc/src/opencl/gaussianBlur5x5.cl | 198 ++++++++++++++++++++++++++ modules/imgproc/src/smooth.cpp | 32 +++-- modules/imgproc/test/ocl/test_filters.cpp | 23 +-- 3 files changed, 237 insertions(+), 16 deletions(-) create mode 100644 modules/imgproc/src/opencl/gaussianBlur5x5.cl diff --git a/modules/imgproc/src/opencl/gaussianBlur5x5.cl b/modules/imgproc/src/opencl/gaussianBlur5x5.cl new file mode 100644 index 0000000..dc0b15f --- /dev/null +++ b/modules/imgproc/src/opencl/gaussianBlur5x5.cl @@ -0,0 +1,198 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#define DIG(a) a, +__constant float kx[] = { KERNEL_MATRIX_X }; +__constant float ky[] = { KERNEL_MATRIX_Y }; + +#define OP(y, x) (convert_float4(arr[y * 5 + x]) * ky[y] * kx[x]) + +#define FILL_ARR(s1, s2, n, e1, e2) \ + arr[5 * n + 0] = row_s ? (uchar4)(s1, s2, line[n].s23) : (uchar4)(line[n].s0123); \ + arr[5 * n + 1] = row_s ? (uchar4)(s2, line[n].s234) : (uchar4)(line[n].s1234); \ + arr[5 * n + 2] = (uchar4)(line[n].s2345); \ + arr[5 * n + 3] = row_e ? (uchar4)(line[n].s345, e1) : (uchar4)(line[n].s3456); \ + arr[5 * n + 4] = row_e ? (uchar4)(line[n].s45, e1, e2) : (uchar4)(line[n].s4567); + +__kernel void gaussianBlur5x5_8UC1_cols4(__global const uchar* src, int src_step, + __global uint* dst, int dst_step, int rows, int cols) +{ + int x = get_global_id(0) * 4; + int y = get_global_id(1); + + if (x >= cols || y >= rows) return; + + uchar8 line[5]; + int offset, src_index; + + src_index = x + (y - 2) * src_step - 2; + offset = max(0, src_index + 2 * src_step); + line[2] = vload8(0, src + offset); + if (offset == 0) line[2] = (uchar8)(0, 0, line[2].s0123, line[2].s45); + +#if defined BORDER_CONSTANT || defined BORDER_REPLICATE + uchar8 tmp; +#ifdef BORDER_CONSTANT + tmp = (uchar8)0; +#elif defined BORDER_REPLICATE + tmp = line[2]; +#endif + line[0] = line[1] = tmp; + if (y > 1) + { + offset = max(0, src_index); + line[0] = vload8(0, src + offset); + if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45); + } + + if (y > 0) + { + offset = max(0, src_index + src_step); + line[1] = vload8(0, src + offset); + if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45); + } + + line[3] = (y == (rows - 1)) ? tmp : vload8(0, src + src_index + 3 * src_step); + line[4] = (y >= (rows - 2)) ? tmp : vload8(0, src + src_index + 4 * src_step); +#elif BORDER_REFLECT + int t; + t = (y <= 1) ? (abs(y - 1) - y + 2) : 0; + offset = max(0, src_index + t * src_step); + line[0] = vload8(0, src + offset); + if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45); + + if (y == 0) + line[1] = line[2]; + else + { + offset = max(0, src_index + 1 * src_step); + line[1] = vload8(0, src + offset); + if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[0].s45); + } + + line[3] = (y == (rows - 1)) ? line[2] : vload8(0, src + src_index + 3 * src_step); + + t = (y >= (rows - 2)) ? (abs(y - (rows - 1)) - (y - (rows - 2)) + 2) : 4; + line[4] = vload8(0, src + src_index + t * src_step); +#elif BORDER_REFLECT_101 + if (y == 1) + line[0] = line[2]; + else + { + offset = (y == 0) ? (src_index + 4 * src_step) : max(0, src_index); + line[0] = vload8(0, src + offset); + if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45); + } + + offset = (y == 0) ? (src_index + 3 * src_step) : max(0, src_index + 1 * src_step); + line[1] = vload8(0, src + offset); + if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45); + + line[3] = vload8(0, src + src_index + ((y == (rows - 1)) ? 1 : 3) * src_step); + if (y == (rows - 2)) + line[4] = line[2]; + else + { + line[4] = vload8(0, src + src_index + ((y == (rows - 1)) ? 1 : 4) * src_step); + } +#endif + + bool row_s = (x == 0); + bool row_e = ((x + 4) == cols); + uchar4 arr[25]; + uchar s, e; + +#ifdef BORDER_CONSTANT + s = e = 0; + + FILL_ARR(s, s, 0, e, e); + FILL_ARR(s, s, 1, e, e); + FILL_ARR(s, s, 2, e, e); + FILL_ARR(s, s, 3, e, e); + FILL_ARR(s, s, 4, e, e); +#elif defined BORDER_REPLICATE + s = line[0].s2; + e = line[0].s5; + FILL_ARR(s, s, 0, e, e); + + s = line[1].s2; + e = line[1].s5; + FILL_ARR(s, s, 1, e, e); + + s = line[2].s2; + e = line[2].s5; + FILL_ARR(s, s, 2, e, e); + + s = line[3].s2; + e = line[3].s5; + FILL_ARR(s, s, 3, e, e); + + s = line[4].s2; + e = line[4].s5; + FILL_ARR(s, s, 4, e, e); +#elif BORDER_REFLECT + uchar s1, s2; + uchar e1, e2; + + s1 = line[0].s3; + s2 = line[0].s2; + e1 = line[0].s5; + e2 = line[0].s4; + FILL_ARR(s1, s2, 0, e1, e2); + + s1 = line[1].s3; + s2 = line[1].s2; + e1 = line[1].s5; + e2 = line[1].s4; + FILL_ARR(s1, s2, 1, e1, e2); + + s1 = line[2].s3; + s2 = line[2].s2; + e1 = line[2].s5; + e2 = line[2].s4; + FILL_ARR(s1, s2, 2, e1, e2); + + s1 = line[3].s3; + s2 = line[3].s2; + e1 = line[3].s5; + e2 = line[3].s4; + FILL_ARR(s1, s2, 3, e1, e2); + + s1 = line[4].s3; + s2 = line[4].s2; + e1 = line[4].s5; + e2 = line[4].s4; + FILL_ARR(s1, s2, 4, e1, e2); +#elif BORDER_REFLECT_101 + s = line[0].s4; + e = line[0].s3; + FILL_ARR(s, e, 0, s, e); + + s = line[1].s4; + e = line[1].s3; + FILL_ARR(s, e, 1, s, e); + + s = line[2].s4; + e = line[2].s3; + FILL_ARR(s, e, 2, s, e); + + s = line[3].s4; + e = line[3].s3; + FILL_ARR(s, e, 3, s, e); + + s = line[4].s4; + e = line[4].s3; + FILL_ARR(s, e, 4, s, e); +#endif + + float4 sum; + sum = OP(0, 0) + OP(0, 1) + OP(0, 2) + OP(0, 3) + OP(0, 4) + + OP(1, 0) + OP(1, 1) + OP(1, 2) + OP(1, 3) + OP(1, 4) + + OP(2, 0) + OP(2, 1) + OP(2, 2) + OP(2, 3) + OP(2, 4) + + OP(3, 0) + OP(3, 1) + OP(3, 2) + OP(3, 3) + OP(3, 4) + + OP(4, 0) + OP(4, 1) + OP(4, 2) + OP(4, 3) + OP(4, 4); + + int dst_index = (x / 4) + y * (dst_step / 4); + dst[dst_index] = as_uint(convert_uchar4_sat_rte(sum)); +} diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 2d3c984..552ced0 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -2135,15 +2135,16 @@ namespace cv { #ifdef HAVE_OPENCL -static bool ocl_GaussianBlur3x3_8UC1(InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, int borderType) +static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize, int ddepth, + InputArray _kernelX, InputArray _kernelY, int borderType) { const ocl::Device & dev = ocl::Device::getDefault(); int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); if ( !(dev.isIntel() && (type == CV_8UC1) && (_src.offset() == 0) && (_src.step() % 4 == 0) && - (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)) ) + ((ksize.width == 5 && (_src.cols() % 4 == 0)) || + (ksize.width == 3 && (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)))) ) return false; Mat kernelX = _kernelX.getMat().reshape(1, 1); @@ -2160,8 +2161,16 @@ static bool ocl_GaussianBlur3x3_8UC1(InputArray _src, OutputArray _dst, int ddep size_t globalsize[2] = { 0, 0 }; size_t localsize[2] = { 0, 0 }; - globalsize[0] = size.width / 16; - globalsize[1] = size.height / 2; + if (ksize.width == 3) + { + globalsize[0] = size.width / 16; + globalsize[1] = size.height / 2; + } + else if (ksize.width == 5) + { + globalsize[0] = size.width / 4; + globalsize[1] = size.height / 1; + } const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; char build_opts[1024]; @@ -2169,7 +2178,13 @@ static bool ocl_GaussianBlur3x3_8UC1(InputArray _src, OutputArray _dst, int ddep ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(), ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str()); - ocl::Kernel kernel("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts); + ocl::Kernel kernel; + + if (ksize.width == 3) + kernel.create("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts); + else if (ksize.width == 5) + kernel.create("gaussianBlur5x5_8UC1_cols4", cv::ocl::imgproc::gaussianBlur5x5_oclsrc, build_opts); + if (kernel.empty()) return false; @@ -2436,9 +2451,10 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && - ksize.width == 3 && ksize.height == 3 && + ((ksize.width == 3 && ksize.height == 3) || + (ksize.width == 5 && ksize.height == 5)) && (size_t)_src.rows() > ky.total() && (size_t)_src.cols() > kx.total(), - ocl_GaussianBlur3x3_8UC1(_src, _dst, CV_MAT_DEPTH(type), kx, ky, borderType)); + ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)); sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType ); } diff --git a/modules/imgproc/test/ocl/test_filters.cpp b/modules/imgproc/test/ocl/test_filters.cpp index 434b776..481edf2 100644 --- a/modules/imgproc/test/ocl/test_filters.cpp +++ b/modules/imgproc/test/ocl/test_filters.cpp @@ -342,7 +342,7 @@ OCL_TEST_P(GaussianBlurTest, Mat) } } -PARAM_TEST_CASE(GaussianBlur3x3_cols16_rows2_Base, MatType, +PARAM_TEST_CASE(GaussianBlur_multicols_Base, MatType, int, // kernel size Size, // dx, dy BorderType, // border type @@ -372,11 +372,18 @@ PARAM_TEST_CASE(GaussianBlur3x3_cols16_rows2_Base, MatType, void random_roi() { - size = Size(3, 3); + size = Size(ksize, ksize); Size roiSize = randomSize(size.width, MAX_VALUE, size.height, MAX_VALUE); - roiSize.width = std::max(size.width + 13, roiSize.width & (~0xf)); - roiSize.height = std::max(size.height + 1, roiSize.height & (~0x1)); + if (ksize == 3) + { + roiSize.width = std::max((size.width + 15) & 0x10, roiSize.width & (~0xf)); + roiSize.height = std::max(size.height + 1, roiSize.height & (~0x1)); + } + else if (ksize == 5) + { + roiSize.width = std::max((size.width + 3) & 0x4, roiSize.width & (~0x3)); + } Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256); @@ -402,9 +409,9 @@ PARAM_TEST_CASE(GaussianBlur3x3_cols16_rows2_Base, MatType, } }; -typedef GaussianBlur3x3_cols16_rows2_Base GaussianBlur3x3_cols16_rows2; +typedef GaussianBlur_multicols_Base GaussianBlur_multicols; -OCL_TEST_P(GaussianBlur3x3_cols16_rows2, Mat) +OCL_TEST_P(GaussianBlur_multicols, Mat) { Size kernelSize(ksize, ksize); @@ -710,9 +717,9 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine( Bool(), Values(1))); // not used -OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur3x3_cols16_rows2, Combine( +OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur_multicols, Combine( Values((MatType)CV_8UC1), - Values(3), // kernel size + Values(3, 5), // kernel size Values(Size(0, 0)), // not used FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, Values(0.0), // not used -- 2.7.4