From c424d36041ad3be7f10a9a3959d8a82087164c57 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 12 Jun 2014 14:30:50 +0400 Subject: [PATCH] optimized cv::boxFilter --- modules/imgproc/src/opencl/boxFilterSmall.cl | 305 +++++++++++++++++++++++++++ modules/imgproc/src/smooth.cpp | 143 +++++++++---- 2 files changed, 411 insertions(+), 37 deletions(-) create mode 100755 modules/imgproc/src/opencl/boxFilterSmall.cl diff --git a/modules/imgproc/src/opencl/boxFilterSmall.cl b/modules/imgproc/src/opencl/boxFilterSmall.cl new file mode 100755 index 0000000..ff47d18 --- /dev/null +++ b/modules/imgproc/src/opencl/boxFilterSmall.cl @@ -0,0 +1,305 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2014, Itseez, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. + +#ifdef BORDER_REPLICATE +//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) +#endif + +#ifdef BORDER_REFLECT +//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) +#endif + +#ifdef BORDER_REFLECT_101 +//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) +#endif + +//blur function does not support BORDER_WRAP +#ifdef BORDER_WRAP +//BORDER_WRAP: cdefgh|abcdefgh|abcdefg +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) +#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) +#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) +#endif + +#ifdef BORDER_ISOLATED +#define ISOLATED_MIN(VAL) (VAL) +#else +#define ISOLATED_MIN(VAL) 0 +#endif + +#ifdef EXTRA_EXTRAPOLATION // border > src image size +#ifdef BORDER_CONSTANT +// None +#elif defined BORDER_REPLICATE +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \ + { \ + x = max(min(x, maxX - 1), minX); \ + y = max(min(y, maxY - 1), minY); \ + } +#elif defined BORDER_WRAP +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \ + { \ + if (x < minX) \ + x -= ((x - maxX + 1) / maxX) * maxX; \ + if (x >= maxX) \ + x %= maxX; \ + if (y < minY) \ + y -= ((y - maxY + 1) / maxY) * maxY; \ + if (y >= maxY) \ + y %= maxY; \ + } +#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) +#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \ + { \ + if (maxX - minX == 1) \ + x = minX; \ + else \ + do \ + { \ + if (x < minX) \ + x = minX - (x - minX) - 1 + delta; \ + else \ + x = maxX - 1 - (x - maxX) - delta; \ + } \ + while (x >= maxX || x < minX); \ + \ + if (maxY - minY == 1) \ + y = minY; \ + else \ + do \ + { \ + if (y < minY) \ + y = minY - (y - minY) - 1 + delta; \ + else \ + y = maxY - 1 - (y - maxY) - delta; \ + } \ + while (y >= maxY || y < minY); \ + } +#ifdef BORDER_REFLECT +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0) +#elif defined(BORDER_REFLECT_101) || defined(BORDER_REFLECT101) +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1) +#endif +#else +#error No extrapolation method +#endif +#else +#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \ + { \ + int _row = y - ISOLATED_MIN(minY), _col = x - ISOLATED_MIN(minX); \ + _row = ADDR_H(_row, 0, maxY - ISOLATED_MIN(minY)); \ + _row = ADDR_B(_row, maxY - ISOLATED_MIN(minY), _row); \ + y = _row + ISOLATED_MIN(minY); \ + \ + _col = ADDR_L(_col, 0, maxX - ISOLATED_MIN(minX)); \ + _col = ADDR_R(_col, maxX - ISOLATED_MIN(minX), _col); \ + x = _col + ISOLATED_MIN(minX); \ + } +#endif + +#ifdef DOUBLE_SUPPORT +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (cl_khr_fp64) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif +#endif + +#if cn != 3 +#define loadpix(addr) *(__global const srcT *)(addr) +#define storepix(val, addr) *(__global dstT *)(addr) = val +#define SRCSIZE (int)sizeof(srcT) +#define DSTSIZE (int)sizeof(dstT) +#else +#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) +#define SRCSIZE (int)sizeof(srcT1) * cn +#define DSTSIZE (int)sizeof(dstT1) * cn +#endif + +#define noconvert + +struct RectCoords +{ + int x1, y1, x2, y2; +}; + +#ifdef BORDER_ISOLATED +inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels) +{ + return coord.x < bounds.x1 || coord.y < bounds.y1 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2; +} +#else +inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels) +{ + return coord.x < 0 || coord.y < 0 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2; +} +#endif + +inline WT getBorderPixel(const struct RectCoords bounds, int2 coord, + __global const uchar * srcptr, int srcstep) +{ +#ifdef BORDER_CONSTANT + return (WT)(0); +#else + int selected_col = coord.x; + int selected_row = coord.y; + + EXTRAPOLATE(selected_col, selected_row, + bounds.x1, bounds.y1, + bounds.x2, bounds.y2); + + __global const uchar* ptr = srcptr + mad24(selected_row, srcstep, selected_col * SRCSIZE); + return convertToWT(loadpix(ptr)); +#endif +} + +inline WT readSrcPixelSingle(int2 pos, __global const uchar * srcptr, + int srcstep, const struct RectCoords srcCoords) +{ + if (!isBorder(srcCoords, pos, 1)) + { + __global const uchar * ptr = srcptr + mad24(pos.y, srcstep, pos.x * SRCSIZE); + return convertToWT(loadpix(ptr)); + } + else + return getBorderPixel(srcCoords, pos, srcptr, srcstep); +} + +#define __CAT(x, y) x##y +#define CAT(x, y) __CAT(x, y) + +#define vload1(OFFSET, PTR) (*(PTR + OFFSET)) +#define PX_LOAD_VEC_TYPE CAT(srcT1, PX_LOAD_VEC_SIZE) +#define PX_LOAD_FLOAT_VEC_TYPE CAT(WT1, PX_LOAD_VEC_SIZE) +#define PX_LOAD_FLOAT_VEC_CONV CAT(convert_, PX_LOAD_FLOAT_VEC_TYPE) +#define PX_LOAD CAT(vload, PX_LOAD_VEC_SIZE) +#define float1 float + +inline PX_LOAD_FLOAT_VEC_TYPE readSrcPixelGroup(int2 pos, __global const uchar * srcptr, + int srcstep, const struct RectCoords srcCoords) +{ + __global const srcT1 * ptr = (__global const srcT1 *) + (srcptr + mad24(pos.y, srcstep, pos.x * SRCSIZE)); + return PX_LOAD_FLOAT_VEC_CONV(PX_LOAD(0, ptr)); +} + +// Macros to ensure unrolled loops +#define LOOP1(VAR, STMT) (STMT); (VAR)++; +#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++; +#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++; +#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++; +#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++; +#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++; +#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++; +#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++; +#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++; +#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++; +#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++; +#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++; +#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++; + +#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT)) + +__kernel void boxFilterSmall(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY, + __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols +#ifdef NORMALIZE + , float alpha +#endif + ) +{ + // for non-isolated border: offsetX, offsetY, wholeX, wholeY + const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY }; + + const int startX = get_global_id(0) * PX_PER_WI_X; + const int startY = get_global_id(1) * PX_PER_WI_Y; + + if (startX >= cols || startY >= rows) + return; + + WT privateData[PX_PER_WI_Y + KERNEL_SIZE_Y - 1][PRIV_DATA_WIDTH]; + + // Load all of the pixels needed for the calculation + int py = 0; + LOOP(PX_LOAD_Y_ITERATIONS, py, + { + int y = startY + py; + int px = 0; + LOOP(PX_LOAD_X_ITERATIONS, px, + { + int x = startX + (px * PX_LOAD_NUM_PX); + int2 srcPos = (int2)(srcCoords.x1 + x - ANCHOR_X, srcCoords.y1 + y - ANCHOR_Y); + + if (!isBorder(srcCoords, srcPos, PX_LOAD_NUM_PX)) + { + PX_LOAD_FLOAT_VEC_TYPE p = readSrcPixelGroup(srcPos, srcptr, src_step, srcCoords); +#ifdef SQR + *((PX_LOAD_FLOAT_VEC_TYPE *)&privateData[py][px * PX_LOAD_NUM_PX]) = p * p; +#else + *((PX_LOAD_FLOAT_VEC_TYPE *)&privateData[py][px * PX_LOAD_NUM_PX]) = p; +#endif + } + else + { + int lx = 0; + LOOP(PX_LOAD_NUM_PX, lx, + { + WT p = readSrcPixelSingle(srcPos, srcptr, src_step, srcCoords); +#ifdef SQR + *((WT*)&privateData[py][px * PX_LOAD_NUM_PX + lx]) = p * p; +#else + *((WT*)&privateData[py][px * PX_LOAD_NUM_PX + lx]) = p; +#endif + srcPos.x++; + }); + } + }); + }); + + // Use the stored pixels to compute the results + py = 0; + LOOP(PX_PER_WI_Y, py, + { + int y = startY + py; + int px = 0; + LOOP(PX_PER_WI_X, px, + { + int x = startX + px; + int sy = 0; + int kernelIndex = 0; + WT total_sum = (WT)(0); + + LOOP(KERNEL_SIZE_Y, sy, + { + int sx = 0; + LOOP(KERNEL_SIZE_X, sx, + { + total_sum += privateData[py + sy][px + sx]; + }); + }); + + __global dstT * dstPtr = (__global dstT *)(dstptr + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset))); +#ifdef NORMALIZE + total_sum *= (WT)(alpha); +#endif + storepix(convertToDstT(total_sum), dstPtr); + }); + }); +} diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 2b212b4..66ff429 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -629,12 +629,14 @@ struct ColumnSum : #ifdef HAVE_OPENCL #define DIVUP(total, grain) ((total + grain - 1) / (grain)) +#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n))) static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false ) { + const ocl::Device & dev = ocl::Device::getDefault(); int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + bool doubleSupport = dev.doubleFPConfig() > 0; if (ddepth < 0) ddepth = sdepth; @@ -653,11 +655,12 @@ static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, Size size = _src.size(), wholeSize; bool isolated = (borderType & BORDER_ISOLATED) != 0; borderType &= ~BORDER_ISOLATED; - int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)); + int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)), + wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn); const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; size_t globalsize[2] = { size.width, size.height }; - size_t localsize[2] = { 0, 1 }; + size_t localsize_general[2] = { 0, 1 }, * localsize = NULL; UMat src = _src.getUMat(); if (!isolated) @@ -674,46 +677,110 @@ static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, int tryWorkItems = (int)maxWorkItemSizes[0]; ocl::Kernel kernel; - for ( ; ; ) - { - int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height); - - while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2) - BLOCK_SIZE_X /= 2; - while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height) - BLOCK_SIZE_Y *= 2; - if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height) + if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) && + ((ksize.width < 5 && ksize.height < 5 && esz <= 4) || + (ksize.width == 5 && ksize.height == 5 && cn == 1))) + { + if (w < ksize.width || h < ksize.height) return false; - char cvt[2][50]; - String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s" - " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s" - " -D ST1=%s -D DT1=%s -D cn=%d", - BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]), - anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType], - isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "", - normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); - - localsize[0] = BLOCK_SIZE_X; - globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X; - globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y); - - kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts); - if (kernel.empty()) - return false; + // Figure out what vector size to use for loading the pixels. + int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4; + int pxLoadVecSize = cn * pxLoadNumPixels; - size_t kernelWorkGroupSize = kernel.workGroupSize(); - if (localsize[0] <= kernelWorkGroupSize) - break; - if (BLOCK_SIZE_X < (int)kernelWorkGroupSize) + // Figure out how many pixels per work item to compute in X and Y + // directions. Too many and we run out of registers. + int pxPerWorkItemX = 1, pxPerWorkItemY = 1; + if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) + { + pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8; + pxPerWorkItemY = size.height % 2 ? 1 : 2; + } + else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) + { + pxPerWorkItemX = size.width % 2 ? 1 : 2; + pxPerWorkItemY = size.height % 2 ? 1 : 2; + } + globalsize[0] = size.width / pxPerWorkItemX; + globalsize[1] = size.height / pxPerWorkItemY; + + // Need some padding in the private array for pixels + int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); + + // Make the global size a nice round number so the runtime can pick + // from reasonable choices for the workgroup size + const int wgRound = 256; + globalsize[0] = ROUNDUP(globalsize[0], wgRound); + + char build_options[1024], cvt[2][40]; + sprintf(build_options, "-D cn=%d " + "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " + "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " + "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " + "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " + "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " + "-D convertToWT=%s -D convertToDstT=%s%s%s", + cn, anchor.x, anchor.y, ksize.width, ksize.height, + pxLoadVecSize, pxLoadNumPixels, + pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], + isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", + privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, + ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), + ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), + normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : ""); + + + + if (!kernel.create("boxFilterSmall", cv::ocl::imgproc::boxFilterSmall_oclsrc, build_options)) return false; + } + else + { + localsize = localsize_general; + for ( ; ; ) + { + int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height); + + while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2) + BLOCK_SIZE_X /= 2; + while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height) + BLOCK_SIZE_Y *= 2; + + if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height) + return false; + + char cvt[2][50]; + String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s" + " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s" + " -D ST1=%s -D DT1=%s -D cn=%d", + BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)), + ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]), + anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType], + isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "", + normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); + + localsize[0] = BLOCK_SIZE_X; + globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X; + globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y); + + kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts); + if (kernel.empty()) + return false; + + size_t kernelWorkGroupSize = kernel.workGroupSize(); + if (localsize[0] <= kernelWorkGroupSize) + break; + if (BLOCK_SIZE_X < (int)kernelWorkGroupSize) + return false; - tryWorkItems = (int)kernelWorkGroupSize; + tryWorkItems = (int)kernelWorkGroupSize; + } } _dst.create(size, CV_MAKETYPE(ddepth, cn)); @@ -736,6 +803,8 @@ static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, return kernel.run(2, globalsize, localsize, false); } +#undef ROUNDUP + #endif } -- 2.7.4