From: Elena Gvozdeva Date: Mon, 12 May 2014 12:30:47 +0000 (+0400) Subject: T-API: optimized ocl_flip X-Git-Tag: submit/tizen_ivi/20141117.190038~2^2~372^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c7dc8848551568013a8380180354a8e280cfa3fa;p=profile%2Fivi%2Fopencv.git T-API: optimized ocl_flip --- diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index b007f3c..7b9c038 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -610,6 +610,7 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, #ifdef HAVE_OPENCL +#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain)) enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS }; static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) @@ -628,9 +629,12 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) else kernelName = "arithm_flip_rows_cols", flipType = FLIP_BOTH; + ocl::Device dev = ocl::Device::getDefault(); + int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1; + ocl::Kernel k(kernelName, ocl::core::flip_oclsrc, - format( "-D T=%s -D T1=%s -D cn=%d", ocl::memopTypeToStr(type), - ocl::memopTypeToStr(depth), cn)); + format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d", ocl::memopTypeToStr(type), + ocl::memopTypeToStr(depth), cn, pxPerWIy)); if (k.empty()) return false; @@ -645,10 +649,13 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), rows, cols); - size_t maxWorkGroupSize = ocl::Device::getDefault().maxWorkGroupSize(); + size_t maxWorkGroupSize = dev.maxWorkGroupSize(); CV_Assert(maxWorkGroupSize % 4 == 0); + size_t globalsize[2] = { cols, rows }, localsize[2] = { maxWorkGroupSize / 4, 4 }; - return k.run(2, globalsize, flipType == FLIP_COLS ? localsize : NULL, false); + globalsize[1] = DIVUP(globalsize[1], pxPerWIy); + + return k.run(2, globalsize, (flipType == FLIP_COLS) && (!dev.isIntel()) ? localsize : NULL, false); } #endif diff --git a/modules/core/src/opencl/flip.cl b/modules/core/src/opencl/flip.cl index bacfe7a..c81dd43 100644 --- a/modules/core/src/opencl/flip.cl +++ b/modules/core/src/opencl/flip.cl @@ -54,15 +54,19 @@ __kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int int rows, int cols, int thread_rows, int thread_cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1)*PIX_PER_WI_Y; - if (x < cols && y < thread_rows) + if (x < cols) { - T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); - T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset))); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y && y < thread_rows; ++cy, ++y) + { + T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); + T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset))); - storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); - storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset))); + storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); + storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset))); + } } } @@ -71,16 +75,20 @@ __kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step, int rows, int cols, int thread_rows, int thread_cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1)*PIX_PER_WI_Y; - if (x < cols && y < thread_rows) + if (x < cols) { int x1 = cols - x - 1; - T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); - T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset))); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y && y < thread_rows; ++cy, ++y) + { + T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); + T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset))); - storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset))); - storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); + storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset))); + storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); + } } } @@ -89,15 +97,19 @@ __kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int int rows, int cols, int thread_rows, int thread_cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1)*PIX_PER_WI_Y; - if (x < thread_cols && y < rows) + if (x < thread_cols) { int x1 = cols - x - 1; - T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); - T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset))); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y && y < rows; ++cy, ++y) + { + T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); + T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset))); - storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset))); - storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); + storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset))); + storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); + } } }