From c83455d8a42c03d4ed213238ebd4d6758ffbcb92 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Fri, 23 May 2014 12:45:24 +0300
Subject: [PATCH] optimized cv::repeat

ocl_repeat() used to issue one UMat::copyTo() per tile. It now launches a
single "repeat" kernel (modules/core/src/opencl/repeat.cl) that loads each
source element once and stores it to all ny x nx tile positions; nx, ny and
rowsPerWI are passed to the kernel as build options. nx == ny == 1 is
handled as a plain copy.

The global work size is computed in size_t so that the braced initializer
does not narrow int to size_t, and the kernel derives every byte offset
from TSIZE so that offsets agree with the loadpix/storepix variant selected
by cn.

The ocl.cpp hunk removes the `csize != ssize` condition and reuses the
already computed csize for the column count.

---
 modules/core/src/copy.cpp         | 28 ++++++++++++++++-------
 modules/core/src/ocl.cpp          |  4 ++--
 modules/core/src/opencl/repeat.cl | 47 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 10 deletions(-)
 create mode 100644 modules/core/src/opencl/repeat.cl

diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index be93577..4fbc593 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -758,16 +758,28 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
 
 static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
 {
+    if (ny == 1 && nx == 1) // a single tile is just a copy
+    {
+        _src.copyTo(_dst);
+        return true;
+    }
+
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+        rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1,
+        kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4); // capped at 4
+    // nx, ny and rowsPerWI become compile-time constants of the kernel
+    ocl::Kernel k("repeat", ocl::core::repeat_oclsrc,
+                  format("-D T=%s -D nx=%d -D ny=%d -D rowsPerWI=%d -D cn=%d",
+                         ocl::memopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
+                         nx, ny, rowsPerWI, kercn));
+    if (k.empty())
+        return false;
+
     UMat src = _src.getUMat(), dst = _dst.getUMat();
+    k.args(ocl::KernelArg::ReadOnly(src, cn, kercn), ocl::KernelArg::WriteOnlyNoSize(dst));
 
-    for (int y = 0; y < ny; ++y)
-        for (int x = 0; x < nx; ++x)
-        {
-            Rect roi(x * src.cols, y * src.rows, src.cols, src.rows);
-            UMat hdr(dst, roi);
-            src.copyTo(hdr);
-        }
-    return true;
+    size_t globalsize[] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; // widen first: int -> size_t inside braces is narrowing
+    return k.run(2, globalsize, NULL, false);
 }
 
 #endif
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 9d6a1b5..96c17c8 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -4406,8 +4406,8 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
         CV_Assert(src.isMat() || src.isUMat()); \
         int ctype = src.type(), ccn = CV_MAT_CN(ctype); \
         Size csize = src.size(); \
-        cols.push_back(ccn * src.size().width); \
-        if (ctype != type || csize != ssize) \
+        cols.push_back(ccn * csize.width); \
+        if (ctype != type) \
             return 1; \
         offsets.push_back(src.offset()); \
         steps.push_back(src.step()); \
diff --git a/modules/core/src/opencl/repeat.cl b/modules/core/src/opencl/repeat.cl
new file mode 100644
index 0000000..21be121
--- /dev/null
+++ b/modules/core/src/opencl/repeat.cl
@@ -0,0 +1,47 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Itseez, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#if cn != 3
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr) *(__global T *)(addr) = val
+#define TSIZE (int)sizeof(T)
+#else // NOTE(review): this branch needs -D T1=<element type>, which ocl_repeat() does not pass
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE ((int)sizeof(T1)*3)
+#endif
+// Each work-item copies one column element of up to rowsPerWI source rows into all nx * ny tiles of dst.
+__kernel void repeat(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
+                     __global uchar * dstptr, int dst_step, int dst_offset)
+{
+    int x = get_global_id(0); // column index, in units of one loadpix element
+    int y0 = get_global_id(1) * rowsPerWI; // first source row handled by this work-item
+
+    if (x < src_cols)
+    {
+        int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
+        int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
+
+        for (int y = y0, y1 = min(src_rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step, dst_index0 += dst_step)
+        {
+            T srcelem = loadpix(srcptr + src_index); // read once, stored nx * ny times
+
+            #pragma unroll
+            for (int ey = 0; ey < ny; ++ey)
+            {
+                int dst_index = mad24(ey * src_rows, dst_step, dst_index0); // same row, ey tiles further down
+
+                #pragma unroll
+                for (int ex = 0; ex < nx; ++ex)
+                {
+                    storepix(srcelem, dstptr + dst_index);
+                    dst_index = mad24(src_cols, TSIZE, dst_index); // advance one tile width to the right
+                }
+            }
+        }
+    }
+}
-- 
2.7.4