From 7f818e9bc3f24f7c9452a3ba4fb0791709fb9a66 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 30 May 2014 18:01:49 +0400 Subject: [PATCH] optimized UMat::copyTo with mask --- modules/core/src/opencl/copyset.cl | 27 ++++++++++++++++++++------- modules/core/src/umatrix.cpp | 17 +++++++++++------ 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/modules/core/src/opencl/copyset.cl b/modules/core/src/opencl/copyset.cl index 42796ea..5d346d4 100644 --- a/modules/core/src/opencl/copyset.cl +++ b/modules/core/src/opencl/copyset.cl @@ -44,14 +44,14 @@ #ifdef COPY_TO_MASK #define DEFINE_DATA \ - int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T) * scn, src_offset)); \ - int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(T) * scn, dst_offset)); \ + int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T1) * scn, src_offset)); \ + int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(T1) * scn, dst_offset)); \ \ - __global const T * src = (__global const T *)(srcptr + src_index); \ - __global T * dst = (__global T *)(dstptr + dst_index) + __global const T1 * src = (__global const T1 *)(srcptr + src_index); \ + __global T1 * dst = (__global T1 *)(dstptr + dst_index) __kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_offset, - __global const uchar * maskptr, int mask_step, int mask_offset, + __global const uchar * mask, int mask_step, int mask_offset, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols) { @@ -60,8 +60,7 @@ __kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_of if (x < dst_cols && y < dst_rows) { - int mask_index = mad24(y, mask_step, mad24(x, mcn, mask_offset)); - __global const uchar * mask = (__global const uchar *)(maskptr + mask_index); + mask += mad24(y, mask_step, mad24(x, mcn, mask_offset)); #if mcn == 1 if (mask[0]) @@ -72,6 +71,16 @@ __kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_of for (int c = 0; c < scn; ++c) dst[c] = src[c]; } +#ifdef HAVE_DST_UNINIT + else + { + DEFINE_DATA; + + #pragma unroll + for (int c = 0; c < scn; ++c) + dst[c] = (T1)(0); + } +#endif #elif scn == mcn DEFINE_DATA; @@ -79,6 +88,10 @@ __kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_of for (int c = 0; c < scn; ++c) if (mask[c]) dst[c] = src[c]; +#ifdef HAVE_DST_UNINIT + else + dst[c] = (T1)(0); +#endif #else #error "(mcn == 1 || mcn == scn) should be true" #endif diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 0060492..0f14108 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -678,16 +678,21 @@ void UMat::copyTo(OutputArray _dst, InputArray _mask) const UMat dst = _dst.getUMat(); + bool haveDstUninit = false; if( prevu != dst.u ) // do not leave dst uninitialized - dst = Scalar(0); + haveDstUninit = true; - ocl::Kernel k("copyToMask", ocl::core::copyset_oclsrc, - format("-D COPY_TO_MASK -D T=%s -D scn=%d -D mcn=%d", - ocl::memopTypeToStr(depth()), cn, mcn)); + String opts = format("-D COPY_TO_MASK -D T1=%s -D scn=%d -D mcn=%d%s", + ocl::memopTypeToStr(depth()), cn, mcn, + haveDstUninit ? " -D HAVE_DST_UNINIT" : ""); + + ocl::Kernel k("copyToMask", ocl::core::copyset_oclsrc, opts); if (!k.empty()) { - k.args(ocl::KernelArg::ReadOnlyNoSize(*this), ocl::KernelArg::ReadOnlyNoSize(_mask.getUMat()), - ocl::KernelArg::WriteOnly(dst)); + k.args(ocl::KernelArg::ReadOnlyNoSize(*this), + ocl::KernelArg::ReadOnlyNoSize(_mask.getUMat()), + haveDstUninit ? ocl::KernelArg::WriteOnly(dst) : + ocl::KernelArg::ReadWrite(dst)); size_t globalsize[2] = { cols, rows }; if (k.run(2, globalsize, NULL, false)) -- 2.7.4