From bd5e4c6c49c4f4b599aa60a295d25b8e0e2c0930 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 14 May 2014 18:56:25 +0400 Subject: [PATCH] other kernels now use row scheme --- modules/core/src/arithm.cpp | 11 +++---- modules/core/src/convert.cpp | 42 ++++++++++++++------------ modules/core/src/dxt.cpp | 7 +++-- modules/core/src/matrix.cpp | 8 +++-- modules/core/src/opencl/inrange.cl | 50 ++++++++++++++++++------------- modules/core/src/opencl/mixchannels.cl | 22 +++++++++----- modules/core/src/opencl/mulspectrums.cl | 28 ++++++++++-------- modules/core/src/opencl/set_identity.cl | 11 +++---- modules/core/src/opencl/split_merge.cl | 52 +++++++++++++++++++++------------ 9 files changed, 140 insertions(+), 91 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 87e2514..98d9567 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -3094,11 +3094,12 @@ static InRangeFunc getInRangeFunc(int depth) static bool ocl_inRange( InputArray _src, InputArray _lowerb, InputArray _upperb, OutputArray _dst ) { + const ocl::Device & d = ocl::Device::getDefault(); int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind(); Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size(); int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type(); int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype); - int cn = CV_MAT_CN(stype); + int cn = CV_MAT_CN(stype), rowsPerWI = d.isIntel() ? 4 : 1; bool lbScalar = false, ubScalar = false; if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) || @@ -3122,7 +3123,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb, if (lbScalar != ubScalar) return false; - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, + bool doubleSupport = d.doubleFPConfig() > 0, haveScalar = lbScalar && ubScalar; if ( (!doubleSupport && sdepth == CV_64F) || @@ -3187,13 +3188,13 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb, uscalar.copyTo(uscalaru); ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru), - ocl::KernelArg::PtrReadOnly(uscalaru)); + ocl::KernelArg::PtrReadOnly(uscalaru), rowsPerWI); } else ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru), - ocl::KernelArg::ReadOnlyNoSize(uscalaru)); + ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI); - size_t globalsize[2] = { ssize.width, ssize.height }; + size_t globalsize[2] = { ssize.width, (ssize.height + rowsPerWI - 1) / rowsPerWI }; return ker.run(2, globalsize, NULL, false); } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 7fc3176..256b598 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -270,21 +270,22 @@ namespace cv { static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv ) { - int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), + rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; - String dstargs, dstdecl, processelem; + String dstargs, processelem, indexdecl; for (int i = 0; i < cn; ++i) { dstargs += format("DECLARE_DST_PARAM(%d)", i); - dstdecl += format("DECLARE_DATA(%d)", i); + indexdecl += format("DECLARE_INDEX(%d)", i); processelem += format("PROCESS_ELEM(%d)", i); } ocl::Kernel k("split", ocl::core::split_merge_oclsrc, - format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s " - "-D DECLARE_DATA_N=%s -D PROCESS_ELEMS_N=%s", + format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s" + " -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s", ocl::memopTypeToStr(depth), cn, dstargs.c_str(), - dstdecl.c_str(), processelem.c_str())); + processelem.c_str(), indexdecl.c_str())); if (k.empty()) return false; @@ -299,8 +300,9 @@ static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv ) int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat())); for (int i = 0; i < cn; ++i) argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i])); + k.set(argidx, rowsPerWI); - size_t globalsize[2] = { size.width, size.height }; + size_t globalsize[2] = { size.width, (size.height + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } @@ -419,7 +421,8 @@ static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst ) _mv.getUMatVector(src); CV_Assert(!src.empty()); - int type = src[0].type(), depth = CV_MAT_DEPTH(type); + int type = src[0].type(), depth = CV_MAT_DEPTH(type), + rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; Size size = src[0].size(); for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i) @@ -440,20 +443,20 @@ static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst ) } int dcn = (int)ksrc.size(); - String srcargs, srcdecl, processelem, cndecl; + String srcargs, processelem, cndecl, indexdecl; for (int i = 0; i < dcn; ++i) { srcargs += format("DECLARE_SRC_PARAM(%d)", i); - srcdecl += format("DECLARE_DATA(%d)", i); processelem += format("PROCESS_ELEM(%d)", i); + indexdecl += format("DECLARE_INDEX(%d)", i); cndecl += format(" -D scn%d=%d", i, ksrc[i].channels()); } ocl::Kernel k("merge", ocl::core::split_merge_oclsrc, format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s" - " -D DECLARE_DATA_N=%s -D PROCESS_ELEMS_N=%s%s", + " -D DECLARE_INDEX_N=%s -D PROCESS_ELEMS_N=%s%s", dcn, ocl::memopTypeToStr(depth), srcargs.c_str(), - srcdecl.c_str(), processelem.c_str(), cndecl.c_str())); + indexdecl.c_str(), processelem.c_str(), cndecl.c_str())); if (k.empty()) return false; @@ -463,9 +466,10 @@ static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst ) int argidx = 0; for (int i = 0; i < dcn; ++i) argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i])); - k.set(argidx, ocl::KernelArg::WriteOnly(dst)); + argidx = k.set(argidx, ocl::KernelArg::WriteOnly(dst)); + k.set(argidx, rowsPerWI); - size_t globalsize[2] = { dst.cols, dst.rows }; + size_t globalsize[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } @@ -690,7 +694,7 @@ static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _d for (size_t i = 0, dsize = dst.size(); i < dsize; ++i) CV_Assert(dst[i].size() == size && dst[i].depth() == depth); - String declsrc, decldst, declproc, declcn; + String declsrc, decldst, declproc, declcn, indexdecl; std::vector srcargs(npairs), dstargs(npairs); for (size_t i = 0; i < npairs; ++i) @@ -711,14 +715,16 @@ static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _d declsrc += format("DECLARE_INPUT_MAT(%d)", i); decldst += format("DECLARE_OUTPUT_MAT(%d)", i); + indexdecl += format("DECLARE_INDEX(%d)", i); declproc += format("PROCESS_ELEM(%d)", i); declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels()); } ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc, - format("-D T=%s -D DECLARE_INPUT_MATS=%s -D DECLARE_OUTPUT_MATS=%s" - " -D PROCESS_ELEMS=%s%s", ocl::memopTypeToStr(depth), - declsrc.c_str(), decldst.c_str(), declproc.c_str(), declcn.c_str())); + format("-D T=%s -D DECLARE_INPUT_MAT_N=%s -D DECLARE_OUTPUT_MAT_N=%s" + " -D PROCESS_ELEM_N=%s -D DECLARE_INDEX_N=%s%s", + ocl::memopTypeToStr(depth), declsrc.c_str(), decldst.c_str(), + declproc.c_str(), indexdecl.c_str(), declcn.c_str())); if (k.empty()) return false; diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index d4ece37..2a08899 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2489,7 +2489,8 @@ namespace cv { static bool ocl_mulSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst, int flags, bool conjB ) { - int atype = _srcA.type(), btype = _srcB.type(); + int atype = _srcA.type(), btype = _srcB.type(), + rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; Size asize = _srcA.size(), bsize = _srcB.size(); CV_Assert(asize == bsize); @@ -2509,9 +2510,9 @@ static bool ocl_mulSpectrums( InputArray _srcA, InputArray _srcB, return false; k.args(ocl::KernelArg::ReadOnlyNoSize(A), ocl::KernelArg::ReadOnlyNoSize(B), - ocl::KernelArg::WriteOnly(dst)); + ocl::KernelArg::WriteOnly(dst), rowsPerWI); - size_t globalsize[2] = { asize.width, asize.height }; + size_t globalsize[2] = { asize.width, (asize.height + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 0b99872..e285315 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -2742,7 +2742,8 @@ namespace cv { static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s ) { int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), - sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn); + sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn), + rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc, format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type), @@ -2751,9 +2752,10 @@ static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s ) return false; UMat m = _m.getUMat(); - k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s))); + k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s)), + rowsPerWI); - size_t globalsize[2] = { m.cols, m.rows }; + size_t globalsize[2] = { m.cols, (m.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/core/src/opencl/inrange.cl b/modules/core/src/opencl/inrange.cl index b113859..0de561f 100644 --- a/modules/core/src/opencl/inrange.cl +++ b/modules/core/src/opencl/inrange.cl @@ -52,37 +52,47 @@ __kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, #ifdef HAVE_SCALAR - __global const T * src2, __global const T * src3 + __global const T * src2, __global const T * src3, #else __global const uchar * src2ptr, int src2_step, int src2_offset, - __global const uchar * src3ptr, int src3_step, int src3_offset + __global const uchar * src3ptr, int src3_step, int src3_offset, #endif - ) + int rowsPerWI) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1) * rowsPerWI; - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset)); - int dst_index = mad24(y, dst_step, x + dst_offset); - __global const T * src1 = (__global const T *)(src1ptr + src1_index); - __global uchar * dst = dstptr + dst_index; + int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset)); + int dst_index = mad24(y0, dst_step, x + dst_offset); +#ifndef HAVE_SCALAR + int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset)); + int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset)); +#endif + for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src1_index += src1_step, dst_index += dst_step) + { + __global const T * src1 = (__global const T *)(src1ptr + src1_index); + __global uchar * dst = dstptr + dst_index; #ifndef HAVE_SCALAR - int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset)); - int src3_index = mad24(y, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset)); - __global const T * src2 = (__global const T *)(src2ptr + src2_index); - __global const T * src3 = (__global const T *)(src3ptr + src3_index); + __global const T * src2 = (__global const T *)(src2ptr + src2_index); + __global const T * src3 = (__global const T *)(src3ptr + src3_index); #endif - dst[0] = 255; + dst[0] = 255; + + for (int c = 0; c < cn; ++c) + if (src2[c] > src1[c] || src3[c] < src1[c]) + { + dst[0] = 0; + break; + } - for (int c = 0; c < cn; ++c) - if (src2[c] > src1[c] || src3[c] < src1[c]) - { - dst[0] = 0; - break; - } +#ifndef HAVE_SCALAR + src2_index += src2_step; + src3_index += src3_step; +#endif + } } } diff --git a/modules/core/src/opencl/mixchannels.cl b/modules/core/src/opencl/mixchannels.cl index bede20c..095b44a 100644 --- a/modules/core/src/opencl/mixchannels.cl +++ b/modules/core/src/opencl/mixchannels.cl @@ -45,20 +45,28 @@ __global const uchar * src##i##ptr, int src##i##_step, int src##i##_offset, #define DECLARE_OUTPUT_MAT(i) \ __global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset, +#define DECLARE_INDEX(i) \ + int src##i##_index = mad24(src##i##_step, y0, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \ + int dst##i##_index = mad24(dst##i##_step, y0, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset)); #define PROCESS_ELEM(i) \ - int src##i##_index = mad24(src##i##_step, y, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \ __global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \ - int dst##i##_index = mad24(dst##i##_step, y, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset)); \ __global T * dst##i = (__global T *)(dst##i##ptr + dst##i##_index); \ - dst##i[0] = src##i[0]; + dst##i[0] = src##i[0]; \ + src##i##_index += src##i##_step; \ + dst##i##_index += dst##i##_step; -__kernel void mixChannels(DECLARE_INPUT_MATS DECLARE_OUTPUT_MATS int rows, int cols) +__kernel void mixChannels(DECLARE_INPUT_MAT_N DECLARE_OUTPUT_MAT_N int rows, int cols, int rowsPerWI) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1) * rowsPerWI; - if (x < cols && y < rows) + if (x < cols) { - PROCESS_ELEMS + DECLARE_INDEX_N + + for (int y = y0, y1 = min(y0 + rowsPerWI, rows); y < y1; ++y) + { + PROCESS_ELEM_N + } } } diff --git a/modules/core/src/opencl/mulspectrums.cl b/modules/core/src/opencl/mulspectrums.cl index 817331e..d921bd9 100644 --- a/modules/core/src/opencl/mulspectrums.cl +++ b/modules/core/src/opencl/mulspectrums.cl @@ -56,26 +56,30 @@ inline float2 conjf(float2 a) __kernel void mulAndScaleSpectrums(__global const uchar * src1ptr, int src1_step, int src1_offset, __global const uchar * src2ptr, int src2_step, int src2_offset, __global uchar * dstptr, int dst_step, int dst_offset, - int dst_rows, int dst_cols) + int dst_rows, int dst_cols, int rowsPerWI) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1) * rowsPerWI; - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(float2), src1_offset)); - int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(float2), src2_offset)); - int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(float2), dst_offset)); + int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(float2), src1_offset)); + int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(float2), src2_offset)); + int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(float2), dst_offset)); - float2 src0 = *(__global const float2 *)(src1ptr + src1_index); - float2 src1 = *(__global const float2 *)(src2ptr + src2_index); - __global float2 * dst = (__global float2 *)(dstptr + dst_index); + for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, + src1_index += src1_step, src2_index += src2_step, dst_index += dst_step) + { + float2 src0 = *(__global const float2 *)(src1ptr + src1_index); + float2 src1 = *(__global const float2 *)(src2ptr + src2_index); + __global float2 * dst = (__global float2 *)(dstptr + dst_index); #ifdef CONJ - float2 v = cmulf(src0, conjf(src1)); + float2 v = cmulf(src0, conjf(src1)); #else - float2 v = cmulf(src0, src1); + float2 v = cmulf(src0, src1); #endif - dst[0] = v; + dst[0] = v; + } } } diff --git a/modules/core/src/opencl/set_identity.cl b/modules/core/src/opencl/set_identity.cl index 0e8f142..6b277fe 100644 --- a/modules/core/src/opencl/set_identity.cl +++ b/modules/core/src/opencl/set_identity.cl @@ -56,15 +56,16 @@ #endif __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols, - ST scalar_) + ST scalar_, int rowsPerWI) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1) * rowsPerWI; - if (x < cols && y < rows) + if (x < cols) { - int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset)); + int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset)); - storepix(x == y ? scalar : (T)(0), srcptr + src_index); + for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step) + storepix(x == y ? scalar : (T)(0), srcptr + src_index); } } diff --git a/modules/core/src/opencl/split_merge.cl b/modules/core/src/opencl/split_merge.cl index 7d27705..907580f 100644 --- a/modules/core/src/opencl/split_merge.cl +++ b/modules/core/src/opencl/split_merge.cl @@ -44,42 +44,58 @@ #ifdef OP_MERGE #define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset, -#define DECLARE_DATA(index) __global const T * src##index = \ - (__global T *)(src##index##ptr + mad24(src##index##_step, y, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset))); -#define PROCESS_ELEM(index) dst[index] = src##index[0]; +#define DECLARE_INDEX(index) int src##index##_index = mad24(src##index##_step, y0, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset)); +#define PROCESS_ELEM(index) \ + __global const T * src##index = (__global const T *)(src##index##ptr + src##index##_index); \ + dst[index] = src##index[0]; \ + src##index##_index += src##index##_step; __kernel void merge(DECLARE_SRC_PARAMS_N __global uchar * dstptr, int dst_step, int dst_offset, - int rows, int cols) + int rows, int cols, int rowsPerWI) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1) * rowsPerWI; - if (x < cols && y < rows) + if (x < cols) { - DECLARE_DATA_N - __global T * dst = (__global T *)(dstptr + mad24(dst_step, y, mad24(x, (int)sizeof(T) * cn, dst_offset))); - PROCESS_ELEMS_N + DECLARE_INDEX_N + int dst_index = mad24(dst_step, y0, mad24(x, (int)sizeof(T) * cn, dst_offset)); + + for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, dst_index += dst_step) + { + __global T * dst = (__global T *)(dstptr + dst_index); + + PROCESS_ELEMS_N + } } } #elif defined OP_SPLIT #define DECLARE_DST_PARAM(index) , __global uchar * dst##index##ptr, int dst##index##_step, int dst##index##_offset -#define DECLARE_DATA(index) __global T * dst##index = \ - (__global T *)(dst##index##ptr + mad24(y, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset))); -#define PROCESS_ELEM(index) dst##index[0] = src[index]; +#define DECLARE_INDEX(index) int dst##index##_index = mad24(y0, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset)); +#define PROCESS_ELEM(index) \ + __global T * dst##index = (__global T *)(dst##index##ptr + dst##index##_index); \ + dst##index[0] = src[index]; \ + dst##index##_index += dst##index##_step; -__kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS) +__kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS, int rowsPerWI) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1) * rowsPerWI; - if (x < cols && y < rows) + if (x < cols) { - DECLARE_DATA_N - __global const T * src = (__global const T *)(srcptr + mad24(y, src_step, mad24(x, cn * (int)sizeof(T), src_offset))); - PROCESS_ELEMS_N + DECLARE_INDEX_N + int src_index = mad24(y0, src_step, mad24(x, cn * (int)sizeof(T), src_offset)); + + for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step) + { + __global const T * src = (__global const T *)(srcptr + src_index); + + PROCESS_ELEMS_N + } } } -- 2.7.4