From 15738bf7ef048de3cf9a216b93167a1ca988906f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 14 May 2014 15:42:30 +0400 Subject: [PATCH] multiple rows in KF kernel --- modules/core/src/arithm.cpp | 29 +++++++------- modules/core/src/convert.cpp | 12 +++--- modules/core/src/mathfuncs.cpp | 54 +++++++++++++++----------- modules/core/src/matmul.cpp | 12 +++--- modules/core/src/opencl/arithm.cl | 82 +++++++++++++++++++++++---------------- modules/core/src/stat.cpp | 27 +++++++------ 6 files changed, 126 insertions(+), 90 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 8e01de1..87e2514 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -1008,7 +1008,8 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, int srcdepth = CV_MAT_DEPTH(srctype); int cn = CV_MAT_CN(srctype); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + const ocl::Device d = ocl::Device::getDefault(); + bool doubleSupport = d.doubleFPConfig() > 0; if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) || (!doubleSupport && srcdepth == CV_64F && !bitwise)) return false; @@ -1016,8 +1017,9 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, char opts[1024]; int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); int scalarcn = kercn == 3 ? 4 : kercn; + int rowsPerWI = d.isIntel() ? 4 : 1; - sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d", + sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d", haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop], bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", @@ -1025,7 +1027,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)), bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) : ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)), - kercn); + kercn, rowsPerWI); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); if (k.empty()) @@ -1068,7 +1070,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, k.args(src1arg, src2arg, maskarg, dstarg); } - size_t globalsize[] = { src1.cols * cn / kercn, src1.rows }; + size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, 0, false); } @@ -1371,7 +1373,8 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, void* usrdata, int oclop, bool haveScalar ) { - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + const ocl::Device d = ocl::Device::getDefault(); + bool doubleSupport = d.doubleFPConfig() > 0; int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); bool haveMask = !_mask.empty(); @@ -1388,12 +1391,12 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, return false; int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); - int scalarcn = kercn == 3 ? 4 : kercn; + int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1; char cvtstr[4][32], opts[1024]; sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s " - "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d", + "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d", (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), @@ -1404,7 +1407,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), - doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn); + doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI); size_t usrdata_esz = CV_ELEM_SIZE(wdepth); const uchar* usrdata_p = (const uchar*)usrdata; @@ -1478,7 +1481,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, k.args(src1arg, src2arg, maskarg, dstarg); } - size_t globalsize[] = { src1.cols * cn / kercn, src1.rows }; + size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } @@ -2764,7 +2767,7 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2)) return false; - int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); + int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1; // Workaround for bug with "?:" operator in AMD OpenCL compiler if (depth1 >= CV_16U) kercn = 1; @@ -2775,14 +2778,14 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d" " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s" - " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s%s", + " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s -D rowsPerWI=%d%s", haveScalar ? "UNARY_OP" : "BINARY_OP", ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)), ocl::typeToStr(CV_8UC(kercn)), kercn, ocl::convertTypeStr(depth1, CV_8U, kercn, cvt), operationMap[op], ocl::typeToStr(depth1), ocl::typeToStr(depth1), ocl::typeToStr(CV_8U), - ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), + ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), rowsPerWI, doubleSupport ? " -D DOUBLE_SUPPORT" : ""); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); @@ -2839,7 +2842,7 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in ocl::KernelArg::WriteOnly(dst, cn, kercn)); } - size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows }; + size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index d88e422..7fc3176 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1357,9 +1357,10 @@ static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth) static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta ) { + const ocl::Device & d = ocl::Device::getDefault(); int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), - kercn = ocl::predictOptimalVectorWidth(_src, _dst); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + kercn = ocl::predictOptimalVectorWidth(_src, _dst), rowsPerWI = d.isIntel() ? 4 : 1; + bool doubleSupport = d.doubleFPConfig() > 0; if (!doubleSupport && depth == CV_64F) return false; @@ -1368,13 +1369,14 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha int wdepth = std::max(depth, CV_32F); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s" - " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s -D workT1=%s%s", + " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s" + " -D workT1=%s -D rowsPerWI=%d%s", ocl::typeToStr(CV_8UC(kercn)), ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth, ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]), - ocl::typeToStr(wdepth), + ocl::typeToStr(wdepth), rowsPerWI, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; @@ -1391,7 +1393,7 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha else if (wdepth == CV_64F) k.args(srcarg, dstarg, alpha, beta); - size_t globalsize[2] = { src.cols * cn / kercn, src.rows }; + size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index f045eec..1893214 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -65,13 +65,15 @@ static bool ocl_math_op(InputArray _src1, InputArray _src2, OutputArray _dst, in int kercn = oclop == OCL_OP_PHASE_DEGREES || oclop == OCL_OP_PHASE_RADIANS ? 1 : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); - bool double_support = ocl::Device::getDefault().doubleFPConfig() > 0; + const ocl::Device d = ocl::Device::getDefault(); + bool double_support = d.doubleFPConfig() > 0; if (!double_support && depth == CV_64F) return false; + int rowsPerWI = d.isIntel() ? 4 : 1; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D %s -D %s -D dstT=%s%s", _src2.empty() ? "UNARY_OP" : "BINARY_OP", - oclop2str[oclop], ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), + format("-D %s -D %s -D dstT=%s -D rowsPerWI=%d%s", _src2.empty() ? "UNARY_OP" : "BINARY_OP", + oclop2str[oclop], ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), rowsPerWI, double_support ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; @@ -89,7 +91,7 @@ static bool ocl_math_op(InputArray _src1, InputArray _src2, OutputArray _dst, in else k.args(src1arg, src2arg, dstarg); - size_t globalsize[] = { src1.cols * cn / kercn, src1.rows }; + size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, 0, false); } @@ -524,8 +526,10 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre static bool ocl_cartToPolar( InputArray _src1, InputArray _src2, OutputArray _dst1, OutputArray _dst2, bool angleInDegrees ) { - int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + const ocl::Device & d = ocl::Device::getDefault(); + int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), + rowsPerWI = d.isIntel() ? 4 : 1; + bool doubleSupport = d.doubleFPConfig() > 0; if ( !(_src1.dims() <= 2 && _src2.dims() <= 2 && (depth == CV_32F || depth == CV_64F) && type == _src2.type()) || @@ -533,9 +537,9 @@ static bool ocl_cartToPolar( InputArray _src1, InputArray _src2, return false; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D BINARY_OP -D dstT=%s -D depth=%d -D OP_CTP_%s%s", + format("-D BINARY_OP -D dstT=%s -D depth=%d -D rowsPerWI=%d -D OP_CTP_%s%s", ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), - depth, angleInDegrees ? "AD" : "AR", + depth, rowsPerWI, angleInDegrees ? "AD" : "AR", doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; @@ -553,7 +557,7 @@ static bool ocl_cartToPolar( InputArray _src1, InputArray _src2, ocl::KernelArg::WriteOnly(dst1, cn), ocl::KernelArg::WriteOnlyNoSize(dst2)); - size_t globalsize[2] = { dst1.cols * cn, dst1.rows }; + size_t globalsize[2] = { dst1.cols * cn, (dst1.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } @@ -713,16 +717,18 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval, static bool ocl_polarToCart( InputArray _mag, InputArray _angle, OutputArray _dst1, OutputArray _dst2, bool angleInDegrees ) { - int type = _angle.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + const ocl::Device & d = ocl::Device::getDefault(); + int type = _angle.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), + rowsPerWI = d.isIntel() ? 4 : 1; + bool doubleSupport = d.doubleFPConfig() > 0; if ( !doubleSupport && depth == CV_64F ) return false; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D dstT=%s -D depth=%d -D BINARY_OP -D OP_PTC_%s%s", - ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), depth, - angleInDegrees ? "AD" : "AR", + format("-D dstT=%s -D rowsPerWI=%d -D depth=%d -D BINARY_OP -D OP_PTC_%s%s", + ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), rowsPerWI, + depth, angleInDegrees ? "AD" : "AR", doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; @@ -738,7 +744,7 @@ static bool ocl_polarToCart( InputArray _mag, InputArray _angle, k.args(ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::ReadOnlyNoSize(angle), ocl::KernelArg::WriteOnly(dst1, cn), ocl::KernelArg::WriteOnlyNoSize(dst2)); - size_t globalsize[2] = { dst1.cols * cn, dst1.rows }; + size_t globalsize[2] = { dst1.cols * cn, (dst1.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } @@ -2103,8 +2109,10 @@ static IPowFunc ipowTab[] = static bool ocl_pow(InputArray _src, double power, OutputArray _dst, bool is_ipower, int ipower) { - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + const ocl::Device & d = ocl::Device::getDefault(); + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), + rowsPerWI = d.isIntel() ? 4 : 1; + bool doubleSupport = d.doubleFPConfig() > 0; if (depth == CV_64F && !doubleSupport) return false; @@ -2113,8 +2121,8 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst, const char * const op = issqrt ? "OP_SQRT" : is_ipower ? "OP_POWN" : "OP_POW"; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D dstT=%s -D %s -D UNARY_OP%s", ocl::typeToStr(depth), - op, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + format("-D dstT=%s -D rowsPerWI=%d -D %s -D UNARY_OP%s", ocl::typeToStr(depth), + rowsPerWI, op, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; @@ -2137,7 +2145,7 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst, k.args(srcarg, dstarg, power); } - size_t globalsize[2] = { dst.cols * cn, dst.rows }; + size_t globalsize[2] = { dst.cols * cn, (dst.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } @@ -2491,8 +2499,10 @@ bool checkRange(InputArray _src, bool quiet, Point* pt, double minVal, double ma static bool ocl_patchNaNs( InputOutputArray _a, float value ) { + int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=int")); + format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=int -D rowsPerWI=%d", + rowsPerWI)); if (k.empty()) return false; @@ -2502,7 +2512,7 @@ static bool ocl_patchNaNs( InputOutputArray _a, float value ) k.args(ocl::KernelArg::ReadOnlyNoSize(a), ocl::KernelArg::WriteOnly(a, cn), (float)value); - size_t globalsize[2] = { a.cols * cn, a.rows }; + size_t globalsize[2] = { a.cols * cn, (a.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index bf1428e..30ce75b 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -2153,9 +2153,10 @@ typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, i static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type ) { + const ocl::Device & d = ocl::Device::getDefault(); int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F), - kercn = ocl::predictOptimalVectorWidth(_src1, _src2, _dst); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + kercn = ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = d.isIntel() ? 4 : 1; + bool doubleSupport = d.doubleFPConfig() > 0; Size size = _src1.size(); if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() ) @@ -2164,13 +2165,14 @@ static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, Outp char cvt[2][50]; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s" - " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s -D wdepth=%d%s", + " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s" + " -D wdepth=%d%s -D rowsPerWI=%d", ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]), ocl::typeToStr(wdepth), wdepth, - doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI)); if (k.empty()) return false; @@ -2187,7 +2189,7 @@ static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, Outp else k.args(src1arg, src2arg, dstarg, alpha); - size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows }; + size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl index 5faf7de..1f15d76 100644 --- a/modules/core/src/opencl/arithm.cl +++ b/modules/core/src/opencl/arithm.cl @@ -145,6 +145,7 @@ #define EXTRA_PARAMS #define EXTRA_INDEX +#define EXTRA_INDEX_ADD #if defined OP_ADD #define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2)) @@ -363,7 +364,9 @@ #undef EXTRA_PARAMS #define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2 #undef EXTRA_INDEX - #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2)) + #define EXTRA_INDEX int dst_index2 = mad24(y0, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2)) + #undef EXTRA_INDEX_ADD + #define EXTRA_INDEX_ADD dst_index2 += dststep2 #endif #if defined UNARY_OP || defined MASK_UNARY_OP @@ -393,18 +396,25 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, int rows, int cols EXTRA_PARAMS ) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1) * rowsPerWI; - if (x < cols && y < rows) + if (x < cols) { - int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); + int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); #if !(defined(OP_RECIP_SCALE) || defined(OP_NOT)) - int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2)); + int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2)); #endif - int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); + int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); EXTRA_INDEX; - PROCESS_ELEM; + for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep) + { + PROCESS_ELEM; +#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT)) + src2_index += srcstep2; +#endif + EXTRA_INDEX_ADD; + } } } @@ -417,19 +427,21 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, int rows, int cols EXTRA_PARAMS ) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1) * rowsPerWI; - if (x < cols && y < rows) + if (x < cols) { - int mask_index = mad24(y, maskstep, x + maskoffset); - if( mask[mask_index] ) - { - int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); - int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2)); - int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); - - PROCESS_ELEM; - } + int mask_index = mad24(y0, maskstep, x + maskoffset); + int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); + int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2)); + int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); + + for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, src2_index += srcstep2, + mask_index += maskstep, dst_index += dststep) + if (mask[mask_index]) + { + PROCESS_ELEM; + } } } @@ -440,14 +452,17 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, int rows, int cols EXTRA_PARAMS ) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1) * rowsPerWI; - if (x < cols && y < rows) + if (x < cols) { - int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); - int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); + int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); + int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); - PROCESS_ELEM; + for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep) + { + PROCESS_ELEM; + } } } @@ -459,18 +474,19 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, int rows, int cols EXTRA_PARAMS ) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1); - if (x < cols && y < rows) + if (x < cols) { - int mask_index = mad24(y, maskstep, x + maskoffset); - if( mask[mask_index] ) - { - int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); - int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); - - PROCESS_ELEM; - } + int mask_index = mad24(y0, maskstep, x + maskoffset); + int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); + int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); + + for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, mask_index += maskstep, dst_index += dststep) + if (mask[mask_index]) + { + PROCESS_ELEM; + } } } diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 0584496..8877fb5 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -1973,8 +1973,9 @@ static NormDiffFunc getNormDiffFunc(int normType, int depth) static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result ) { + const ocl::Device & d = ocl::Device::getDefault(); int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, + bool doubleSupport = d.doubleFPConfig() > 0, haveMask = _mask.kind() != _InputArray::NONE; if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) || @@ -1991,13 +1992,14 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & if (depth != CV_8U && depth != CV_16U) { - int wdepth = std::max(CV_32S, depth); + int wdepth = std::max(CV_32S, depth), rowsPerWI = d.isIntel() ? 4 : 1; char cvt[50]; ocl::Kernel kabs("KF", ocl::core::arithm_oclsrc, - format("-D UNARY_OP -D OP_ABS_NOSAT -D dstT=%s -D srcT1=%s -D convertToDT=%s%s", + format("-D UNARY_OP -D OP_ABS_NOSAT -D dstT=%s -D srcT1=%s" + " -D convertToDT=%s -D rowsPerWI=%d%s", ocl::typeToStr(wdepth), ocl::typeToStr(depth), - ocl::convertTypeStr(depth, wdepth, 1, cvt), + ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (kabs.empty()) return false; @@ -2005,7 +2007,7 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & abssrc.create(src.size(), CV_MAKE_TYPE(wdepth, cn)); kabs.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(abssrc, cn)); - size_t globalsize[2] = { src.cols * cn, src.rows }; + size_t globalsize[2] = { src.cols * cn, (src.rows + rowsPerWI - 1) / rowsPerWI }; if (!kabs.run(2, globalsize, NULL, false)) return false; } @@ -2016,8 +2018,8 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & } else { - int dbsize = ocl::Device::getDefault().maxComputeUnits(); - size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); + int dbsize = d.maxComputeUnits(); + size_t wgs = d.maxWorkGroupSize(); int wgs2_aligned = 1; while (wgs2_aligned < (int)wgs) @@ -2384,8 +2386,9 @@ namespace cv { static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result ) { - int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + const ocl::Device & d = ocl::Device::getDefault(); + int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), rowsPerWI = d.isIntel() ? 4 : 1; + bool doubleSupport = d.doubleFPConfig() > 0; bool relative = (normType & NORM_RELATIVE) != 0; normType &= ~NORM_RELATIVE; @@ -2397,9 +2400,9 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr char cvt[50]; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, format("-D BINARY_OP -D OP_ABSDIFF -D dstT=%s -D workT=dstT -D srcT1=%s -D srcT2=srcT1" - " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT%s", + " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT -D rowsPerWI=%d%s", ocl::typeToStr(wdepth), ocl::typeToStr(depth), - ocl::convertTypeStr(depth, wdepth, 1, cvt), + ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; @@ -2408,7 +2411,7 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2), ocl::KernelArg::WriteOnly(diff, cn)); - size_t globalsize[2] = { diff.cols * cn, diff.rows }; + size_t globalsize[2] = { diff.cols * cn, (diff.rows + rowsPerWI - 1) / rowsPerWI }; if (!k.run(2, globalsize, NULL, false)) return false; -- 2.7.4