From fe38aab84abe57988bff7a5d6a2699c4d731c8ab Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 25 Feb 2014 00:29:17 +0400 Subject: [PATCH] core tapi optimization --- .../core/include/opencv2/core/opencl/ocl_defs.hpp | 2 +- modules/core/src/arithm.cpp | 4 +- modules/core/src/convert.cpp | 15 +-- modules/core/src/mathfuncs.cpp | 8 +- modules/core/src/matmul.cpp | 4 +- modules/core/src/opencl/arithm.cl | 114 +++++++++++++-------- modules/core/src/opencl/convert.cl | 8 +- modules/core/src/opencl/copymakeborder.cl | 14 +-- modules/core/src/opencl/copyset.cl | 14 +-- modules/core/src/opencl/flip.cl | 26 ++--- modules/core/src/opencl/inrange.cl | 9 +- modules/core/src/opencl/lut.cl | 14 +-- modules/core/src/opencl/mixchannels.cl | 4 +- modules/core/src/opencl/mulspectrums.cl | 8 +- modules/core/src/opencl/reduce.cl | 33 ++++-- modules/core/src/opencl/reduce2.cl | 2 +- modules/core/src/opencl/set_identity.cl | 2 +- modules/core/src/opencl/split_merge.cl | 8 +- modules/core/src/opencl/transpose.cl | 22 ++-- modules/core/src/stat.cpp | 4 +- modules/core/src/umatrix.cpp | 22 ++-- 21 files changed, 191 insertions(+), 146 deletions(-) diff --git a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp index 4acfa7a..55f8849 100644 --- a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp +++ b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp @@ -5,7 +5,7 @@ // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. -//#define CV_OPENCL_RUN_VERBOSE +//#define CV_OPENCL_RUN_ASSERT #ifdef HAVE_OPENCL diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index a837452..623f9d0 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -1318,7 +1318,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, char cvtstr[4][32], opts[1024]; sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " - "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D convertToWT1=%s " + "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s " "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d", (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), @@ -1329,7 +1329,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, ocl::typeToStr(CV_MAKETYPE(ddepth, 1)), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)), ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)), - ocl::typeToStr(CV_MAKETYPE(wdepth, 1)), + ocl::typeToStr(CV_MAKETYPE(wdepth, 1)), wdepth, ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index e64d099..c314823 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1320,8 +1320,8 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha int wdepth = std::max(depth, CV_32F); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=uchar -D srcT1=%s" - " -D workT=%s -D convertToWT1=%s -D convertToDT=%s%s", - ocl::typeToStr(depth), ocl::typeToStr(wdepth), + " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s%s", + ocl::typeToStr(depth), ocl::typeToStr(wdepth), wdepth, ocl::convertTypeStr(depth, wdepth, 1, cvt[0]), ocl::convertTypeStr(wdepth, CV_8U, 1, cvt[1]), doubleSupport ? " -D DOUBLE_SUPPORT" : "")); @@ -1492,19 +1492,14 @@ static LUTFunc lutTab[] = static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst) { int dtype = _dst.type(), lcn = _lut.channels(), dcn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - - if (_src.dims() > 2 || (!doubleSupport && ddepth == CV_64F)) - return false; UMat src = _src.getUMat(), lut = _lut.getUMat(); _dst.create(src.size(), dtype); UMat dst = _dst.getUMat(); ocl::Kernel k("LUT", ocl::core::lut_oclsrc, - format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s%s", dcn, lcn, - ocl::typeToStr(src.depth()), ocl::typeToStr(ddepth), - doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", dcn, lcn, + ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth))); if (k.empty()) return false; @@ -1528,7 +1523,7 @@ void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst ) _lut.total() == 256 && _lut.isContinuous() && (depth == CV_8U || depth == CV_8S) ); - CV_OCL_RUN(_dst.isUMat(), + CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, ocl_LUT(_src, _lut, _dst)) Mat src = _src.getMat(), lut = _lut.getMat(); diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index f81e835..d7fad62 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -508,9 +508,9 @@ static bool ocl_cartToPolar( InputArray _src1, InputArray _src2, return false; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D BINARY_OP -D dstT=%s -D OP_CTP_%s%s", + format("-D BINARY_OP -D dstT=%s -D depth=%d -D OP_CTP_%s%s", ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), - angleInDegrees ? "AD" : "AR", + depth, angleInDegrees ? "AD" : "AR", doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; @@ -695,8 +695,8 @@ static bool ocl_polarToCart( InputArray _mag, InputArray _angle, return false; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D dstT=%s -D BINARY_OP -D OP_PTC_%s%s", - ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), + format("-D dstT=%s -D depth=%d -D BINARY_OP -D OP_PTC_%s%s", + ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), depth, angleInDegrees ? "AD" : "AR", doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index c6dde65..022f88e 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -2166,9 +2166,9 @@ static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, Outp char cvt[2][50]; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s" + format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D wdepth=%d -D convertToWT1=%s" " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s%s", ocl::typeToStr(depth), - ocl::typeToStr(wdepth), ocl::convertTypeStr(depth, wdepth, 1, cvt[0]), + ocl::typeToStr(wdepth), wdepth, ocl::convertTypeStr(depth, wdepth, 1, cvt[0]), ocl::convertTypeStr(wdepth, depth, 1, cvt[1]), doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl index a7dacc4..bac72d3 100644 --- a/modules/core/src/opencl/arithm.cl +++ b/modules/core/src/opencl/arithm.cl @@ -63,11 +63,12 @@ #elif defined cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif -#define CV_EPSILON DBL_EPSILON -#define CV_PI M_PI -#else -#define CV_EPSILON FLT_EPSILON +#endif + +#if depth <= 5 #define CV_PI M_PI_F +#else +#define CV_PI M_PI #endif #ifndef cn @@ -84,11 +85,11 @@ #endif #if cn != 3 - #define storedst(val) *(__global dstT*)(dstptr + dst_index) = val - #define storedst2(val) *(__global dstT*)(dstptr2 + dst_index2) = val + #define storedst(val) *(__global dstT *)(dstptr + dst_index) = val + #define storedst2(val) *(__global dstT *)(dstptr2 + dst_index2) = val #else - #define storedst(val) vstore3(val, 0, (__global dstT_C1*)(dstptr + dst_index)) - #define storedst2(val) vstore3(val, 0, (__global dstT_C1*)(dstptr2 + dst_index2)) + #define storedst(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr + dst_index)) + #define storedst2(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr2 + dst_index2)) #endif #define noconvert @@ -97,19 +98,27 @@ #ifndef srcT1 #define srcT1 dstT + #endif + + #ifndef srcT1_C1 #define srcT1_C1 dstT_C1 #endif + #ifndef srcT2 #define srcT2 dstT + #endif + + #ifndef srcT2_C1 #define srcT2_C1 dstT_C1 #endif + #define workT dstT #if cn != 3 - #define srcelem1 *(__global srcT1*)(srcptr1 + src1_index) - #define srcelem2 *(__global srcT2*)(srcptr2 + src2_index) + #define srcelem1 *(__global srcT1 *)(srcptr1 + src1_index) + #define srcelem2 *(__global srcT2 *)(srcptr2 + src2_index) #else - #define srcelem1 vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index)) - #define srcelem2 vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index)) + #define srcelem1 vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index)) + #define srcelem2 vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index)) #endif #ifndef convertToDT #define convertToDT noconvert @@ -121,11 +130,11 @@ #define convertToWT2 convertToWT1 #endif #if cn != 3 - #define srcelem1 convertToWT1(*(__global srcT1*)(srcptr1 + src1_index)) - #define srcelem2 convertToWT2(*(__global srcT2*)(srcptr2 + src2_index)) + #define srcelem1 convertToWT1(*(__global srcT1 *)(srcptr1 + src1_index)) + #define srcelem2 convertToWT2(*(__global srcT2 *)(srcptr2 + src2_index)) #else - #define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index))) - #define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index))) + #define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index))) + #define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index))) #endif #endif @@ -224,7 +233,11 @@ #elif defined OP_ADDW #undef EXTRA_PARAMS #define EXTRA_PARAMS , scaleT alpha, scaleT beta, scaleT gamma -#define PROCESS_ELEM storedst(convertToDT(srcelem1*alpha + srcelem2*beta + gamma)) +#if wdepth <= 4 +#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, mad24(srcelem2, beta, gamma)))) +#else +#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, alpha, mad(srcelem2, beta, gamma)))) +#endif #elif defined OP_MAG #define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2)) @@ -274,16 +287,31 @@ #elif defined OP_CONVERT_SCALE_ABS #undef EXTRA_PARAMS #define EXTRA_PARAMS , workT alpha, workT beta +#if wdepth <= 4 #define PROCESS_ELEM \ - workT value = srcelem1 * alpha + beta; \ + workT value = mad24(srcelem1, alpha, beta); \ storedst(convertToDT(value >= 0 ? value : -value)) +#else +#define PROCESS_ELEM \ + workT value = mad(srcelem1, alpha, beta); \ + storedst(convertToDT(value >= 0 ? value : -value)) +#endif #elif defined OP_SCALE_ADD #undef EXTRA_PARAMS #define EXTRA_PARAMS , workT alpha -#define PROCESS_ELEM storedst(convertToDT(srcelem1 * alpha + srcelem2)) +#if wdepth <= 4 +#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, srcelem2))) +#else +#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, alpha, srcelem2))) +#endif #elif defined OP_CTP_AD || defined OP_CTP_AR +#if depth <= 5 +#define CV_EPSILON FLT_EPSILON +#else +#define CV_EPSILON DBL_EPSILON +#endif #ifdef OP_CTP_AD #define TO_DEGREE cartToPolar *= (180 / CV_PI); #elif defined OP_CTP_AR @@ -296,7 +324,7 @@ dstT tmp = y >= 0 ? 0 : CV_PI * 2; \ tmp = x < 0 ? CV_PI : tmp; \ dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \ - dstT cartToPolar = y2 <= x2 ? x * y / (x2 + 0.28f * y2 + CV_EPSILON) + tmp : (tmp1 - x * y / (y2 + 0.28f * x2 + CV_EPSILON)); \ + dstT cartToPolar = y2 <= x2 ? x * y / mad((dstT)(0.28f), y2, x2 + CV_EPSILON) + tmp : (tmp1 - x * y / mad((dstT)(0.28f), x2, y2 + CV_EPSILON)); \ TO_DEGREE \ storedst(magnitude); \ storedst2(cartToPolar) @@ -331,7 +359,7 @@ #undef EXTRA_PARAMS #define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2 #undef EXTRA_INDEX - #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, x*(int)sizeof(dstT_C1)*cn + dstoffset2) + #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2)) #endif #if defined UNARY_OP || defined MASK_UNARY_OP @@ -355,9 +383,9 @@ #if defined BINARY_OP -__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, - __global const uchar* srcptr2, int srcstep2, int srcoffset2, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, + __global const uchar * srcptr2, int srcstep2, int srcoffset2, + __global uchar * dstptr, int dststep, int dstoffset, int rows, int cols EXTRA_PARAMS ) { int x = get_global_id(0); @@ -365,11 +393,11 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, if (x < cols && y < rows) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); + int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); #if !(defined(OP_RECIP_SCALE) || defined(OP_NOT)) - int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2); + int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2)); #endif - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); + int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); EXTRA_INDEX; PROCESS_ELEM; @@ -378,10 +406,10 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, #elif defined MASK_BINARY_OP -__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, - __global const uchar* srcptr2, int srcstep2, int srcoffset2, - __global const uchar* mask, int maskstep, int maskoffset, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, + __global const uchar * srcptr2, int srcstep2, int srcoffset2, + __global const uchar * mask, int maskstep, int maskoffset, + __global uchar * dstptr, int dststep, int dstoffset, int rows, int cols EXTRA_PARAMS ) { int x = get_global_id(0); @@ -392,9 +420,9 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, int mask_index = mad24(y, maskstep, x + maskoffset); if( mask[mask_index] ) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); - int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2); - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); + int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); + int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2)); + int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); PROCESS_ELEM; } @@ -403,8 +431,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, #elif defined UNARY_OP -__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, + __global uchar * dstptr, int dststep, int dstoffset, int rows, int cols EXTRA_PARAMS ) { int x = get_global_id(0); @@ -412,8 +440,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, if (x < cols && y < rows) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); + int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); + int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); PROCESS_ELEM; } @@ -421,9 +449,9 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, #elif defined MASK_UNARY_OP -__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, - __global const uchar* mask, int maskstep, int maskoffset, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, + __global const uchar * mask, int maskstep, int maskoffset, + __global uchar * dstptr, int dststep, int dstoffset, int rows, int cols EXTRA_PARAMS ) { int x = get_global_id(0); @@ -434,8 +462,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, int mask_index = mad24(y, maskstep, x + maskoffset); if( mask[mask_index] ) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); + int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); + int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset)); PROCESS_ELEM; } diff --git a/modules/core/src/opencl/convert.cl b/modules/core/src/opencl/convert.cl index 6c2d16c..b801409 100644 --- a/modules/core/src/opencl/convert.cl +++ b/modules/core/src/opencl/convert.cl @@ -53,19 +53,19 @@ __kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, - float alpha, float beta ) + WT alpha, WT beta) { int x = get_global_id(0); int y = get_global_id(1); if (x < dst_cols && y < dst_rows) { - int src_index = mad24(y, src_step, src_offset + x * (int)sizeof(srcT) ); - int dst_index = mad24(y, dst_step, dst_offset + x * (int)sizeof(dstT) ); + int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT), src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT), dst_offset)); __global const srcT * src = (__global const srcT *)(srcptr + src_index); __global dstT * dst = (__global dstT *)(dstptr + dst_index); - dst[0] = convertToDT( src[0] * alpha + beta ); + dst[0] = convertToDT(mad(convertToWT(src[0]), alpha, beta)); } } diff --git a/modules/core/src/opencl/copymakeborder.cl b/modules/core/src/opencl/copymakeborder.cl index bb26442..dbb00b9 100644 --- a/modules/core/src/opencl/copymakeborder.cl +++ b/modules/core/src/opencl/copymakeborder.cl @@ -47,9 +47,9 @@ #elif defined BORDER_REPLICATE #define EXTRAPOLATE(x, y, v) \ { \ - x = max(min(x, src_cols - 1), 0); \ - y = max(min(y, src_rows - 1), 0); \ - v = *(__global const T *)(srcptr + mad24(y, src_step, x * (int)sizeof(T) + src_offset)); \ + x = clamp(x, 0, src_cols - 1); \ + y = clamp(y, 0, src_rows - 1); \ + v = *(__global const T *)(srcptr + mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset))); \ } #elif defined BORDER_WRAP #define EXTRAPOLATE(x, y, v) \ @@ -63,7 +63,7 @@ y -= ((y - src_rows + 1) / src_rows) * src_rows; \ if( y >= src_rows ) \ y %= src_rows; \ - v = *(__global const T *)(srcptr + mad24(y, src_step, x * (int)sizeof(T) + src_offset)); \ + v = *(__global const T *)(srcptr + mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset))); \ } #elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) #ifdef BORDER_REFLECT @@ -97,7 +97,7 @@ y = src_rows - 1 - (y - src_rows) - delta; \ } \ while (y >= src_rows || y < 0); \ - v = *(__global const T *)(srcptr + mad24(y, src_step, x * (int)sizeof(T) + src_offset)); \ + v = *(__global const T *)(srcptr + mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset))); \ } #else #error No extrapolation method @@ -117,14 +117,14 @@ __kernel void copyMakeBorder(__global const uchar * srcptr, int src_step, int sr int src_x = x - left; int src_y = y - top; - int dst_index = mad24(y, dst_step, x * (int)sizeof(T) + dst_offset); + int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(T), dst_offset)); __global T * dst = (__global T *)(dstptr + dst_index); if (NEED_EXTRAPOLATION(src_x, src_y)) EXTRAPOLATE(src_x, src_y, dst[0]) else { - int src_index = mad24(src_y, src_step, src_x * (int)sizeof(T) + src_offset); + int src_index = mad24(src_y, src_step, mad24(src_x, (int)sizeof(T), src_offset)); __global const T * src = (__global const T *)(srcptr + src_index); dst[0] = src[0]; } diff --git a/modules/core/src/opencl/copyset.cl b/modules/core/src/opencl/copyset.cl index cbafe67..42796ea 100644 --- a/modules/core/src/opencl/copyset.cl +++ b/modules/core/src/opencl/copyset.cl @@ -44,8 +44,8 @@ #ifdef COPY_TO_MASK #define DEFINE_DATA \ - int src_index = mad24(y, src_step, x*(int)sizeof(T)*scn + src_offset); \ - int dst_index = mad24(y, dst_step, x*(int)sizeof(T)*scn + dst_offset); \ + int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T) * scn, src_offset)); \ + int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(T) * scn, dst_offset)); \ \ __global const T * src = (__global const T *)(srcptr + src_index); \ __global T * dst = (__global T *)(dstptr + dst_index) @@ -60,7 +60,7 @@ __kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_of if (x < dst_cols && y < dst_rows) { - int mask_index = mad24(y, mask_step, x * mcn + mask_offset); + int mask_index = mad24(y, mask_step, mad24(x, mcn, mask_offset)); __global const uchar * mask = (__global const uchar *)(maskptr + mask_index); #if mcn == 1 @@ -93,10 +93,10 @@ __kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_of #if cn != 3 #define value value_ -#define storedst(val) *(__global dstT*)(dstptr + dst_index) = val +#define storedst(val) *(__global dstT *)(dstptr + dst_index) = val #else #define value (dstT)(value_.x, value_.y, value_.z) -#define storedst(val) vstore3(val, 0, (__global dstT1*)(dstptr + dst_index)) +#define storedst(val) vstore3(val, 0, (__global dstT1 *)(dstptr + dst_index)) #endif __kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset, @@ -111,7 +111,7 @@ __kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset, int mask_index = mad24(y, maskstep, x + maskoffset); if( mask[mask_index] ) { - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset); + int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT1) * cn, dstoffset)); storedst(value); } } @@ -125,7 +125,7 @@ __kernel void set(__global uchar* dstptr, int dststep, int dstoffset, if (x < cols && y < rows) { - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset); + int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT1) * cn, dstoffset)); storedst(value); } } diff --git a/modules/core/src/opencl/flip.cl b/modules/core/src/opencl/flip.cl index 4e53041..0c874db 100644 --- a/modules/core/src/opencl/flip.cl +++ b/modules/core/src/opencl/flip.cl @@ -50,11 +50,11 @@ __kernel void arithm_flip_rows(__global const uchar* srcptr, int srcstep, int sr if (x < cols && y < thread_rows) { - __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, srcoffset + x * sizeoftype)); - __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, srcoffset + x * sizeoftype)); + __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset))); + __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x, sizeoftype, srcoffset))); - __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, dstoffset + x * sizeoftype)); - __global type* dst1 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, dstoffset + x * sizeoftype)); + __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset))); + __global type* dst1 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x, sizeoftype, dstoffset))); dst0[0] = src1[0]; dst1[0] = src0[0]; @@ -70,11 +70,12 @@ __kernel void arithm_flip_rows_cols(__global const uchar* srcptr, int srcstep, i if (x < cols && y < thread_rows) { - __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, x*sizeoftype + srcoffset)); - __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, (cols - x - 1)*sizeoftype + srcoffset)); + int x1 = cols - x - 1; + __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset))); + __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x1, sizeoftype, srcoffset))); - __global type* dst0 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, (cols - x - 1)*sizeoftype + dstoffset)); - __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, x * sizeoftype + dstoffset)); + __global type* dst0 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x1, sizeoftype, dstoffset))); + __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset))); dst0[0] = src0[0]; dst1[0] = src1[0]; @@ -90,11 +91,12 @@ __kernel void arithm_flip_cols(__global const uchar* srcptr, int srcstep, int sr if (x < thread_cols && y < rows) { - __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, x * sizeoftype + srcoffset)); - __global const type* src1 = (__global const type*)(srcptr + mad24(y, srcstep, (cols - x - 1)*sizeoftype + srcoffset)); + int x1 = cols - x - 1; + __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset))); + __global const type* src1 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x1, sizeoftype, srcoffset))); - __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, (cols - x - 1)*sizeoftype + dstoffset)); - __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, x * sizeoftype + dstoffset)); + __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x1, sizeoftype, dstoffset))); + __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset))); dst1[0] = src1[0]; dst0[0] = src0[0]; diff --git a/modules/core/src/opencl/inrange.cl b/modules/core/src/opencl/inrange.cl index 7549cf3..b113859 100644 --- a/modules/core/src/opencl/inrange.cl +++ b/modules/core/src/opencl/inrange.cl @@ -64,23 +64,22 @@ __kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_of if (x < dst_cols && y < dst_rows) { - int src1_index = mad24(y, src1_step, x*(int)sizeof(T)*cn + src1_offset); + int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset)); int dst_index = mad24(y, dst_step, x + dst_offset); __global const T * src1 = (__global const T *)(src1ptr + src1_index); __global uchar * dst = dstptr + dst_index; #ifndef HAVE_SCALAR - int src2_index = mad24(y, src2_step, x*(int)sizeof(T)*cn + src2_offset); - int src3_index = mad24(y, src3_step, x*(int)sizeof(T)*cn + src3_offset); + int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset)); + int src3_index = mad24(y, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset)); __global const T * src2 = (__global const T *)(src2ptr + src2_index); __global const T * src3 = (__global const T *)(src3ptr + src3_index); #endif dst[0] = 255; - #pragma unroll for (int c = 0; c < cn; ++c) - if ( src2[c] > src1[c] || src3[c] < src1[c] ) + if (src2[c] > src1[c] || src3[c] < src1[c]) { dst[0] = 0; break; diff --git a/modules/core/src/opencl/lut.cl b/modules/core/src/opencl/lut.cl index 8d58742..da92c2f 100644 --- a/modules/core/src/opencl/lut.cl +++ b/modules/core/src/opencl/lut.cl @@ -34,14 +34,6 @@ // // -#ifdef DOUBLE_SUPPORT -#ifdef cl_amd_fp64 -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#elif defined (cl_khr_fp64) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif -#endif - __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset, __global const uchar * lutptr, int lut_step, int lut_offset, __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols) @@ -51,8 +43,8 @@ __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset, if (x < cols && y < rows) { - int src_index = mad24(y, src_step, src_offset + x * (int)sizeof(srcT) * dcn); - int dst_index = mad24(y, dst_step, dst_offset + x * (int)sizeof(dstT) * dcn); + int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset)); __global const srcT * src = (__global const srcT *)(srcptr + src_index); __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset); @@ -65,7 +57,7 @@ __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset, #else #pragma unroll for (int cn = 0; cn < dcn; ++cn) - dst[cn] = lut[src[cn] * dcn + cn]; + dst[cn] = lut[mad24(src[cn], dcn, cn)]; #endif } } diff --git a/modules/core/src/opencl/mixchannels.cl b/modules/core/src/opencl/mixchannels.cl index 7abd60a..bede20c 100644 --- a/modules/core/src/opencl/mixchannels.cl +++ b/modules/core/src/opencl/mixchannels.cl @@ -46,9 +46,9 @@ #define DECLARE_OUTPUT_MAT(i) \ __global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset, #define PROCESS_ELEM(i) \ - int src##i##_index = mad24(src##i##_step, y, x * (int)sizeof(T) * scn##i + src##i##_offset); \ + int src##i##_index = mad24(src##i##_step, y, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \ __global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \ - int dst##i##_index = mad24(dst##i##_step, y, x * (int)sizeof(T) * dcn##i + dst##i##_offset); \ + int dst##i##_index = mad24(dst##i##_step, y, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset)); \ __global T * dst##i = (__global T *)(dst##i##ptr + dst##i##_index); \ dst##i[0] = src##i[0]; diff --git a/modules/core/src/opencl/mulspectrums.cl b/modules/core/src/opencl/mulspectrums.cl index 248ff00..817331e 100644 --- a/modules/core/src/opencl/mulspectrums.cl +++ b/modules/core/src/opencl/mulspectrums.cl @@ -45,7 +45,7 @@ inline float2 cmulf(float2 a, float2 b) { - return (float2)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x); + return (float2)(mad(a.x, b.x, - a.y * b.y), mad(a.x, b.y, a.y * b.x)); } inline float2 conjf(float2 a) @@ -63,9 +63,9 @@ __kernel void mulAndScaleSpectrums(__global const uchar * src1ptr, int src1_step if (x < dst_cols && y < dst_rows) { - int src1_index = mad24(y, src1_step, x * (int)sizeof(float2) + src1_offset); - int src2_index = mad24(y, src2_step, x * (int)sizeof(float2) + src2_offset); - int dst_index = mad24(y, dst_step, x * (int)sizeof(float2) + dst_offset); + int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(float2), src1_offset)); + int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(float2), src2_offset)); + int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(float2), dst_offset)); float2 src0 = *(__global const float2 *)(src1ptr + src1_index); float2 src1 = *(__global const float2 *)(src2ptr + src2_index); diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index febc1cb..0a0538e 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -58,20 +58,34 @@ #define EXTRA_PARAMS #endif +// accumulative reduction stuff #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT #ifdef OP_DOT -#define FUNC(a, b, c) a += b * c +#if ddepth <= 4 +#define FUNC(a, b, c) a = mad24(b, c, a) +#else +#define FUNC(a, b, c) a = mad(b, c, a) +#endif + #elif defined OP_SUM #define FUNC(a, b) a += b + #elif defined OP_SUM_ABS #define FUNC(a, b) a += b >= (dstT)(0) ? b : -b + #elif defined OP_SUM_SQR -#define FUNC(a, b) a += b * b +#if ddepth <= 4 +#define FUNC(a, b) a = mad24(b, b, a) +#else +#define FUNC(a, b) a = mad(b, b, a) +#endif #endif + #define DECLARE_LOCAL_MEM \ __local dstT localmem[WGS2_ALIGNED] #define DEFINE_ACCUMULATOR \ dstT accumulator = (dstT)(0) + #ifdef HAVE_MASK #define REDUCE_GLOBAL \ dstT temp = convertToDT(src[0]); \ @@ -80,7 +94,7 @@ FUNC(accumulator, temp) #elif defined OP_DOT #define REDUCE_GLOBAL \ - int src2_index = mad24(id / cols, src2_step, src2_offset + (id % cols) * (int)sizeof(srcT)); \ + int src2_index = mad24(id / cols, src2_step, mad24(id % cols, (int)sizeof(srcT), src2_offset)); \ __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index); \ dstT temp = convertToDT(src[0]), temp2 = convertToDT(src2[0]); \ FUNC(accumulator, temp, temp2) @@ -89,6 +103,7 @@ dstT temp = convertToDT(src[0]); \ FUNC(accumulator, temp) #endif + #define SET_LOCAL_1 \ localmem[lid] = accumulator #define REDUCE_LOCAL_1 \ @@ -99,6 +114,7 @@ __global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \ dst[0] = localmem[0] +// countNonZero stuff #elif defined OP_COUNT_NON_ZERO #define dstT int #define DECLARE_LOCAL_MEM \ @@ -118,6 +134,7 @@ __global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \ dst[0] = localmem[0] +// minMaxLoc stuff #elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK #ifdef DEPTH_0 @@ -179,8 +196,8 @@ #define REDUCE_LOCAL_1 \ srcT oldmin = localmem_min[lid-WGS2_ALIGNED]; \ srcT oldmax = localmem_max[lid-WGS2_ALIGNED]; \ - localmem_min[lid - WGS2_ALIGNED] = min(minval,localmem_min[lid-WGS2_ALIGNED]); \ - localmem_max[lid - WGS2_ALIGNED] = max(maxval,localmem_max[lid-WGS2_ALIGNED]); \ + localmem_min[lid - WGS2_ALIGNED] = min(minval, localmem_min[lid-WGS2_ALIGNED]); \ + localmem_max[lid - WGS2_ALIGNED] = max(maxval, localmem_max[lid-WGS2_ALIGNED]); \ srcT minv = localmem_min[lid - WGS2_ALIGNED], maxv = localmem_max[lid - WGS2_ALIGNED]; \ localmem_minloc[lid - WGS2_ALIGNED] = (minv == minval) ? (minv == oldmin) ? \ min(minloc, localmem_minloc[lid-WGS2_ALIGNED]) : minloc : localmem_minloc[lid-WGS2_ALIGNED]; \ @@ -233,15 +250,17 @@ #else #error "No operation" -#endif +#endif // end of minMaxLoc stuff #ifdef OP_MIN_MAX_LOC #undef EXTRA_PARAMS #define EXTRA_PARAMS , __global uchar * dstptr2, __global int * dstlocptr, __global int * dstlocptr2 + #elif defined OP_MIN_MAX_LOC_MASK #undef EXTRA_PARAMS #define EXTRA_PARAMS , __global uchar * dstptr2, __global int * dstlocptr, __global int * dstlocptr2, \ __global const uchar * maskptr, int mask_step, int mask_offset + #elif defined OP_DOT #undef EXTRA_PARAMS #define EXTRA_PARAMS , __global uchar * src2ptr, int src2_step, int src2_offset @@ -259,7 +278,7 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset for (int grain = groupnum * WGS; id < total; id += grain) { - int src_index = mad24(id / cols, src_step, src_offset + (id % cols) * (int)sizeof(srcT)); + int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(srcT), src_offset)); __global const srcT * src = (__global const srcT *)(srcptr + src_index); REDUCE_GLOBAL; } diff --git a/modules/core/src/opencl/reduce2.cl b/modules/core/src/opencl/reduce2.cl index f8ff6a2..ef6a860 100644 --- a/modules/core/src/opencl/reduce2.cl +++ b/modules/core/src/opencl/reduce2.cl @@ -98,7 +98,7 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset int x = get_global_id(0); if (x < cols) { - int src_index = x * (int)sizeof(srcT) * cn + src_offset; + int src_index = mad24(x, (int)sizeof(srcT) * cn, src_offset); __global dstT * dst = (__global dstT *)(dstptr + dst_offset) + x * cn; dstT tmp[cn] = { INIT_VALUE }; diff --git a/modules/core/src/opencl/set_identity.cl b/modules/core/src/opencl/set_identity.cl index de8caaf..d63ce79 100644 --- a/modules/core/src/opencl/set_identity.cl +++ b/modules/core/src/opencl/set_identity.cl @@ -51,7 +51,7 @@ __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, if (x < cols && y < rows) { - int src_index = mad24(y, src_step, src_offset + x * (int)sizeof(T)); + int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset)); __global T * src = (__global T *)(srcptr + src_index); src[0] = x == y ? scalar : (T)(0); diff --git a/modules/core/src/opencl/split_merge.cl b/modules/core/src/opencl/split_merge.cl index d246275..8a1bc49 100644 --- a/modules/core/src/opencl/split_merge.cl +++ b/modules/core/src/opencl/split_merge.cl @@ -45,7 +45,7 @@ #define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset, #define DECLARE_DATA(index) __global const T * src##index = \ - (__global T *)(src##index##ptr + mad24(src##index##_step, y, x * (int)sizeof(T) + src##index##_offset)); + (__global T *)(src##index##ptr + mad24(src##index##_step, y, mad24(x, (int)sizeof(T), src##index##_offset))); #define PROCESS_ELEM(index) dst[index] = src##index[0]; __kernel void merge(DECLARE_SRC_PARAMS_N @@ -58,7 +58,7 @@ __kernel void merge(DECLARE_SRC_PARAMS_N if (x < cols && y < rows) { DECLARE_DATA_N - __global T * dst = (__global T *)(dstptr + mad24(dst_step, y, x * (int)sizeof(T) * cn + dst_offset)); + __global T * dst = (__global T *)(dstptr + mad24(dst_step, y, mad24(x, (int)sizeof(T) * cn, dst_offset))); PROCESS_ELEMS_N } } @@ -67,7 +67,7 @@ __kernel void merge(DECLARE_SRC_PARAMS_N #define DECLARE_DST_PARAM(index) , __global uchar * dst##index##ptr, int dst##index##_step, int dst##index##_offset #define DECLARE_DATA(index) __global T * dst##index = \ - (__global T *)(dst##index##ptr + mad24(y, dst##index##_step, x * (int)sizeof(T) + dst##index##_offset)); + (__global T *)(dst##index##ptr + mad24(y, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset))); #define PROCESS_ELEM(index) dst##index[0] = src[index]; __kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS) @@ -78,7 +78,7 @@ __kernel void split(__global uchar* srcptr, int src_step, int src_offset, int ro if (x < cols && y < rows) { DECLARE_DATA_N - __global const T * src = (__global const T *)(srcptr + mad24(y, src_step, x * cn * (int)sizeof(T) + src_offset)); + __global const T * src = (__global const T *)(srcptr + mad24(y, src_step, mad24(x, cn * (int)sizeof(T), src_offset))); PROCESS_ELEMS_N } } diff --git a/modules/core/src/opencl/transpose.cl b/modules/core/src/opencl/transpose.cl index da9608c..575cdab 100644 --- a/modules/core/src/opencl/transpose.cl +++ b/modules/core/src/opencl/transpose.cl @@ -60,7 +60,7 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off } else { - int bid = gp_x + gs_x * gp_y; + int bid = mad24(gs_x, gp_y, gp_x); groupId_y = bid % gs_y; groupId_x = ((bid / gs_y) + groupId_y) % gs_x; } @@ -68,23 +68,23 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off int lx = get_local_id(0); int ly = get_local_id(1); - int x = groupId_x * TILE_DIM + lx; - int y = groupId_y * TILE_DIM + ly; + int x = mad24(groupId_x, TILE_DIM, lx); + int y = mad24(groupId_y, TILE_DIM, ly); - int x_index = groupId_y * TILE_DIM + lx; - int y_index = groupId_x * TILE_DIM + ly; + int x_index = mad24(groupId_y, TILE_DIM, lx); + int y_index = mad24(groupId_x, TILE_DIM, ly); __local T title[TILE_DIM * LDS_STEP]; if (x < src_cols && y < src_rows) { - int index_src = mad24(y, src_step, x * (int)sizeof(T) + src_offset); + int index_src = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset)); for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) if (y + i < src_rows) { __global const T * src = (__global const T *)(srcptr + index_src); - title[(ly + i) * LDS_STEP + lx] = src[0]; + title[mad24(ly + i, LDS_STEP, lx)] = src[0]; index_src = mad24(BLOCK_ROWS, src_step, index_src); } } @@ -92,13 +92,13 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off if (x_index < src_rows && y_index < src_cols) { - int index_dst = mad24(y_index, dst_step, x_index * (int)sizeof(T) + dst_offset); + int index_dst = mad24(y_index, dst_step, mad24(x_index, (int)sizeof(T), dst_offset)); for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) if ((y_index + i) < src_cols) { __global T * dst = (__global T *)(dstptr + index_dst); - dst[0] = title[lx * LDS_STEP + ly + i]; + dst[0] = title[mad24(lx, LDS_STEP, ly + i)]; index_dst = mad24(BLOCK_ROWS, dst_step, index_dst); } } @@ -111,8 +111,8 @@ __kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_o if (y < src_rows && x < y) { - int src_index = mad24(y, src_step, src_offset + x * (int)sizeof(T)); - int dst_index = mad24(x, src_step, src_offset + y * (int)sizeof(T)); + int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset)); + int dst_index = mad24(x, src_step, mad24(y, (int)sizeof(T), src_offset)); __global T * src = (__global T *)(srcptr + src_index); __global T * dst = (__global T *)(srcptr + dst_index); diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 2830bd1..ad07f93 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -494,8 +494,8 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" }; char cvt[40]; ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D dstT=%s -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s", - ocl::typeToStr(type), ocl::typeToStr(dtype), ocl::convertTypeStr(depth, ddepth, cn, cvt), + format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s", + ocl::typeToStr(type), ocl::typeToStr(dtype), ddepth, ocl::convertTypeStr(depth, ddepth, cn, cvt), opMap[sum_op], (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", haveMask ? " -D HAVE_MASK" : "")); diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 578fefb..7ace751 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -719,10 +719,14 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() && ((needDouble && doubleSupport) || !needDouble) ) { - char cvt[40]; + int wdepth = std::max(CV_32F, sdepth); + + char cvt[2][40]; ocl::Kernel k("convertTo", ocl::core::convert_oclsrc, - format("-D srcT=%s -D dstT=%s -D convertToDT=%s%s", ocl::typeToStr(sdepth), - ocl::typeToStr(ddepth), ocl::convertTypeStr(CV_32F, ddepth, 1, cvt), + format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s%s", + ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth), + ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0]), + ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1]), doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (!k.empty()) { @@ -731,7 +735,13 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con UMat dst = _dst.getUMat(); float alphaf = (float)alpha, betaf = (float)beta; - k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alphaf, betaf); + ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), + dstarg = ocl::KernelArg::WriteOnly(dst, cn); + + if (wdepth == CV_32F) + k.args(srcarg, dstarg, alphaf, betaf); + else + k.args(srcarg, dstarg, alpha, beta); size_t globalsize[2] = { dst.cols * cn, dst.rows }; if (k.run(2, globalsize, NULL, false)) @@ -838,8 +848,8 @@ static bool ocl_dot( InputArray _src1, InputArray _src2, double & res ) char cvt[40]; ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D dstT=%s -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s", - ocl::typeToStr(depth), ocl::typeToStr(ddepth), ocl::convertTypeStr(depth, ddepth, 1, cvt), + format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s", + ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth, ocl::convertTypeStr(depth, ddepth, 1, cvt), (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; -- 2.7.4