From: Ilya Lavrenov Date: Sat, 7 Jun 2014 11:51:41 +0000 (+0400) Subject: optimized cv::norm with 2 args X-Git-Tag: submit/tizen_ivi/20141117.190038~2^2~348^2~3 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2040995801b0c685c75c14fcd11cfaa974fc5b9c;p=profile%2Fivi%2Fopencv.git optimized cv::norm with 2 args --- diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl index 2e48387..e3d87b0 100644 --- a/modules/core/src/opencl/minmaxloc.cl +++ b/modules/core/src/opencl/minmaxloc.cl @@ -73,15 +73,27 @@ #define CALC_MAX(p, inc) #endif +#ifdef OP_CALC2 +#define CALC_MAX2(p) \ + if (maxval2 < temp.p) \ + maxval2 = temp.p +#else +#define CALC_MAX2(p) +#endif + #define CALC_P(p, inc) \ CALC_MIN(p, inc) \ - CALC_MAX(p, inc) + CALC_MAX(p, inc) \ + CALC_MAX2(p) __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols, int total, int groupnum, __global uchar * dstptr #ifdef HAVE_MASK , __global const uchar * mask, int mask_step, int mask_offset #endif +#ifdef HAVE_SRC2 + , __global const uchar * src2ptr, int src2_step, int src2_offset +#endif ) { int lid = get_local_id(0); @@ -92,36 +104,46 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #ifdef HAVE_MASK mask += mask_offset; #endif +#ifdef HAVE_SRC2 + src2ptr += src2_offset; +#endif #ifdef NEED_MINVAL __local dstT1 localmem_min[WGS2_ALIGNED]; + dstT1 minval = MAX_VAL; #ifdef NEED_MINLOC __local uint localmem_minloc[WGS2_ALIGNED]; + uint minloc = INDEX_MAX; #endif #endif #ifdef NEED_MAXVAL + dstT1 maxval = MIN_VAL; __local dstT1 localmem_max[WGS2_ALIGNED]; #ifdef NEED_MAXLOC __local uint localmem_maxloc[WGS2_ALIGNED]; + uint maxloc = INDEX_MAX; +#endif #endif +#ifdef OP_CALC2 + __local dstT1 localmem_max2[WGS2_ALIGNED]; + dstT1 maxval2 = MIN_VAL; #endif - dstT1 minval = MAX_VAL, maxval = MIN_VAL; - dstT temp; - uint minloc = INDEX_MAX, maxloc = INDEX_MAX; int src_index; #ifdef HAVE_MASK int mask_index; #endif +#ifdef HAVE_SRC2 + int src2_index; +#endif - for (int grain = groupnum * WGS * kercn; id < total; id += grain) - { -#ifdef HAVE_SRC_CONT - src_index = mul24(id, (int)sizeof(srcT1)); -#else - src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1))); + dstT temp; +#ifdef HAVE_SRC2 + dstT temp2; #endif + for (int grain = groupnum * WGS * kercn; id < total; id += grain) + { #ifdef HAVE_MASK #ifdef HAVE_MASK_CONT mask_index = id; @@ -131,7 +153,26 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off if (mask[mask_index]) #endif { +#ifdef HAVE_SRC_CONT + src_index = mul24(id, (int)sizeof(srcT1)); +#else + src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1))); +#endif temp = convertToDT(*(__global const srcT *)(srcptr + src_index)); +#ifdef OP_ABS + temp = temp >= (dstT)(0) ? temp : -temp; +#endif + +#ifdef HAVE_SRC2 +#ifdef HAVE_SRC2_CONT + src2_index = mul24(id, (int)sizeof(srcT1)); +#else + src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1))); +#endif + temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index)); + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); +#endif + #if kercn == 1 #ifdef NEED_MINVAL if (minval > temp) @@ -150,6 +191,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off maxloc = id; #endif } +#ifdef OP_CALC2 + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; + if (maxval2 < temp2) + maxval2 = temp2; +#endif #endif #elif kercn >= 2 CALC_P(s0, 0) @@ -192,6 +238,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #ifdef NEED_MAXLOC localmem_maxloc[lid] = maxloc; #endif +#ifdef OP_CALC2 + localmem_max2[lid] = maxval2; +#endif } barrier(CLK_LOCAL_MEM_FENCE); @@ -222,6 +271,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off localmem_max[lid3] = maxval; } #endif +#ifdef OP_CALC2 + if (localmem_max2[lid3] < maxval2) + localmem_max2[lid3] = maxval2; +#endif } barrier(CLK_LOCAL_MEM_FENCE); @@ -255,6 +308,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off localmem_max[lid] = localmem_max[lid2]; } #endif +#ifdef OP_CALC2 + if (localmem_max2[lid] < localmem_max2[lid2]) + localmem_max2[lid] = localmem_max2[lid2]; +#endif } barrier(CLK_LOCAL_MEM_FENCE); } @@ -277,5 +334,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #ifdef NEED_MAXLOC *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0]; #endif +#ifdef OP_CALC2 + pos = mad24(groupnum, (int)sizeof(uint), pos); + *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0]; +#endif } } diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index 21a5518..d535079 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -109,13 +109,22 @@ #endif #ifdef HAVE_MASK +#ifdef HAVE_SRC2 +#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset +#else #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset +#endif +#else +#ifdef HAVE_SRC2 +#define EXTRA_PARAMS , __global const uchar * src2ptr, int src2_step, int src2_offset #else #define EXTRA_PARAMS #endif +#endif // accumulative reduction stuff #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT + #ifdef OP_DOT #if ddepth <= 4 #define FUNC(a, b, c) a = mad24(b, c, a) @@ -137,18 +146,48 @@ #endif #endif +#ifdef OP_CALC2 +#define DECLARE_LOCAL_MEM \ + __local dstT localmem[WGS2_ALIGNED]; \ + __local dstT localmem2[WGS2_ALIGNED] +#define DEFINE_ACCUMULATOR \ + dstT accumulator = (dstT)(0); \ + dstT accumulator2 = (dstT)(0) +#else #define DECLARE_LOCAL_MEM \ __local dstT localmem[WGS2_ALIGNED] #define DEFINE_ACCUMULATOR \ dstT accumulator = (dstT)(0) +#endif + +#ifdef HAVE_SRC2 +#ifdef OP_CALC2 +#define PROCESS_ELEMS \ + dstT temp = convertToDT(loadpix(srcptr + src_index)) - convertToDT(loadpix(src2ptr + src2_index)); \ + dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp -= temp2; \ + temp = temp > (dstT)(0) ? temp : -temp; \ + FUNC(accumulator2, temp2); \ + FUNC(accumulator, temp) +#else +#define PROCESS_ELEMS \ + dstT temp = convertToDT(loadpix(srcptr + src_index)); \ + dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp) +#endif +#else +#define PROCESS_ELEMS \ + dstT temp = convertToDT(loadpix(srcptr + src_index)); \ + FUNC(accumulator, temp) +#endif #ifdef HAVE_MASK #define REDUCE_GLOBAL \ MASK_INDEX; \ if (mask[mask_index]) \ { \ - dstT temp = convertToDT(loadpix(srcptr + src_index)); \ - FUNC(accumulator, temp); \ + PROCESS_ELEMS; \ } #elif defined OP_DOT @@ -211,7 +250,158 @@ FUNC(accumulator, temp.sF, temp2.sF) #endif -#else +#else // sum or norm with 2 args +#ifdef HAVE_SRC2 +#ifdef OP_CALC2 // norm relative +#if kercn == 1 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp); \ + FUNC(accumulator2, temp2) +#elif kercn == 2 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator2, temp2.s0); \ + FUNC(accumulator2, temp2.s1) +#elif kercn == 4 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator2, temp2.s0); \ + FUNC(accumulator2, temp2.s1); \ + FUNC(accumulator2, temp2.s2); \ + FUNC(accumulator2, temp2.s3) +#elif kercn == 8 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator, temp.s4); \ + FUNC(accumulator, temp.s5); \ + FUNC(accumulator, temp.s6); \ + FUNC(accumulator, temp.s7); \ + FUNC(accumulator2, temp2.s0); \ + FUNC(accumulator2, temp2.s1); \ + FUNC(accumulator2, temp2.s2); \ + FUNC(accumulator2, temp2.s3); \ + FUNC(accumulator2, temp2.s4); \ + FUNC(accumulator2, temp2.s5); \ + FUNC(accumulator2, temp2.s6); \ + FUNC(accumulator2, temp2.s7) +#elif kercn == 16 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator, temp.s4); \ + FUNC(accumulator, temp.s5); \ + FUNC(accumulator, temp.s6); \ + FUNC(accumulator, temp.s7); \ + FUNC(accumulator, temp.s8); \ + FUNC(accumulator, temp.s9); \ + FUNC(accumulator, temp.sA); \ + FUNC(accumulator, temp.sB); \ + FUNC(accumulator, temp.sC); \ + FUNC(accumulator, temp.sD); \ + FUNC(accumulator, temp.sE); \ + FUNC(accumulator, temp.sF); \ + FUNC(accumulator2, temp2.s0); \ + FUNC(accumulator2, temp2.s1); \ + FUNC(accumulator2, temp2.s2); \ + FUNC(accumulator2, temp2.s3); \ + FUNC(accumulator2, temp2.s4); \ + FUNC(accumulator2, temp2.s5); \ + FUNC(accumulator2, temp2.s6); \ + FUNC(accumulator2, temp2.s7); \ + FUNC(accumulator2, temp2.s8); \ + FUNC(accumulator2, temp2.s9); \ + FUNC(accumulator2, temp2.sA); \ + FUNC(accumulator2, temp2.sB); \ + FUNC(accumulator2, temp2.sC); \ + FUNC(accumulator2, temp2.sD); \ + FUNC(accumulator2, temp2.sE); \ + FUNC(accumulator2, temp2.sF) +#endif +#else // norm with 2 args +#if kercn == 1 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp) +#elif kercn == 2 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1) +#elif kercn == 4 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3) +#elif kercn == 8 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator, temp.s4); \ + FUNC(accumulator, temp.s5); \ + FUNC(accumulator, temp.s6); \ + FUNC(accumulator, temp.s7) +#elif kercn == 16 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator, temp.s4); \ + FUNC(accumulator, temp.s5); \ + FUNC(accumulator, temp.s6); \ + FUNC(accumulator, temp.s7); \ + FUNC(accumulator, temp.s8); \ + FUNC(accumulator, temp.s9); \ + FUNC(accumulator, temp.sA); \ + FUNC(accumulator, temp.sB); \ + FUNC(accumulator, temp.sC); \ + FUNC(accumulator, temp.sD); \ + FUNC(accumulator, temp.sE); \ + FUNC(accumulator, temp.sF) +#endif +#endif + +#else // sum #if kercn == 1 #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ @@ -260,6 +450,7 @@ FUNC(accumulator, temp.sF) #endif #endif +#endif #define SET_LOCAL_1 \ localmem[lid] = accumulator @@ -325,6 +516,20 @@ accumulator += value.sF == zero ? zero : one #endif +#ifdef OP_CALC2 +#define SET_LOCAL_1 \ + localmem[lid] = accumulator; \ + localmem2[lid] = accumulator2; \ +#define REDUCE_LOCAL_1 \ + localmem[lid - WGS2_ALIGNED] += accumulator; \ + localmem2[lid - WGS2_ALIGNED] += accumulator2 +#define REDUCE_LOCAL_2 \ + localmem[lid] += localmem[lid2]; \ + localmem2[lid] += localmem2[lid2] +#define CALC_RESULT \ + storepix(localmem[0], dstptr + dstTSIZE * gid); \ + storepix(localmem2[0], dstptr + mad24(groupnum, srcTSIZE, dstTSIZE * gid)) +#else #define SET_LOCAL_1 \ localmem[lid] = accumulator #define REDUCE_LOCAL_1 \ @@ -333,6 +538,7 @@ localmem[lid] += localmem[lid2] #define CALC_RESULT \ storepix(localmem[0], dstptr + dstTSIZE * gid) +#endif // norm (NORM_INF) with cn > 1 and mask #elif defined OP_NORM_INF_MASK @@ -385,6 +591,13 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset #else int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE)); #endif +#ifdef HAVE_SRC2 +#ifdef HAVE_SRC2_CONT + int src2_index = mul24(id, srcTSIZE); +#else + int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE)); +#endif +#endif REDUCE_GLOBAL; } diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 8996c48..b405d6f 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -469,21 +469,25 @@ template Scalar ocl_part_sum(Mat m) enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS = 1, OCL_OP_SUM_SQR = 2 }; -static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray() ) +static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(), + InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() ) { CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, - haveMask = _mask.kind() != _InputArray::NONE; + const ocl::Device & dev = ocl::Device::getDefault(); + bool doubleSupport = dev.doubleFPConfig() > 0, + haveMask = _mask.kind() != _InputArray::NONE, + haveSrc2 = _src2.kind() != _InputArray::NONE; int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1, mcn = std::max(cn, kercn); + CV_Assert(!haveSrc2 || _src2.type() == type); if ( (!doubleSupport && depth == CV_64F) || cn > 4 ) return false; - int dbsize = ocl::Device::getDefault().maxComputeUnits(); - size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); + int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1); + size_t wgs = dev.maxWorkGroupSize(); int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth), dtype = CV_MAKE_TYPE(ddepth, cn); @@ -497,7 +501,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" }; char cvt[40]; String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d" - " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d", + " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s", ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth), ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)), ocl::typeToStr(ddepth), ddepth, cn, @@ -506,30 +510,49 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask doubleSupport ? " -D DOUBLE_SUPPORT" : "", haveMask ? " -D HAVE_MASK" : "", _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", - _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn); + haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn, + haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "", + haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""); ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts); if (k.empty()) return false; - UMat src = _src.getUMat(), db(1, dbsize, dtype), mask = _mask.getUMat(); + UMat src = _src.getUMat(), src2 = _src2.getUMat(), + db(1, dbsize, dtype), mask = _mask.getUMat(); ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), dbarg = ocl::KernelArg::PtrWriteOnly(db), - maskarg = ocl::KernelArg::ReadOnlyNoSize(mask); + maskarg = ocl::KernelArg::ReadOnlyNoSize(mask), + src2arg = ocl::KernelArg::ReadOnlyNoSize(src2); if (haveMask) - k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg, maskarg); + { + if (haveSrc2) + k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg); + else + k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg); + } else - k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg); + { + if (haveSrc2) + k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg); + else + k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg); + } - size_t globalsize = dbsize * wgs; + size_t globalsize = ngroups * wgs; if (k.run(1, &globalsize, &wgs, false)) { typedef Scalar (*part_sum)(Mat m); part_sum funcs[3] = { ocl_part_sum, ocl_part_sum, ocl_part_sum }, func = funcs[ddepth - CV_32S]; - res = func(db.getMat(ACCESS_READ)); + + Mat mres = db.getMat(ACCESS_READ); + if (calc2) + const_cast(res2) = func(mres.colRange(dbsize, dbsize)); + + res = func(mres.colRange(0, dbsize)); return true; } return false; @@ -1396,18 +1419,21 @@ typedef void (*getMinMaxResFunc)(const Mat & db, double *minVal, double *maxVal, int *minLoc, int *maxLoc, int gropunum, int cols); static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask, - int ddepth = -1, bool absValues = false) + int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), bool calc2 = false) { CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) || (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) ); const ocl::Device & dev = ocl::Device::getDefault(); - bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(); + bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(), + haveSrc2 = _src2.kind() != _InputArray::NONE; int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src)); if (ddepth < 0) ddepth = depth; + CV_Assert(!haveSrc2 || _src2.type() == type); + if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport) return false; @@ -1435,7 +1461,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* char cvt[40]; String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s" " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s" - " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s", + " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s", depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs, ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", @@ -1444,7 +1470,9 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "", needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "", ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), - ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : ""); + ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "", + haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "", + haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""); ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts); if (k.empty()) @@ -1452,18 +1480,35 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S), dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) + - (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0)); - UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat(); + (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) + + (calc2 ? esz : 0)); + UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat(); if (cn > 1) + { src = src.reshape(1); + src2 = src2.reshape(1); + } - if (!haveMask) - k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), - groupnum, ocl::KernelArg::PtrWriteOnly(db)); + if (haveSrc2) + { + if (!haveMask) + k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), + groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2)); + else + k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), + groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask), + ocl::KernelArg::ReadOnlyNoSize(src2)); + } else - k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), - groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask)); + { + if (!haveMask) + k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), + groupnum, ocl::KernelArg::PtrWriteOnly(db)); + else + k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), + groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask)); + } size_t globalsize = groupnum * wgs; if (!k.run(1, &globalsize, &wgs, false)) @@ -2498,38 +2543,45 @@ namespace cv { static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result ) { - const ocl::Device & d = ocl::Device::getDefault(); - int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), rowsPerWI = d.isIntel() ? 4 : 1; - bool doubleSupport = d.doubleFPConfig() > 0; - bool relative = (normType & NORM_RELATIVE) != 0; + Scalar sc1, sc2; + int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + bool relative = (normType & NORM_RELATIVE) != 0, + normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR; normType &= ~NORM_RELATIVE; - if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) || - (!doubleSupport && depth == CV_64F)) + if ( !(normType == NORM_INF || normsum) ) return false; - int wdepth = std::max(CV_32S, depth); - char cvt[50]; - ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D BINARY_OP -D OP_ABSDIFF -D dstT=%s -D workT=dstT -D srcT1=%s -D srcT2=srcT1" - " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT -D rowsPerWI=%d%s", - ocl::typeToStr(wdepth), ocl::typeToStr(depth), - ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI, - doubleSupport ? " -D DOUBLE_SUPPORT" : "")); - if (k.empty()) - return false; + if (normsum) + { + if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ? + OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2)) + return false; + } + else + { + if (!ocl_minMaxIdx(_src1, NULL, &result, NULL, NULL, _mask, std::max(CV_32S, depth), + false, _src2, relative)) + return false; + } - UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), diff(src1.size(), CV_MAKE_TYPE(wdepth, cn)); - k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2), - ocl::KernelArg::WriteOnly(diff, cn)); + double s2 = 0; + for (int i = 0; i < cn; ++i) + { + result += sc1[i]; + if (relative) + s2 += sc2[i]; + } - size_t globalsize[2] = { diff.cols * cn, (diff.rows + rowsPerWI - 1) / rowsPerWI }; - if (!k.run(2, globalsize, NULL, false)) - return false; + if (normType == NORM_L2) + { + result = std::sqrt(result); + if (relative) + s2 = std::sqrt(s2); + } - result = cv::norm(diff, normType, _mask); if (relative) - result /= cv::norm(src2, normType, _mask) + DBL_EPSILON; + result /= (s2 + DBL_EPSILON); return true; }