From 7f2662b310489d3336cadf46c386d271ebf09ae0 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 9 Jun 2014 00:50:14 +0400 Subject: [PATCH] fixes --- .../core/include/opencv2/core/opencl/ocl_defs.hpp | 2 +- modules/core/src/opencl/minmaxloc.cl | 40 +++++++++++------- modules/core/src/opencl/reduce.cl | 47 ++++++++++++---------- modules/core/src/stat.cpp | 19 ++++----- 4 files changed, 62 insertions(+), 46 deletions(-) diff --git a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp index 55f8849..76d4f84 100644 --- a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp +++ b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp @@ -5,7 +5,7 @@ // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. -//#define CV_OPENCL_RUN_ASSERT +#define CV_OPENCL_RUN_ASSERT #ifdef HAVE_OPENCL diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl index 11b6da9..56de655 100644 --- a/modules/core/src/opencl/minmaxloc.cl +++ b/modules/core/src/opencl/minmaxloc.cl @@ -15,16 +15,16 @@ #ifdef DEPTH_0 #define MIN_VAL 0 -#define MAX_VAL 255 +#define MAX_VAL UCHAR_MAX #elif defined DEPTH_1 -#define MIN_VAL -128 -#define MAX_VAL 127 +#define MIN_VAL SCHAR_MIN +#define MAX_VAL SCHAR_MAX #elif defined DEPTH_2 #define MIN_VAL 0 -#define MAX_VAL 65535 +#define MAX_VAL USHRT_MAX #elif defined DEPTH_3 -#define MIN_VAL -32768 -#define MAX_VAL 32767 +#define MIN_VAL SHRT_MIN +#define MAX_VAL SHRT_MAX #elif defined DEPTH_4 #define MIN_VAL INT_MIN #define MAX_VAL INT_MAX @@ -39,6 +39,14 @@ #define noconvert #define INDEX_MAX UINT_MAX +#if kercn != 3 +#define loadpix(addr) *(__global const srcT *)(addr) +#define srcTSIZE (int)sizeof(srcT1) +#else +#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) +#define srcTSIZE ((int)sizeof(srcT1)) +#endif + #ifdef NEED_MINLOC #define CALC_MINLOC(inc) minloc = id + inc #else @@ -154,22 +162,22 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif { #ifdef HAVE_SRC_CONT - src_index = mul24(id, (int)sizeof(srcT1)); + src_index = mul24(id, srcTSIZE); #else - src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1))); + src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE)); #endif - temp = convertToDT(*(__global const srcT *)(srcptr + src_index)); + temp = convertToDT(loadpix(srcptr + src_index)); #ifdef OP_ABS temp = temp >= (dstT)(0) ? temp : -temp; #endif #ifdef HAVE_SRC2 #ifdef HAVE_SRC2_CONT - src2_index = mul24(id, (int)sizeof(srcT1)); + src2_index = mul24(id, srcTSIZE); #else - src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1))); + src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE)); #endif - temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index)); + temp2 = convertToDT(loadpix(src2ptr + src2_index)); temp = temp > temp2 ? temp - temp2 : (temp2 - temp); #ifdef OP_CALC2 temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; @@ -202,8 +210,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #elif kercn >= 2 CALC_P(s0, 0) CALC_P(s1, 1) -#if kercn >= 4 +#if kercn >= 3 CALC_P(s2, 2) +#if kercn >= 4 CALC_P(s3, 3) #if kercn >= 8 CALC_P(s4, 4) @@ -223,6 +232,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif #endif #endif +#endif } } @@ -335,9 +345,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif #ifdef NEED_MAXLOC *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0]; -#endif #ifdef OP_CALC2 pos = mad24(groupnum, (int)sizeof(uint), pos); +#endif +#endif +#ifdef OP_CALC2 *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0]; #endif } diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index 92818e3..9418cec 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -148,11 +148,9 @@ #ifdef OP_CALC2 #define DECLARE_LOCAL_MEM \ - __local dstT localmem[WGS2_ALIGNED]; \ - __local dstT localmem2[WGS2_ALIGNED] + __local dstT localmem[WGS2_ALIGNED], localmem2[WGS2_ALIGNED] #define DEFINE_ACCUMULATOR \ - dstT accumulator = (dstT)(0); \ - dstT accumulator2 = (dstT)(0) + dstT accumulator = (dstT)(0), accumulator2 = (dstT)(0) #else #define DECLARE_LOCAL_MEM \ __local dstT localmem[WGS2_ALIGNED] @@ -163,10 +161,10 @@ #ifdef HAVE_SRC2 #ifdef OP_CALC2 #define PROCESS_ELEMS \ - dstT temp = convertToDT(loadpix(srcptr + src_index)) - convertToDT(loadpix(src2ptr + src2_index)); \ + dstT temp = convertToDT(loadpix(srcptr + src_index)); \ dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp -= temp2; \ - temp = temp > (dstT)(0) ? temp : -temp; \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator2, temp2); \ FUNC(accumulator, temp) #else @@ -258,6 +256,7 @@ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator, temp); \ FUNC(accumulator2, temp2) #elif kercn == 2 @@ -265,6 +264,7 @@ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator2, temp2.s0); \ @@ -274,6 +274,7 @@ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ @@ -287,6 +288,7 @@ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ @@ -308,6 +310,7 @@ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ @@ -452,6 +455,20 @@ #endif #endif +#ifdef OP_CALC2 +#define SET_LOCAL_1 \ + localmem[lid] = accumulator; \ + localmem2[lid] = accumulator2 +#define REDUCE_LOCAL_1 \ + localmem[lid - WGS2_ALIGNED] += accumulator; \ + localmem2[lid - WGS2_ALIGNED] += accumulator2 +#define REDUCE_LOCAL_2 \ + localmem[lid] += localmem[lid2]; \ + localmem2[lid] += localmem2[lid2] +#define CALC_RESULT \ + storepix(localmem[0], dstptr + dstTSIZE * gid); \ + storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid)) +#else #define SET_LOCAL_1 \ localmem[lid] = accumulator #define REDUCE_LOCAL_1 \ @@ -460,6 +477,7 @@ localmem[lid] += localmem[lid2] #define CALC_RESULT \ storepix(localmem[0], dstptr + dstTSIZE * gid) +#endif // countNonZero stuff #elif defined OP_COUNT_NON_ZERO @@ -516,20 +534,6 @@ accumulator += value.sF == zero ? zero : one #endif -#ifdef OP_CALC2 -#define SET_LOCAL_1 \ - localmem[lid] = accumulator; \ - localmem2[lid] = accumulator2; \ -#define REDUCE_LOCAL_1 \ - localmem[lid - WGS2_ALIGNED] += accumulator; \ - localmem2[lid - WGS2_ALIGNED] += accumulator2 -#define REDUCE_LOCAL_2 \ - localmem[lid] += localmem[lid2]; \ - localmem2[lid] += localmem2[lid2] -#define CALC_RESULT \ - storepix(localmem[0], dstptr + dstTSIZE * gid); \ - storepix(localmem2[0], dstptr + mad24(groupnum, srcTSIZE, dstTSIZE * gid)) -#else #define SET_LOCAL_1 \ localmem[lid] = accumulator #define REDUCE_LOCAL_1 \ @@ -538,7 +542,6 @@ localmem[lid] += localmem[lid2] #define CALC_RESULT \ storepix(localmem[0], dstptr + dstTSIZE * gid) -#endif // norm (NORM_INF) with cn > 1 and mask #elif defined OP_NORM_INF_MASK diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 34c487a..01f50fa 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -550,9 +550,9 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask Mat mres = db.getMat(ACCESS_READ); if (calc2) - const_cast(res2) = func(mres.colRange(dbsize, dbsize)); + const_cast(res2) = func(mres.colRange(ngroups, dbsize)); - res = func(mres.colRange(0, dbsize)); + res = func(mres.colRange(0, ngroups)); return true; } return false; @@ -1434,10 +1434,10 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(), haveSrc2 = _src2.kind() != _InputArray::NONE; int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), - kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src)); + kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src)); - CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) || - (cn >= 1 && _mask.empty() && !minLoc && !maxLoc) ); + CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) || + (cn >= 1 && (!haveMask || haveSrc2) && !minLoc && !maxLoc) ); if (ddepth < 0) ddepth = depth; @@ -1484,6 +1484,8 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "", haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""); + printf("%s\n", opts.c_str()); + ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts); if (k.empty()) return false; @@ -2556,9 +2558,9 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr { Scalar sc1, sc2; int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - bool relative = (normType & NORM_RELATIVE) != 0, - normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR; + bool relative = (normType & NORM_RELATIVE) != 0; normType &= ~NORM_RELATIVE; + bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR; if ( !(normType == NORM_INF || normsum) ) return false; @@ -2608,8 +2610,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m #ifdef HAVE_OPENCL double _result = 0; - CV_OCL_RUN_(_src1.isUMat() && _src2.isUMat() && - _src1.dims() <= 2 && _src2.dims() <= 2, + CV_OCL_RUN_(_src1.isUMat(), ocl_norm(_src1, _src2, normType, _mask, _result), _result) #endif -- 2.7.4