From 002a79bfc4522ef8e7466b5a4fe8ad8dbc79eb11 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 28 May 2014 18:39:02 +0400 Subject: [PATCH] optimized cv::countNonZero --- modules/core/src/opencl/reduce.cl | 40 ++++++++++++++++++++++++++++++++++++--- modules/core/src/stat.cpp | 9 +++++---- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index 8fc2330..e24c82a 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -82,6 +82,10 @@ #define noconvert +#ifndef kercn +#define kercn 1 +#endif + #ifdef HAVE_MASK_CONT #define MASK_INDEX int mask_index = id + mask_offset; #else @@ -176,9 +180,39 @@ __local dstT localmem[WGS2_ALIGNED] #define DEFINE_ACCUMULATOR \ dstT accumulator = (dstT)(0); \ - srcT zero = (srcT)(0), one = (srcT)(1) + srcT1 zero = (srcT1)(0), one = (srcT1)(1) +#if kercn == 1 #define REDUCE_GLOBAL \ accumulator += loadpix(srcptr + src_index) == zero ? zero : one +#elif kercn == 4 +#define REDUCE_GLOBAL \ + srcT value = loadpix(srcptr + src_index); \ + accumulator += value.s0 == zero ? zero : one; \ + accumulator += value.s1 == zero ? zero : one; \ + accumulator += value.s2 == zero ? zero : one; \ + accumulator += value.s3 == zero ? zero : one +#elif kercn == 16 +#define REDUCE_GLOBAL \ + srcT value = loadpix(srcptr + src_index); \ + accumulator += value.s0 == zero ? zero : one; \ + accumulator += value.s1 == zero ? zero : one; \ + accumulator += value.s2 == zero ? zero : one; \ + accumulator += value.s3 == zero ? zero : one; \ + accumulator += value.s4 == zero ? zero : one; \ + accumulator += value.s5 == zero ? zero : one; \ + accumulator += value.s6 == zero ? zero : one; \ + accumulator += value.s7 == zero ? zero : one; \ + accumulator += value.s8 == zero ? zero : one; \ + accumulator += value.s9 == zero ? zero : one; \ + accumulator += value.sA == zero ? zero : one; \ + accumulator += value.sB == zero ? zero : one; \ + accumulator += value.sC == zero ? zero : one; \ + accumulator += value.sD == zero ? zero : one; \ + accumulator += value.sE == zero ? zero : one; \ + accumulator += value.sF == zero ? zero : one +#else +#error "kercn should be either 1, 4 or 16" +#endif #define SET_LOCAL_1 \ localmem[lid] = accumulator #define REDUCE_LOCAL_1 \ @@ -316,14 +350,14 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset { int lid = get_local_id(0); int gid = get_group_id(0); - int id = get_global_id(0); + int id = get_global_id(0) * kercn; srcptr += src_offset; DECLARE_LOCAL_MEM; DEFINE_ACCUMULATOR; - for (int grain = groupnum * WGS; id < total; id += grain) + for (int grain = groupnum * WGS * kercn; id < total; id += grain) { #ifdef HAVE_SRC_CONT int src_index = mul24(id, srcTSIZE); diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 37cd8eb..e9fc538 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -645,7 +645,7 @@ namespace cv { static bool ocl_countNonZero( InputArray _src, int & res ) { - int type = _src.type(), depth = CV_MAT_DEPTH(type); + int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if (depth == CV_64F && !doubleSupport) @@ -660,9 +660,10 @@ static bool ocl_countNonZero( InputArray _src, int & res ) wgs2_aligned >>= 1; ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D OP_COUNT_NON_ZERO -D WGS=%d " - "-D WGS2_ALIGNED=%d%s%s", - ocl::typeToStr(type), (int)wgs, + format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO -D WGS=%d " + "-D kercn=%d -D WGS2_ALIGNED=%d%s%s", + ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), + ocl::typeToStr(depth), (int)wgs, kercn, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", _src.isContinuous() ? " -D HAVE_SRC_CONT" : "")); if (k.empty()) -- 2.7.4