#if cn != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
+#if kercn == 1
#define srcTSIZE (int)sizeof(srcT)
+#else
+#define srcTSIZE (int)sizeof(srcT1)
+#endif
#define dstTSIZE (int)sizeof(dstT)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
FUNC(accumulator, temp, temp2)
#else
+#if kercn == 1
#define REDUCE_GLOBAL \
- dstT temp = convertToDT(loadpix(srcptr + src_index)); \
+ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
FUNC(accumulator, temp)
+#elif kercn == 2
+#define REDUCE_GLOBAL \
+ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+ FUNC(accumulator, temp.s0); \
+ FUNC(accumulator, temp.s1)
+#elif kercn == 4
+#define REDUCE_GLOBAL \
+ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+ FUNC(accumulator, temp.s0); \
+ FUNC(accumulator, temp.s1); \
+ FUNC(accumulator, temp.s2); \
+ FUNC(accumulator, temp.s3)
+#elif kercn == 8
+#define REDUCE_GLOBAL \
+ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+ FUNC(accumulator, temp.s0); \
+ FUNC(accumulator, temp.s1); \
+ FUNC(accumulator, temp.s2); \
+ FUNC(accumulator, temp.s3); \
+ FUNC(accumulator, temp.s4); \
+ FUNC(accumulator, temp.s5); \
+ FUNC(accumulator, temp.s6); \
+ FUNC(accumulator, temp.s7)
+#elif kercn == 16
+#define REDUCE_GLOBAL \
+ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+ FUNC(accumulator, temp.s0); \
+ FUNC(accumulator, temp.s1); \
+ FUNC(accumulator, temp.s2); \
+ FUNC(accumulator, temp.s3); \
+ FUNC(accumulator, temp.s4); \
+ FUNC(accumulator, temp.s5); \
+ FUNC(accumulator, temp.s6); \
+ FUNC(accumulator, temp.s7); \
+ FUNC(accumulator, temp.s8); \
+ FUNC(accumulator, temp.s9); \
+ FUNC(accumulator, temp.sA); \
+ FUNC(accumulator, temp.sB); \
+ FUNC(accumulator, temp.sC); \
+ FUNC(accumulator, temp.sD); \
+ FUNC(accumulator, temp.sE); \
+ FUNC(accumulator, temp.sF)
+#endif
#endif
#define SET_LOCAL_1 \
#if kercn == 1
#define REDUCE_GLOBAL \
accumulator += loadpix(srcptr + src_index) == zero ? zero : one
+#elif kercn == 2
+#define REDUCE_GLOBAL \
+ srcT value = loadpix(srcptr + src_index); \
+ accumulator += value.s0 == zero ? zero : one; \
+ accumulator += value.s1 == zero ? zero : one
#elif kercn == 4
#define REDUCE_GLOBAL \
srcT value = loadpix(srcptr + src_index); \
accumulator += value.s1 == zero ? zero : one; \
accumulator += value.s2 == zero ? zero : one; \
accumulator += value.s3 == zero ? zero : one
+#elif kercn == 8
+#define REDUCE_GLOBAL \
+ srcT value = loadpix(srcptr + src_index); \
+ accumulator += value.s0 == zero ? zero : one; \
+ accumulator += value.s1 == zero ? zero : one; \
+ accumulator += value.s2 == zero ? zero : one; \
+ accumulator += value.s3 == zero ? zero : one; \
+ accumulator += value.s4 == zero ? zero : one; \
+ accumulator += value.s5 == zero ? zero : one; \
+ accumulator += value.s6 == zero ? zero : one; \
+ accumulator += value.s7 == zero ? zero : one
#elif kercn == 16
#define REDUCE_GLOBAL \
srcT value = loadpix(srcptr + src_index); \
accumulator += value.sD == zero ? zero : one; \
accumulator += value.sE == zero ? zero : one; \
accumulator += value.sF == zero ? zero : one
-#else
-#error "kercn should be either 1, 4 or 16"
#endif
+
#define SET_LOCAL_1 \
localmem[lid] = accumulator
#define REDUCE_LOCAL_1 \
{
CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
- int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
- bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+ bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
+ haveMask = _mask.kind() != _InputArray::NONE;
+ int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+ kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1,
+ mcn = std::max(cn, kercn);
if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
return false;
int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
dtype = CV_MAKE_TYPE(ddepth, cn);
- bool haveMask = _mask.kind() != _InputArray::NONE;
CV_Assert(!haveMask || _mask.type() == CV_8UC1);
int wgs2_aligned = 1;
static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
char cvt[40];
- ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
- format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
- " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s",
- ocl::typeToStr(type), ocl::typeToStr(depth),
- ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn,
- ocl::convertTypeStr(depth, ddepth, cn, cvt),
+ String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
+ " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d",
+ ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
+ ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
+ ocl::typeToStr(ddepth), ddepth, cn,
+ ocl::convertTypeStr(depth, ddepth, mcn, cvt),
opMap[sum_op], (int)wgs, wgs2_aligned,
doubleSupport ? " -D DOUBLE_SUPPORT" : "",
haveMask ? " -D HAVE_MASK" : "",
_src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
- _mask.isContinuous() ? " -D HAVE_MASK_CONT" : ""));
+ _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn);
+
+ ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
if (k.empty())
return false;
wgs2_aligned >>= 1;
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
- format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO -D WGS=%d "
- "-D kercn=%d -D WGS2_ALIGNED=%d%s%s",
+ format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
+ " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
ocl::typeToStr(depth), (int)wgs, kercn,
wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
(_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) );
- int type = _src.type(), depth = CV_MAT_DEPTH(type);
+ int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = 1;
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if (depth == CV_64F && !doubleSupport)
wgs2_aligned <<= 1;
wgs2_aligned >>= 1;
- String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s",
+ String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d"
+ " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
depth, ocl::typeToStr(depth), _mask.empty() ? "" : "_MASK", (int)wgs,
wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
_src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
- _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "");
+ _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn);
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
if (k.empty())
static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
{
- int type = _src1.type(), depth = CV_MAT_DEPTH(type);
+ int type = _src1.type(), depth = CV_MAT_DEPTH(type), kercn = 1;
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if ( !doubleSupport && depth == CV_64F )
char cvt[40];
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
- format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s",
- ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth, ocl::convertTypeStr(depth, ddepth, 1, cvt),
+ format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT "
+ "-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
+ ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth,
+ ocl::convertTypeStr(depth, ddepth, 1, cvt),
(int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
_src1.isContinuous() ? " -D HAVE_SRC_CONT" : "",
- _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""));
+ _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn));
if (k.empty())
return false;