const int min_opt_cols = 128, buf_cols = 32;
int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
ddepth = CV_MAT_DEPTH(dtype), ddepth0 = ddepth;
- bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
- useOptimized = 1 == dim && _src.cols() > min_opt_cols;
+ const ocl::Device &defDev = ocl::Device::getDefault();
+ bool doubleSupport = defDev.doubleFPConfig() > 0;
+
+ size_t wgs = defDev.maxWorkGroupSize();
+ bool useOptimized = 1 == dim && _src.cols() > min_opt_cols && (wgs >= buf_cols);
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
return false;
const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
"OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
- char cvt[2][40];
-
int wdepth = std::max(ddepth, CV_32F);
- cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
- " -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
- " -D convertToDT=%s -D convertToDT0=%s%s",
- ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
- ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
- ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
- ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
- ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
- doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-
if (useOptimized)
{
- cv::String build_opt_pre = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D %s -D dim=1"
- " -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
- buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
- ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
- doubleSupport ? " -D DOUBLE_SUPPORT" : "");
- ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre);
- if (kpre.empty())
- return false;
-
- ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt);
- if (kmain.empty())
+ size_t tileHeight = (size_t)(wgs / buf_cols);
+ if (defDev.isIntel())
+ {
+ static const size_t maxItemInGroupCount = 16;
+ tileHeight = min(tileHeight, defDev.localMemSize() / buf_cols / CV_ELEM_SIZE(CV_MAKETYPE(wdepth, cn)) / maxItemInGroupCount);
+ }
+ char cvt[3][40];
+ cv::String build_opt = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D TILE_HEIGHT=%d -D %s -D dim=1"
+ " -D cn=%d -D ddepth=%d"
+ " -D srcT=%s -D bufT=%s -D dstT=%s"
+ " -D convertToWT=%s -D convertToBufT=%s -D convertToDT=%s%s",
+ buf_cols, tileHeight, ops[op], cn, ddepth,
+ ocl::typeToStr(sdepth),
+ ocl::typeToStr(ddepth),
+ ocl::typeToStr(ddepth0),
+ ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
+ ocl::convertTypeStr(sdepth, ddepth, 1, cvt[1]),
+ ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[2]),
+ doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+ ocl::Kernel k("reduce_horz_opt", ocl::core::reduce2_oclsrc, build_opt);
+ if (k.empty())
return false;
-
UMat src = _src.getUMat();
Size dsize(1, src.rows);
_dst.create(dsize, dtype);
UMat dst = _dst.getUMat();
- UMat buf(src.rows, buf_cols, dst.type());
-
- kpre.args(ocl::KernelArg::ReadOnly(src),
- ocl::KernelArg::WriteOnlyNoSize(buf));
-
- size_t globalSize[2] = { buf_cols, src.rows };
- if (!kpre.run(2, globalSize, NULL, false))
- return false;
-
if (op0 == CV_REDUCE_AVG)
- kmain.args(ocl::KernelArg::ReadOnly(buf),
- ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
+ k.args(ocl::KernelArg::ReadOnly(src),
+ ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
else
- kmain.args(ocl::KernelArg::ReadOnly(buf),
- ocl::KernelArg::WriteOnlyNoSize(dst));
+ k.args(ocl::KernelArg::ReadOnly(src),
+ ocl::KernelArg::WriteOnlyNoSize(dst));
- globalSize[0] = src.rows;
- return kmain.run(1, globalSize, NULL, false);
+ size_t localSize[2] = { buf_cols, tileHeight};
+ size_t globalSize[2] = { buf_cols, src.rows };
+ return k.run(2, globalSize, localSize, false);
}
+ else
+ {
+ char cvt[2][40];
+ cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
+ " -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
+ " -D convertToDT=%s -D convertToDT0=%s%s",
+ ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
+ ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
+ ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
+ ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
+ ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
+ doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+
+ ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
+ if (k.empty())
+ return false;
- ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
- if (k.empty())
- return false;
-
- UMat src = _src.getUMat();
- Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
- _dst.create(dsize, dtype);
- UMat dst = _dst.getUMat();
+ UMat src = _src.getUMat();
+ Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
+ _dst.create(dsize, dtype);
+ UMat dst = _dst.getUMat();
- ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
- temparg = ocl::KernelArg::WriteOnlyNoSize(dst);
+ ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
+ temparg = ocl::KernelArg::WriteOnlyNoSize(dst);
- if (op0 == CV_REDUCE_AVG)
- k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
- else
- k.args(srcarg, temparg);
+ if (op0 == CV_REDUCE_AVG)
+ k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
+ else
+ k.args(srcarg, temparg);
- size_t globalsize = std::max(dsize.width, dsize.height);
- return k.run(1, &globalsize, NULL, false);
+ size_t globalsize = std::max(dsize.width, dsize.height);
+ return k.run(1, &globalsize, NULL, false);
+ }
}
}
#define PROCESS_ELEM(acc, value) acc += value
#elif defined OCL_CV_REDUCE_MAX
#define INIT_VALUE MIN_VAL
-#define PROCESS_ELEM(acc, value) acc = value > acc ? value : acc
+#define PROCESS_ELEM(acc, value) acc = max(value, acc)
#elif defined OCL_CV_REDUCE_MIN
#define INIT_VALUE MAX_VAL
-#define PROCESS_ELEM(acc, value) acc = value < acc ? value : acc
+#define PROCESS_ELEM(acc, value) acc = min(value, acc)
#else
#error "No operation is specified"
#endif
#ifdef OP_REDUCE_PRE
-__kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
- __global uchar * bufptr, int buf_step, int buf_offset)
+__kernel void reduce_horz_opt(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
+ __global uchar * dstptr, int dst_step, int dst_offset
+#ifdef OCL_CV_REDUCE_AVG
+ , float fscale
+#endif
+ )
{
+ __local bufT lsmem[TILE_HEIGHT][BUF_COLS][cn];
+
int x = get_global_id(0);
int y = get_global_id(1);
- if (x < BUF_COLS)
+ int liy = get_local_id(1);
+ if ((x < BUF_COLS) && (y < rows))
{
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * cn, src_offset));
- int buf_index = mad24(y, buf_step, mad24(x, (int)sizeof(dstT) * cn, buf_offset));
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
- __global dstT * buf = (__global dstT *)(bufptr + buf_index);
- dstT tmp[cn] = { INIT_VALUE };
+ bufT tmp[cn] = { INIT_VALUE };
int src_step_mul = BUF_COLS * cn;
for (int idx = x; idx < cols; idx += BUF_COLS, src += src_step_mul)
#pragma unroll
for (int c = 0; c < cn; ++c)
{
- dstT value = convertToDT(src[c]);
+ bufT value = convertToBufT(src[c]);
PROCESS_ELEM(tmp[c], value);
}
}
#pragma unroll
for (int c = 0; c < cn; ++c)
- buf[c] = tmp[c];
+ lsmem[liy][x][c] = tmp[c];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if ((x < BUF_COLS / 2) && (y < rows))
+ {
+ #pragma unroll
+ for (int c = 0; c < cn; ++c)
+ {
+ PROCESS_ELEM(lsmem[liy][x][c], lsmem[liy][x + BUF_COLS / 2][c]);
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if ((x == 0) && (y < rows))
+ {
+ int dst_index = mad24(y, dst_step, dst_offset);
+
+ __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+ bufT tmp[cn] = { INIT_VALUE };
+
+ #pragma unroll
+ for (int xin = 0; xin < BUF_COLS / 2; xin ++)
+ {
+ #pragma unroll
+ for (int c = 0; c < cn; ++c)
+ {
+ PROCESS_ELEM(tmp[c], lsmem[liy][xin][c]);
+ }
+ }
+
+ #pragma unroll
+ for (int c = 0; c < cn; ++c)
+#ifdef OCL_CV_REDUCE_AVG
+ dst[c] = convertToDT(convertToWT(tmp[c]) * fscale);
+#else
+ dst[c] = convertToDT(tmp[c]);
+#endif
}
}