int srcdepth = CV_MAT_DEPTH(srctype);
int cn = CV_MAT_CN(srctype);
- bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+ const ocl::Device d = ocl::Device::getDefault();
+ bool doubleSupport = d.doubleFPConfig() > 0;
if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
(!doubleSupport && srcdepth == CV_64F && !bitwise))
return false;
char opts[1024];
int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
int scalarcn = kercn == 3 ? 4 : kercn;
+ int rowsPerWI = d.isIntel() ? 4 : 1;
- sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d",
+ sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d",
haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop],
bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)),
bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) :
ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)),
- kercn);
+ kercn, rowsPerWI);
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
if (k.empty())
k.args(src1arg, src2arg, maskarg, dstarg);
}
- size_t globalsize[] = { src1.cols * cn / kercn, src1.rows };
+ size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, 0, false);
}
void* usrdata, int oclop,
bool haveScalar )
{
- bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+ const ocl::Device d = ocl::Device::getDefault();
+ bool doubleSupport = d.doubleFPConfig() > 0;
int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
bool haveMask = !_mask.empty();
return false;
int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
- int scalarcn = kercn == 3 ? 4 : kercn;
+ int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;
char cvtstr[4][32], opts[1024];
sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
"-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
- "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d",
+ "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d",
(haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
- doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn);
+ doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI);
size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
const uchar* usrdata_p = (const uchar*)usrdata;
k.args(src1arg, src2arg, maskarg, dstarg);
}
- size_t globalsize[] = { src1.cols * cn / kercn, src1.rows };
+ size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2))
return false;
- int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
+ int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;
// Workaround for bug with "?:" operator in AMD OpenCL compiler
if (depth1 >= CV_16U)
kercn = 1;
String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
" -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
- " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s%s",
+ " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s -D rowsPerWI=%d%s",
haveScalar ? "UNARY_OP" : "BINARY_OP",
ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
ocl::typeToStr(CV_8UC(kercn)), kercn,
ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
operationMap[op], ocl::typeToStr(depth1),
ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
- ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)),
+ ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), rowsPerWI,
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
ocl::KernelArg::WriteOnly(dst, cn, kercn));
}
- size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows };
+ size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
{
+ const ocl::Device & d = ocl::Device::getDefault();
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
- kercn = ocl::predictOptimalVectorWidth(_src, _dst);
- bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+ kercn = ocl::predictOptimalVectorWidth(_src, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
+ bool doubleSupport = d.doubleFPConfig() > 0;
if (!doubleSupport && depth == CV_64F)
return false;
int wdepth = std::max(depth, CV_32F);
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s"
- " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s -D workT1=%s%s",
+ " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s"
+ " -D workT1=%s -D rowsPerWI=%d%s",
ocl::typeToStr(CV_8UC(kercn)),
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth,
ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]),
- ocl::typeToStr(wdepth),
+ ocl::typeToStr(wdepth), rowsPerWI,
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty())
return false;
else if (wdepth == CV_64F)
k.args(srcarg, dstarg, alpha, beta);
- size_t globalsize[2] = { src.cols * cn / kercn, src.rows };
+ size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
int kercn = oclop == OCL_OP_PHASE_DEGREES ||
oclop == OCL_OP_PHASE_RADIANS ? 1 : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
- bool double_support = ocl::Device::getDefault().doubleFPConfig() > 0;
+ const ocl::Device d = ocl::Device::getDefault();
+ bool double_support = d.doubleFPConfig() > 0;
if (!double_support && depth == CV_64F)
return false;
+ int rowsPerWI = d.isIntel() ? 4 : 1;
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
- format("-D %s -D %s -D dstT=%s%s", _src2.empty() ? "UNARY_OP" : "BINARY_OP",
- oclop2str[oclop], ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
+ format("-D %s -D %s -D dstT=%s -D rowsPerWI=%d%s", _src2.empty() ? "UNARY_OP" : "BINARY_OP",
+ oclop2str[oclop], ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), rowsPerWI,
double_support ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty())
return false;
else
k.args(src1arg, src2arg, dstarg);
- size_t globalsize[] = { src1.cols * cn / kercn, src1.rows };
+ size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, 0, false);
}
static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
OutputArray _dst1, OutputArray _dst2, bool angleInDegrees )
{
- int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
- bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+ const ocl::Device & d = ocl::Device::getDefault();
+ int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+ rowsPerWI = d.isIntel() ? 4 : 1;
+ bool doubleSupport = d.doubleFPConfig() > 0;
if ( !(_src1.dims() <= 2 && _src2.dims() <= 2 &&
(depth == CV_32F || depth == CV_64F) && type == _src2.type()) ||
return false;
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
- format("-D BINARY_OP -D dstT=%s -D depth=%d -D OP_CTP_%s%s",
+ format("-D BINARY_OP -D dstT=%s -D depth=%d -D rowsPerWI=%d -D OP_CTP_%s%s",
ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
- depth, angleInDegrees ? "AD" : "AR",
+ depth, rowsPerWI, angleInDegrees ? "AD" : "AR",
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty())
return false;
ocl::KernelArg::WriteOnly(dst1, cn),
ocl::KernelArg::WriteOnlyNoSize(dst2));
- size_t globalsize[2] = { dst1.cols * cn, dst1.rows };
+ size_t globalsize[2] = { dst1.cols * cn, (dst1.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
static bool ocl_polarToCart( InputArray _mag, InputArray _angle,
OutputArray _dst1, OutputArray _dst2, bool angleInDegrees )
{
- int type = _angle.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
- bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+ const ocl::Device & d = ocl::Device::getDefault();
+ int type = _angle.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+ rowsPerWI = d.isIntel() ? 4 : 1;
+ bool doubleSupport = d.doubleFPConfig() > 0;
if ( !doubleSupport && depth == CV_64F )
return false;
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
- format("-D dstT=%s -D depth=%d -D BINARY_OP -D OP_PTC_%s%s",
- ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), depth,
- angleInDegrees ? "AD" : "AR",
+ format("-D dstT=%s -D rowsPerWI=%d -D depth=%d -D BINARY_OP -D OP_PTC_%s%s",
+ ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), rowsPerWI,
+ depth, angleInDegrees ? "AD" : "AR",
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::ReadOnlyNoSize(angle),
ocl::KernelArg::WriteOnly(dst1, cn), ocl::KernelArg::WriteOnlyNoSize(dst2));
- size_t globalsize[2] = { dst1.cols * cn, dst1.rows };
+ size_t globalsize[2] = { dst1.cols * cn, (dst1.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
bool is_ipower, int ipower)
{
- int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
- bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+ const ocl::Device & d = ocl::Device::getDefault();
+ int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+ rowsPerWI = d.isIntel() ? 4 : 1;
+ bool doubleSupport = d.doubleFPConfig() > 0;
if (depth == CV_64F && !doubleSupport)
return false;
const char * const op = issqrt ? "OP_SQRT" : is_ipower ? "OP_POWN" : "OP_POW";
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
- format("-D dstT=%s -D %s -D UNARY_OP%s", ocl::typeToStr(depth),
- op, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+ format("-D dstT=%s -D rowsPerWI=%d -D %s -D UNARY_OP%s", ocl::typeToStr(depth),
+ rowsPerWI, op, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty())
return false;
k.args(srcarg, dstarg, power);
}
- size_t globalsize[2] = { dst.cols * cn, dst.rows };
+ size_t globalsize[2] = { dst.cols * cn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
static bool ocl_patchNaNs( InputOutputArray _a, float value )
{
+ int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
- format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=int"));
+ format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=int -D rowsPerWI=%d",
+ rowsPerWI));
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnlyNoSize(a),
ocl::KernelArg::WriteOnly(a, cn), (float)value);
- size_t globalsize[2] = { a.cols * cn, a.rows };
+ size_t globalsize[2] = { a.cols * cn, (a.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type )
{
+ const ocl::Device & d = ocl::Device::getDefault();
int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F),
- kercn = ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
- bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+ kercn = ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
+ bool doubleSupport = d.doubleFPConfig() > 0;
Size size = _src1.size();
if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() )
char cvt[2][50];
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s"
- " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s -D wdepth=%d%s",
+ " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s"
+ " -D wdepth=%d%s -D rowsPerWI=%d",
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]),
ocl::typeToStr(wdepth), wdepth,
- doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+ doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI));
if (k.empty())
return false;
else
k.args(src1arg, src2arg, dstarg, alpha);
- size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows };
+ size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
#define EXTRA_PARAMS
#define EXTRA_INDEX
+#define EXTRA_INDEX_ADD
#if defined OP_ADD
#define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2))
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2
#undef EXTRA_INDEX
- #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2))
+ #define EXTRA_INDEX int dst_index2 = mad24(y0, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2))
+ #undef EXTRA_INDEX_ADD
+ #define EXTRA_INDEX_ADD dst_index2 += dststep2
#endif
#if defined UNARY_OP || defined MASK_UNARY_OP
int rows, int cols EXTRA_PARAMS )
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y0 = get_global_id(1) * rowsPerWI;
- if (x < cols && y < rows)
+ if (x < cols)
{
- int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
+ int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
- int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
+ int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
#endif
- int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
+ int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
EXTRA_INDEX;
- PROCESS_ELEM;
+ for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)
+ {
+ PROCESS_ELEM;
+#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
+ src2_index += srcstep2;
+#endif
+ EXTRA_INDEX_ADD;
+ }
}
}
int rows, int cols EXTRA_PARAMS )
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y0 = get_global_id(1) * rowsPerWI;
- if (x < cols && y < rows)
+ if (x < cols)
{
- int mask_index = mad24(y, maskstep, x + maskoffset);
- if( mask[mask_index] )
- {
- int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
- int src2_index = mad24(y, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
- int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
-
- PROCESS_ELEM;
- }
+ int mask_index = mad24(y0, maskstep, x + maskoffset);
+ int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
+ int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
+ int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
+
+ for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, src2_index += srcstep2,
+ mask_index += maskstep, dst_index += dststep)
+ if (mask[mask_index])
+ {
+ PROCESS_ELEM;
+ }
}
}
int rows, int cols EXTRA_PARAMS )
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y0 = get_global_id(1) * rowsPerWI;
- if (x < cols && y < rows)
+ if (x < cols)
{
- int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
- int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
+ int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
+ int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
- PROCESS_ELEM;
+ for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)
+ {
+ PROCESS_ELEM;
+ }
}
}
int rows, int cols EXTRA_PARAMS )
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y0 = get_global_id(1);
- if (x < cols && y < rows)
+ if (x < cols)
{
- int mask_index = mad24(y, maskstep, x + maskoffset);
- if( mask[mask_index] )
- {
- int src1_index = mad24(y, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
- int dst_index = mad24(y, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
-
- PROCESS_ELEM;
- }
+ int mask_index = mad24(y0, maskstep, x + maskoffset);
+ int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
+ int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
+
+ for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, mask_index += maskstep, dst_index += dststep)
+ if (mask[mask_index])
+ {
+ PROCESS_ELEM;
+ }
}
}
static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
{
+ const ocl::Device & d = ocl::Device::getDefault();
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
- bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
+ bool doubleSupport = d.doubleFPConfig() > 0,
haveMask = _mask.kind() != _InputArray::NONE;
if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
if (depth != CV_8U && depth != CV_16U)
{
- int wdepth = std::max(CV_32S, depth);
+ int wdepth = std::max(CV_32S, depth), rowsPerWI = d.isIntel() ? 4 : 1;
char cvt[50];
ocl::Kernel kabs("KF", ocl::core::arithm_oclsrc,
- format("-D UNARY_OP -D OP_ABS_NOSAT -D dstT=%s -D srcT1=%s -D convertToDT=%s%s",
+ format("-D UNARY_OP -D OP_ABS_NOSAT -D dstT=%s -D srcT1=%s"
+ " -D convertToDT=%s -D rowsPerWI=%d%s",
ocl::typeToStr(wdepth), ocl::typeToStr(depth),
- ocl::convertTypeStr(depth, wdepth, 1, cvt),
+ ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI,
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (kabs.empty())
return false;
abssrc.create(src.size(), CV_MAKE_TYPE(wdepth, cn));
kabs.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(abssrc, cn));
- size_t globalsize[2] = { src.cols * cn, src.rows };
+ size_t globalsize[2] = { src.cols * cn, (src.rows + rowsPerWI - 1) / rowsPerWI };
if (!kabs.run(2, globalsize, NULL, false))
return false;
}
}
else
{
- int dbsize = ocl::Device::getDefault().maxComputeUnits();
- size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+ int dbsize = d.maxComputeUnits();
+ size_t wgs = d.maxWorkGroupSize();
int wgs2_aligned = 1;
while (wgs2_aligned < (int)wgs)
static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
{
- int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
- bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+ const ocl::Device & d = ocl::Device::getDefault();
+ int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), rowsPerWI = d.isIntel() ? 4 : 1;
+ bool doubleSupport = d.doubleFPConfig() > 0;
bool relative = (normType & NORM_RELATIVE) != 0;
normType &= ~NORM_RELATIVE;
char cvt[50];
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D BINARY_OP -D OP_ABSDIFF -D dstT=%s -D workT=dstT -D srcT1=%s -D srcT2=srcT1"
- " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT%s",
+ " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT -D rowsPerWI=%d%s",
ocl::typeToStr(wdepth), ocl::typeToStr(depth),
- ocl::convertTypeStr(depth, wdepth, 1, cvt),
+ ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI,
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2),
ocl::KernelArg::WriteOnly(diff, cn));
- size_t globalsize[2] = { diff.cols * cn, diff.rows };
+ size_t globalsize[2] = { diff.cols * cn, (diff.rows + rowsPerWI - 1) / rowsPerWI };
if (!k.run(2, globalsize, NULL, false))
return false;