VENDOR_NVIDIA=3
};
int vendorID() const;
- inline bool isAMD() const { return vendorID() == VENDOR_AMD; };
- inline bool isIntel() const { return vendorID() == VENDOR_INTEL; };
+ inline bool isAMD() const { return vendorID() == VENDOR_AMD; }
+ inline bool isIntel() const { return vendorID() == VENDOR_INTEL; }
int maxClockFrequency() const;
int maxComputeUnits() const;
{
public:
enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8, PTR_ONLY = 16, NO_SIZE=256 };
- KernelArg(int _flags, UMat* _m, int wscale=1, const void* _obj=0, size_t _sz=0);
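+ // wscale/iwscale scale the 'cols' value reported to the kernel: cols = m->cols*wscale/iwscale (see Kernel::set)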
+ KernelArg(int _flags, UMat* _m, int wscale=1, int iwscale=1, const void* _obj=0, size_t _sz=0);
KernelArg();
static KernelArg Local() { return KernelArg(LOCAL, 0); }
{ return KernelArg(PTR_ONLY+READ_ONLY, (UMat*)&m); }
static KernelArg PtrReadWrite(const UMat& m)
{ return KernelArg(PTR_ONLY+READ_WRITE, (UMat*)&m); }
- static KernelArg ReadWrite(const UMat& m, int wscale=1)
- { return KernelArg(READ_WRITE, (UMat*)&m, wscale); }
- static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1)
- { return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale); }
- static KernelArg ReadOnly(const UMat& m, int wscale=1)
- { return KernelArg(READ_ONLY, (UMat*)&m, wscale); }
- static KernelArg WriteOnly(const UMat& m, int wscale=1)
- { return KernelArg(WRITE_ONLY, (UMat*)&m, wscale); }
- static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1)
- { return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale); }
- static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1)
- { return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale); }
+ static KernelArg ReadWrite(const UMat& m, int wscale=1, int iwscale=1)
+ { return KernelArg(READ_WRITE, (UMat*)&m, wscale, iwscale); }
+ static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1, int iwscale=1)
+ { return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale, iwscale); }
+ static KernelArg ReadOnly(const UMat& m, int wscale=1, int iwscale=1)
+ { return KernelArg(READ_ONLY, (UMat*)&m, wscale, iwscale); }
+ static KernelArg WriteOnly(const UMat& m, int wscale=1, int iwscale=1)
+ { return KernelArg(WRITE_ONLY, (UMat*)&m, wscale, iwscale); }
+ static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1, int iwscale=1)
+ { return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale, iwscale); }
+ static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1, int iwscale=1)
+ { return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale, iwscale); }
static KernelArg Constant(const Mat& m);
template<typename _Tp> static KernelArg Constant(const _Tp* arr, size_t n)
- { return KernelArg(CONSTANT, 0, 1, (void*)arr, n); }
+ { return KernelArg(CONSTANT, 0, 1, 1, (void*)arr, n); }
int flags;
UMat* m;
const void* obj;
size_t sz;
- int wscale;
+ int wscale, iwscale;
};
CV_EXPORTS const char* memopTypeToStr(int t);
CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1);
CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info);
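+// Returns the device's preferred vector width for the arrays' element type when the offset,
+// step and row length of every non-empty argument are divisible by it; otherwise returns 1.
+// Typical use: int kercn = cn == 3 || cn > 4 ? 1 : ocl::predictOptimalVectorWidth(src1, src2, dst);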
+CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
+ InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
+ InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray());
class CV_EXPORTS Image2D
{
int cn = CV_MAT_CN(srctype);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
-
if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
(!doubleSupport && srcdepth == CV_64F && !bitwise))
return false;
char opts[1024];
- int kercn = haveMask || haveScalar ? cn : 1;
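+ // kercn = elements processed per work-item; keep one pixel (cn channels) per item when a mask or scalar is involved, otherwise use the device's preferred vector width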
+ int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
int scalarcn = kercn == 3 ? 4 : kercn;
sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d",
- (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop],
+ haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop],
bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) :
kercn);
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
- if( k.empty() )
+ if (k.empty())
return false;
UMat src1 = _src1.getUMat(), src2;
UMat dst = _dst.getUMat(), mask = _mask.getUMat();
- int cscale = cn/kercn;
- ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale);
- ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) :
- ocl::KernelArg::WriteOnly(dst, cscale);
+ ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
+ ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
+ ocl::KernelArg::WriteOnly(dst, cn, kercn);
ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
if( haveScalar )
convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1);
}
- ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz);
+ ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
if( !haveMask )
k.args(src1arg, dstarg, scalararg);
else
{
src2 = _src2.getUMat();
- ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale);
+ ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);
if( !haveMask )
k.args(src1arg, src2arg, dstarg);
k.args(src1arg, src2arg, maskarg, dstarg);
}
- size_t globalsize[] = { src1.cols*(cn/kercn), src1.rows };
+ size_t globalsize[] = { src1.cols * cn / kercn, src1.rows };
return k.run(2, globalsize, 0, false);
}
if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
return false;
- int kercn = haveMask || haveScalar ? cn : 1;
+ int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
int scalarcn = kercn == 3 ? 4 : kercn;
char cvtstr[4][32], opts[1024];
UMat src1 = _src1.getUMat(), src2;
UMat dst = _dst.getUMat(), mask = _mask.getUMat();
- int cscale = cn/kercn;
-
- ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale);
- ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) :
- ocl::KernelArg::WriteOnly(dst, cscale);
+ ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
+ ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
+ ocl::KernelArg::WriteOnly(dst, cn, kercn);
ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
if( haveScalar )
if( !src2sc.empty() )
convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1);
- ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz);
+ ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
if( !haveMask )
{
k.args(src1arg, dstarg, scalararg);
else if(n == 1)
k.args(src1arg, dstarg, scalararg,
- ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz));
+ ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
else
CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
}
else
{
src2 = _src2.getUMat();
- ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale);
+ ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);
if( !haveMask )
{
k.args(src1arg, src2arg, dstarg);
else if(n == 1)
k.args(src1arg, src2arg, dstarg,
- ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz));
+ ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
else if(n == 3)
k.args(src1arg, src2arg, dstarg,
- ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz),
- ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
- ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz));
+ ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz),
+ ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
+ ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz));
else
CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
}
k.args(src1arg, src2arg, maskarg, dstarg);
}
- size_t globalsize[] = { src1.cols * cscale, src1.rows };
+ size_t globalsize[] = { src1.cols * cn / kercn, src1.rows };
return k.run(2, globalsize, NULL, false);
}
static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
{
- int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
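+ // 3-channel and >4-channel inputs are processed element-wise; otherwise vectorize by the device's preferred width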
+ int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+ kercn = cn > 4 || cn == 3 ? 1 : ocl::predictOptimalVectorWidth(_src, _dst);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if (!doubleSupport && depth == CV_64F)
char cvt[2][50];
int wdepth = std::max(depth, CV_32F);
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
- format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=uchar -D srcT1=%s"
- " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s%s",
- ocl::typeToStr(depth), ocl::typeToStr(wdepth), wdepth,
- ocl::convertTypeStr(depth, wdepth, 1, cvt[0]),
- ocl::convertTypeStr(wdepth, CV_8U, 1, cvt[1]),
+ format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s"
+ " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s -D workT1=%s%s",
+ ocl::typeToStr(CV_8UC(kercn)),
+ ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
+ ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth,
+ ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
+ ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]),
+ ocl::typeToStr(wdepth),
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty())
return false;
- _dst.createSameSize(_src, CV_8UC(cn));
- UMat src = _src.getUMat(), dst = _dst.getUMat();
+ UMat src = _src.getUMat();
+ _dst.create(src.size(), CV_8UC(cn));
+ UMat dst = _dst.getUMat();
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
- dstarg = ocl::KernelArg::WriteOnly(dst, cn);
+ dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
if (wdepth == CV_32F)
k.args(srcarg, dstarg, (float)alpha, (float)beta);
else if (wdepth == CV_64F)
k.args(srcarg, dstarg, alpha, beta);
- size_t globalsize[2] = { src.cols * cn, src.rows };
+ size_t globalsize[2] = { src.cols * cn / kercn, src.rows };
return k.run(2, globalsize, NULL, false);
}
static bool ocl_math_op(InputArray _src1, InputArray _src2, OutputArray _dst, int oclop)
{
- int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn1 = CV_MAT_CN(type1);
- int type2 = _src2.type(), cn2 = CV_MAT_CN(type2);
-
- char opts[1024];
+ int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+ int kercn = cn == 3 || cn > 4 || oclop == OCL_OP_PHASE_DEGREES ||
+ oclop == OCL_OP_PHASE_RADIANS ? 1 : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
- bool double_support = false;
- if(ocl::Device::getDefault().doubleFPConfig() > 0)
- double_support = true;
- if(!double_support && depth1 == CV_64F)
+ bool double_support = ocl::Device::getDefault().doubleFPConfig() > 0;
+ if (!double_support && depth == CV_64F)
return false;
- sprintf(opts, "-D %s -D %s -D dstT=%s %s", _src2.empty()?"UNARY_OP":"BINARY_OP",
- oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, 1) ), double_support ? "-D DOUBLE_SUPPORT" : "" );
-
- ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
- if( k.empty() )
+ ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
+ format("-D %s -D %s -D dstT=%s%s", _src2.empty() ? "UNARY_OP" : "BINARY_OP",
+ oclop2str[oclop], ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
+ double_support ? " -D DOUBLE_SUPPORT" : ""));
+ if (k.empty())
return false;
- UMat src1 = _src1.getUMat();
- UMat src2 = _src2.getUMat();
- _dst.create(src1.size(), type1);
+ UMat src1 = _src1.getUMat(), src2 = _src2.getUMat();
+ _dst.create(src1.size(), type);
UMat dst = _dst.getUMat();
- ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn1);
- ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn2);
- ocl::KernelArg dstarg = ocl::KernelArg::WriteOnly(dst, cn1);
+ ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
+ src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
+ dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
- if(_src2.empty())
+ if (src2.empty())
k.args(src1arg, dstarg);
else
k.args(src1arg, src2arg, dstarg);
- size_t globalsize[] = { src1.cols*cn1, src1.rows};
-
+ size_t globalsize[] = { src1.cols * cn / kercn, src1.rows };
return k.run(2, globalsize, 0, false);
}
static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type )
{
- int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F);
+ int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F),
+ kercn = cn == 3 || cn > 4 ? 1 : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
Size size = _src1.size();
char cvt[2][50];
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
- format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D wdepth=%d -D convertToWT1=%s"
- " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s%s", ocl::typeToStr(depth),
- ocl::typeToStr(wdepth), wdepth, ocl::convertTypeStr(depth, wdepth, 1, cvt[0]),
- ocl::convertTypeStr(wdepth, depth, 1, cvt[1]),
+ format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s"
+ " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s -D wdepth=%d%s",
+ ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
+ ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
+ ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
+ ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]),
+ ocl::typeToStr(wdepth), wdepth,
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty())
return false;
+ UMat src1 = _src1.getUMat(), src2 = _src2.getUMat();
_dst.create(size, type);
- UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat();
+ UMat dst = _dst.getUMat();
ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
- dstarg = ocl::KernelArg::WriteOnly(dst, cn);
+ dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
if (wdepth == CV_32F)
k.args(src1arg, src2arg, dstarg, (float)alpha);
else
k.args(src1arg, src2arg, dstarg, alpha);
- size_t globalsize[2] = { dst.cols * cn, dst.rows };
+ size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows };
return k.run(2, globalsize, NULL, false);
}
/////////////////////////////////////////// KernelArg /////////////////////////////////////////////
KernelArg::KernelArg()
- : flags(0), m(0), obj(0), sz(0), wscale(1)
+ : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
{
}
-KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, const void* _obj, size_t _sz)
- : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale)
+KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
+ : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
{
}
KernelArg KernelArg::Constant(const Mat& m)
{
CV_Assert(m.isContinuous());
- return KernelArg(CONSTANT, 0, 1, m.data, m.total()*m.elemSize());
+ return KernelArg(CONSTANT, 0, 0, 0, m.data, m.total()*m.elemSize());
}
/////////////////////////////////////////// Kernel /////////////////////////////////////////////
if( !(arg.flags & KernelArg::NO_SIZE) )
{
- int cols = u2d.cols*arg.wscale;
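+ // iwscale divides the reported width, so vectorized kernels (wscale=cn, iwscale=kercn) see cols*cn/kercn columns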
+ int cols = u2d.cols*arg.wscale/arg.iwscale;
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
i += 2;
i += 4;
if( !(arg.flags & KernelArg::NO_SIZE) )
{
- int cols = u3d.cols*arg.wscale;
+ int cols = u3d.cols*arg.wscale/arg.iwscale;
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices) == CL_SUCCESS);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
for (int i = 0; i < dims; i++)
{
size_t val = _localsize ? _localsize[i] :
- dims == 1 ? 64 : dims == 2 ? (16>>i) : dims == 3 ? (8>>(int)(i>0)) : 1;
+ dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
CV_Assert( val > 0 );
total *= _globalsize[i];
globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
{
static const char* tab[]=
{
- "uchar", "uchar2", "uchar3", "uchar4",
- "char", "char2", "char3", "char4",
- "ushort", "ushort2", "ushort3", "ushort4",
- "short", "short2", "short3", "short4",
- "int", "int2", "int3", "int4",
- "float", "float2", "float3", "float4",
- "double", "double2", "double3", "double4",
- "?", "?", "?", "?"
+ "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
+ "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
+ "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
+ "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
+ "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
+ "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
+ "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
+ "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
};
int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
- return cn > 4 ? "?" : tab[depth*4 + cn-1];
+ return cn > 16 ? "?" : tab[depth*16 + cn-1];
}
const char* memopTypeToStr(int type)
{
static const char* tab[] =
{
- "uchar", "uchar2", "uchar3", "uchar4",
- "uchar", "uchar2", "uchar3", "uchar4",
- "ushort", "ushort2", "ushort3", "ushort4",
- "ushort", "ushort2", "ushort3", "ushort4",
- "int", "int2", "int3", "int4",
- "int", "int2", "int3", "int4",
- "ulong", "ulong2", "ulong3", "ulong4",
- "?", "?", "?", "?"
+ "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
+ "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
+ "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
+ "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
+ "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
+ "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
+ "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
+ "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
};
int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
- return cn > 4 ? "?" : tab[depth*4 + cn-1];
+ return cn > 16 ? "?" : tab[depth*16 + cn-1];
}
const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
return cv::format(" -D COEFF=%s", func(kernel).c_str());
}
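+// PROCESS_SRC records the offset, step and row length (in channels) of each non-empty argument;
+// a type or size mismatch with src1 makes predictOptimalVectorWidth() return 1 right away.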
+#define PROCESS_SRC(src) \
+ do \
+ { \
+ if (!src.empty()) \
+ { \
+ CV_Assert(src.isMat() || src.isUMat()); \
+ int ctype = src.type(), ccn = CV_MAT_CN(ctype); \
+ Size csize = src.size(); \
+ if (ctype != type || csize != ssize) \
+ return 1; \
+ offsets.push_back(src.offset()); \
+ steps.push_back(src.step()); \
+ cols.push_back(ccn * csize.width); \
+ } \
+ } \
+ while ((void)0, 0)
+
+int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
+ InputArray src4, InputArray src5, InputArray src6,
+ InputArray src7, InputArray src8, InputArray src9)
+{
+ int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+ Size ssize = src1.size();
+ const ocl::Device & d = ocl::Device::getDefault();
+
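+ // device-preferred vector widths indexed by depth (CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F)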
+ int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
+ d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
+ d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
+ d.preferredVectorWidthDouble(), -1 }, width = vectorWidths[depth];
+ CV_Assert(width >= 0);
+
+ if (ssize.width * cn < width)
+ return 1;
+
+ std::vector<size_t> offsets, steps, cols;
+ PROCESS_SRC(src1);
+ PROCESS_SRC(src2);
+ PROCESS_SRC(src3);
+ PROCESS_SRC(src4);
+ PROCESS_SRC(src5);
+ PROCESS_SRC(src6);
+ PROCESS_SRC(src7);
+ PROCESS_SRC(src8);
+ PROCESS_SRC(src9);
+
+ size_t size = offsets.size();
+ std::vector<int> dividers(size, width);
+
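+ // halve each candidate width until it evenly divides that array's offset, step and row length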
+ for (size_t i = 0; i < size; ++i)
+ while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % dividers[i] != 0)
+ dividers[i] >>= 1;
+
+ // default strategy: accept the common width only if every array supports it, otherwise fall back to 1
+ for (size_t i = 0; i < size; ++i)
+ if (dividers[i] != width)
+ {
+ width = 1;
+ break;
+ }
+
+ // alternative strategy (disabled): take the smallest width that every array supports
+// width = *std::min_element(dividers.begin(), dividers.end());
+
+ return width;
+}
+
+#undef PROCESS_SRC
+
/////////////////////////////////////////// Image2D ////////////////////////////////////////////////////
struct Image2D::Impl
#elif defined OP_CONVERT_SCALE_ABS
#undef EXTRA_PARAMS
-#define EXTRA_PARAMS , workT alpha, workT beta
+#define EXTRA_PARAMS , workT1 alpha, workT1 beta
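+// alpha and beta now arrive as scalars (workT1) and are broadcast to the vector work type inside mad()/mad24()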
#if wdepth <= 4
#define PROCESS_ELEM \
- workT value = mad24(srcelem1, alpha, beta); \
+ workT value = mad24(srcelem1, (workT)(alpha), (workT)(beta)); \
storedst(convertToDT(value >= 0 ? value : -value))
#else
#define PROCESS_ELEM \
- workT value = mad(srcelem1, alpha, beta); \
+ workT value = mad(srcelem1, (workT)(alpha), (workT)(beta)); \
storedst(convertToDT(value >= 0 ? value : -value))
#endif
#elif defined OP_SCALE_ADD
#undef EXTRA_PARAMS
-#define EXTRA_PARAMS , workT alpha
+#define EXTRA_PARAMS , workT1 alpha
#if wdepth <= 4
-#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, srcelem2)))
+#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, (workT)(alpha), srcelem2)))
#else
-#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, alpha, srcelem2)))
+#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, (workT)(alpha), srcelem2)))
#endif
#elif defined OP_CTP_AD || defined OP_CTP_AR
ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts);
if( !setK.empty() )
{
- ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE1(tp)*scalarcn);
+ ocl::KernelArg scalararg(0, 0, 0, 0, buf, CV_ELEM_SIZE1(tp)*scalarcn);
UMat mask;
if( haveMask )
matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
- ocl::KernelArg(0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
+ ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
size_t globalThreads[2] = { dst.cols, dst.rows };
return k.run(2, globalThreads, NULL, false);
static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, double maxval, int thresh_type )
{
- int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), ktype = CV_MAKE_TYPE(depth, 1);
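+ // 1-, 2- and 4-channel images are processed one pixel per work-item; otherwise use the device's preferred vector width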
+ int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+ kercn = cn <= 4 && cn != 3 ? cn : ocl::predictOptimalVectorWidth(_src, _dst),
+ ktype = CV_MAKE_TYPE(depth, kercn);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if ( !(thresh_type == THRESH_BINARY || thresh_type == THRESH_BINARY_INV || thresh_type == THRESH_TRUNC ||
if (depth <= CV_32S)
thresh = cvFloor(thresh);
- k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn),
- ocl::KernelArg::Constant(Mat(1, 1, ktype, thresh)),
- ocl::KernelArg::Constant(Mat(1, 1, ktype, maxval)));
+ k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn, kercn),
+ ocl::KernelArg::Constant(Mat(1, 1, ktype, Scalar::all(thresh))),
+ ocl::KernelArg::Constant(Mat(1, 1, ktype, Scalar::all(maxval))));
- size_t globalsize[2] = { dst.cols * cn, dst.rows };
+ size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows };
return k.run(2, globalsize, NULL, false);
}