\r
namespace\r
{\r
- typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);\r
- typedef NppStatus (*npp_arithm_16u_t)(const Npp16u* pSrc1, int nSrc1Step, const Npp16u* pSrc2, int nSrc2Step, Npp16u* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);\r
- typedef NppStatus (*npp_arithm_16s_t)(const Npp16s* pSrc1, int nSrc1Step, const Npp16s* pSrc2, int nSrc2Step, Npp16s* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);\r
- typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);\r
- typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);\r
+ template<int DEPTH> struct NppTypeTraits;\r
+ template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };\r
+ template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };\r
+ template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };\r
+ template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; };\r
+ template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; };\r
+ template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };\r
+ template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };\r
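+ // The traits above give each OpenCV depth constant its matching NPP scalar
+ // type at compile time. Minimal usage sketch (illustrative only; dst is
+ // assumed to be a valid GpuMat):
+ //     typedef NppTypeTraits<CV_16U>::npp_t npp_t;   // resolves to Npp16u
+ //     npp_t* row = dst.ptr<npp_t>(0);               // typed pointer to row 0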
\r
- bool nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst,\r
- npp_arithm_8u_t npp_func_8uc1, npp_arithm_8u_t npp_func_8uc4,\r
- npp_arithm_16u_t npp_func_16uc1, npp_arithm_16u_t npp_func_16uc4,\r
- npp_arithm_16s_t npp_func_16sc1, npp_arithm_16s_t npp_func_16sc4,\r
- npp_arithm_32s_t npp_func_32sc1, \r
- npp_arithm_32f_t npp_func_32fc1, npp_arithm_32f_t npp_func_32fc4,\r
- cudaStream_t stream)\r
+ template <int DEPTH> struct NppArithmFunc\r
{\r
- bool useNpp = (src1.depth() == CV_8U || src1.depth() == CV_16U || src1.depth() == CV_16S || src1.depth() == CV_32S || src1.depth() == CV_32F);\r
+ typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;\r
+ \r
+ typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);\r
+ };\r
+ template <> struct NppArithmFunc<CV_32F>
+ {
+ typedef NppTypeTraits<CV_32F>::npp_t npp_t;

- if (!useNpp)
- return false;

+ typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI);
+ };
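+ // NppArithmFunc is specialized for CV_32F because NPP integer arithmetic is
+ // scaled: the *RSfs variants shift the result right by nScaleFactor before
+ // saturating, while the 32f variants take no scale argument. Approximate
+ // per-pixel sketch of the 8-bit case (exact NPP rounding may differ):
+ //     int sum = (int(a) + int(b)) >> nScaleFactor;  // scaled integer result
+ //     return sum > 255 ? 255 : (Npp8u)sum;          // saturated to 8 bits
+ // The 32f path is plainly a + b, hence the scale-free func_t above.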
\r
- bool aligned = isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16);\r
+ template <int DEPTH, typename NppArithmFunc<DEPTH>::func_t func> struct NppArithm\r
+ {\r
+ typedef typename NppArithmFunc<DEPTH>::npp_t npp_t;\r
\r
- NppiSize sz;\r
- sz.width = src1.cols * src1.channels();\r
- sz.height = src1.rows;\r
+ static void call(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
+ {\r
+ NppStreamHandler h(stream);\r
\r
- NppStreamHandler h(stream);\r
+ NppiSize sz;\r
+ sz.width = src1.cols;\r
+ sz.height = src1.rows;\r
\r
- if (aligned && src1.depth() == CV_8U && (sz.width % 4) == 0)\r
- {\r
- sz.width /= 4;\r
+ nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step), \r
+ (npp_t*)dst.data, static_cast<int>(dst.step), sz, 0) );\r
\r
- nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), \r
- dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, 0) );\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
- else if (src1.depth() == CV_8U)\r
+ static void call(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
{\r
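+ // 'scale' is intentionally ignored: callers take this overload only when
+ // scale == 1, so it simply forwards to the unscaled call.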
- nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), \r
- dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, 0) );\r
+ call(src1, src2, dst, PtrStepb(), stream);\r
}\r
- else if (aligned && src1.depth() == CV_16U && (sz.width % 4) == 0)\r
- {\r
- sz.width /= 4;\r
+ };\r
+ template <typename NppArithmFunc<CV_32F>::func_t func> struct NppArithm<CV_32F, func>\r
+ {\r
+ typedef typename NppArithmFunc<CV_32F>::npp_t npp_t;\r
\r
- nppSafeCall( npp_func_16uc4(src1.ptr<Npp16u>(), static_cast<int>(src1.step), src2.ptr<Npp16u>(), static_cast<int>(src2.step), \r
- dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz, 0) );\r
- }\r
- else if (src1.depth() == CV_16U)\r
+ static void call(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
{\r
- nppSafeCall( npp_func_16uc1(src1.ptr<Npp16u>(), static_cast<int>(src1.step), src2.ptr<Npp16u>(), static_cast<int>(src2.step), \r
- dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz, 0) );\r
- }\r
- else if (aligned && src1.depth() == CV_16S && (sz.width % 4) == 0)\r
- {\r
- sz.width /= 4;\r
+ NppStreamHandler h(stream);\r
\r
- nppSafeCall( npp_func_16sc4(src1.ptr<Npp16s>(), static_cast<int>(src1.step), src2.ptr<Npp16s>(), static_cast<int>(src2.step), \r
- dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz, 0) );\r
- }\r
- else if (src1.depth() == CV_16S)\r
- {\r
- nppSafeCall( npp_func_16sc1(src1.ptr<Npp16s>(), static_cast<int>(src1.step), src2.ptr<Npp16s>(), static_cast<int>(src2.step), \r
- dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz, 0) );\r
- }\r
- else if (src1.depth() == CV_32S)\r
- {\r
- nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), static_cast<int>(src1.step), src2.ptr<Npp32s>(), static_cast<int>(src2.step), \r
- dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz, 0) );\r
- }\r
- else if (aligned && src1.depth() == CV_32F && (sz.width % 4) == 0)\r
- {\r
- sz.width /= 4;\r
+ NppiSize sz;\r
+ sz.width = src1.cols;\r
+ sz.height = src1.rows;\r
+ \r
+ nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step), \r
+ (npp_t*)dst.data, static_cast<int>(dst.step), sz) );\r
\r
- nppSafeCall( npp_func_32fc4(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step), \r
- dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
- else // if (src1.depth() == CV_32F)\r
+ static void call(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
{\r
- nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step), \r
- dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );\r
+ call(src1, src2, dst, PtrStepb(), stream);\r
}\r
-\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-\r
- return true;\r
- }\r
+ };\r
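+
+ // Binding the NPP function as a non-type template argument makes each
+ // NppArithm<DEPTH, func>::call a distinct, stateless function with one
+ // uniform signature, so it can sit in the plain function-pointer tables
+ // below. Hypothetical direct use (src1, src2, dst assumed to be same-sized
+ // DevMem2Db buffers of the right depth; null stream is synchronous):
+ //     NppArithm<CV_8U, nppiAdd_8u_C1RSfs>::call(src1, src2, dst, PtrStepb(), 0);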
}\r
\r
////////////////////////////////////////////////////////////////////////\r
{0/*add_gpu<double, unsigned char>*/, 0/*add_gpu<double, signed char>*/, 0/*add_gpu<double, unsigned short>*/, 0/*add_gpu<double, short>*/, 0/*add_gpu<double, int>*/, 0/*add_gpu<double, float>*/, add_gpu<double, double>}\r
};\r
\r
+ static const func_t npp_funcs[7] = \r
+ {\r
+ NppArithm<CV_8U, nppiAdd_8u_C1RSfs>::call,\r
+ 0,\r
+ NppArithm<CV_16U, nppiAdd_16u_C1RSfs>::call,\r
+ NppArithm<CV_16S, nppiAdd_16s_C1RSfs>::call,\r
+ NppArithm<CV_32S, nppiAdd_32s_C1RSfs>::call,\r
+ NppArithm<CV_32F, nppiAdd_32f_C1R>::call,\r
+ add_gpu<double, double>\r
+ };\r
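+
+ // The table is indexed by depth: slot 1 (CV_8S) is intentionally null (no
+ // NPP variant is wired up for it, hence the CV_8S assert below), and slot 6
+ // (CV_64F) falls back to the CUDA kernel add_gpu<double, double>.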
+\r
+ CV_Assert(src1.depth() != CV_8S);
CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());\r
CV_Assert(mask.empty() || (src1.channels() == 1 && mask.size() == src1.size() && mask.type() == CV_8U));\r
\r
\r
if (mask.empty() && dst.type() == src1.type())\r
{\r
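+ // reshape(1) lets one C1 NPP function serve every channel count: an
+ // element-wise op ignores channel boundaries, so an n-channel matrix is
+ // viewed as single-channel with n times the columns (e.g. 480x640 CV_8UC4
+ // becomes a 480x2560 CV_8UC1 view of the same bytes).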
- if (nppArithmCaller(src1, src2, dst,\r
- nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, \r
- nppiAdd_16u_C1RSfs, nppiAdd_16u_C4RSfs,\r
- nppiAdd_16s_C1RSfs, nppiAdd_16s_C4RSfs,\r
- nppiAdd_32s_C1RSfs, \r
- nppiAdd_32f_C1R, nppiAdd_32f_C4R, \r
- stream))\r
- {\r
- return;\r
- }\r
+ npp_funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), PtrStepb(), stream);\r
+ return;\r
}\r
\r
const func_t func = funcs[src1.depth()][dst.depth()];\r
\r
namespace\r
{\r
- template<int type> struct NppTypeTraits;\r
- template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };\r
- template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };\r
- template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };\r
- template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; };\r
- template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; };\r
- template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };\r
- template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };\r
-\r
template<int DEPTH, int cn> struct NppArithmScalarFunc\r
{\r
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;\r
{0/*subtract_gpu<double, unsigned char>*/, 0/*subtract_gpu<double, signed char>*/, 0/*subtract_gpu<double, unsigned short>*/, 0/*subtract_gpu<double, short>*/, 0/*subtract_gpu<double, int>*/, 0/*subtract_gpu<double, float>*/, subtract_gpu<double, double>}\r
};\r
\r
+ static const func_t npp_funcs[7] = \r
+ {\r
+ NppArithm<CV_8U, nppiSub_8u_C1RSfs>::call,\r
+ 0,\r
+ NppArithm<CV_16U, nppiSub_16u_C1RSfs>::call,\r
+ NppArithm<CV_16S, nppiSub_16s_C1RSfs>::call,\r
+ NppArithm<CV_32S, nppiSub_32s_C1RSfs>::call,\r
+ NppArithm<CV_32F, nppiSub_32f_C1R>::call,\r
+ subtract_gpu<double, double>\r
+ };\r
+\r
+ CV_Assert(src1.depth() != CV_8S);
CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());\r
CV_Assert(mask.empty() || (src1.channels() == 1 && mask.size() == src1.size() && mask.type() == CV_8U));\r
\r
\r
if (mask.empty() && dst.type() == src1.type())\r
{\r
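+ // Operand order note: NPP's Sub (and Div, below) compute pDst = pSrc2 - pSrc1,
+ // so src2 and src1 are passed swapped to get the OpenCV result src1 - src2.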
- if (nppArithmCaller(src2, src1, dst,\r
- nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, \r
- nppiSub_16u_C1RSfs, nppiSub_16u_C4RSfs,\r
- nppiSub_16s_C1RSfs, nppiSub_16s_C4RSfs,\r
- nppiSub_32s_C1RSfs, \r
- nppiSub_32f_C1R, nppiSub_32f_C4R, \r
- stream))\r
- {\r
- return;\r
- }\r
+ npp_funcs[src1.depth()](src2.reshape(1), src1.reshape(1), dst.reshape(1), PtrStepb(), stream);\r
+ return;\r
}\r
\r
const func_t func = funcs[src1.depth()][dst.depth()];\r
{0/*multiply_gpu<double, unsigned char>*/, 0/*multiply_gpu<double, signed char>*/, 0/*multiply_gpu<double, unsigned short>*/, 0/*multiply_gpu<double, short>*/, 0/*multiply_gpu<double, int>*/, 0/*multiply_gpu<double, float>*/, multiply_gpu<double, double>}\r
};\r
\r
+ static const func_t npp_funcs[7] = \r
+ {\r
+ NppArithm<CV_8U, nppiMul_8u_C1RSfs>::call,\r
+ 0,\r
+ NppArithm<CV_16U, nppiMul_16u_C1RSfs>::call,\r
+ NppArithm<CV_16S, nppiMul_16s_C1RSfs>::call,\r
+ NppArithm<CV_32S, nppiMul_32s_C1RSfs>::call,\r
+ NppArithm<CV_32F, nppiMul_32f_C1R>::call,\r
+ multiply_gpu<double, double>\r
+ };\r
+\r
cudaStream_t stream = StreamAccessor::getStream(s);\r
\r
if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)\r
}\r
else\r
{\r
+ CV_Assert(src1.depth() != CV_8S);
CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());\r
\r
if (dtype < 0)\r
\r
if (scale == 1 && dst.type() == src1.type())\r
{\r
- if (nppArithmCaller(src1, src2, dst,\r
- nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, \r
- nppiMul_16u_C1RSfs, nppiMul_16u_C4RSfs,\r
- nppiMul_16s_C1RSfs, nppiMul_16s_C4RSfs,\r
- nppiMul_32s_C1RSfs, \r
- nppiMul_32f_C1R, nppiMul_32f_C4R, \r
- stream))\r
- {\r
- return;\r
- }\r
+ npp_funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), 1, stream);\r
+ return;\r
}\r
\r
const func_t func = funcs[src1.depth()][dst.depth()];\r
{0/*divide_gpu<double, unsigned char>*/, 0/*divide_gpu<double, signed char>*/, 0/*divide_gpu<double, unsigned short>*/, 0/*divide_gpu<double, short>*/, 0/*divide_gpu<double, int>*/, 0/*divide_gpu<double, float>*/, divide_gpu<double, double>}\r
};\r
\r
+ static const func_t npp_funcs[7] = \r
+ {\r
+ NppArithm<CV_8U, nppiDiv_8u_C1RSfs>::call,\r
+ 0,\r
+ NppArithm<CV_16U, nppiDiv_16u_C1RSfs>::call,\r
+ NppArithm<CV_16S, nppiDiv_16s_C1RSfs>::call,\r
+ NppArithm<CV_32S, nppiDiv_32s_C1RSfs>::call,\r
+ NppArithm<CV_32F, nppiDiv_32f_C1R>::call,\r
+ divide_gpu<double, double>\r
+ };\r
+\r
cudaStream_t stream = StreamAccessor::getStream(s);\r
\r
if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)\r
multiply_gpu(static_cast<DevMem2D_<short4> >(src1), static_cast<DevMem2Df>(src2), static_cast<DevMem2D_<short4> >(dst), stream);\r
}\r
else\r
{
+ CV_Assert(src1.depth() != CV_8S);
CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());\r
\r
if (dtype < 0)\r
\r
if (scale == 1 && dst.type() == src1.type())\r
{\r
- if (nppArithmCaller(src2, src1, dst,\r
- nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, \r
- nppiDiv_16u_C1RSfs, nppiDiv_16u_C4RSfs,\r
- nppiDiv_16s_C1RSfs, nppiDiv_16s_C4RSfs,\r
- nppiDiv_32s_C1RSfs, \r
- nppiDiv_32f_C1R, nppiDiv_32f_C4R, \r
- stream))\r
- {\r
- return;\r
- }\r
+ npp_funcs[src1.depth()](src2.reshape(1), src1.reshape(1), dst.reshape(1), 1, stream);\r
+ return;\r
}\r
\r
const func_t func = funcs[src1.depth()][dst.depth()];\r