From: Vladislav Vinogradov Date: Mon, 29 Jul 2013 11:45:50 +0000 (+0400) Subject: used new device layer for cv::gpu::multiply X-Git-Tag: submit/tizen_ivi/20141117.190038~2^2~877^2~31 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e7dba695b3941fd124438bce79800d508bddf9eb;p=profile%2Fivi%2Fopencv.git used new device layer for cv::gpu::multiply --- diff --git a/modules/cudaarithm/src/cuda/mul_mat.cu b/modules/cudaarithm/src/cuda/mul_mat.cu index dda5963..f45e4e2 100644 --- a/modules/cudaarithm/src/cuda/mul_mat.cu +++ b/modules/cudaarithm/src/cuda/mul_mat.cu @@ -40,172 +40,185 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "opencv2/core/cuda/common.hpp" -#include "opencv2/core/cuda/functional.hpp" -#include "opencv2/core/cuda/transform.hpp" -#include "opencv2/core/cuda/saturate_cast.hpp" -#include "opencv2/core/cuda/simd_functions.hpp" +#ifndef HAVE_OPENCV_CUDEV -#include "arithm_func_traits.hpp" +#error "opencv_cudev is required" -using namespace cv::cuda; -using namespace cv::cuda::device; +#else -namespace arithm -{ - struct Mul_8uc4_32f : binary_function - { - __device__ __forceinline__ uint operator ()(uint a, float b) const - { - uint res = 0; - - res |= (saturate_cast((0xffu & (a )) * b) ); - res |= (saturate_cast((0xffu & (a >> 8)) * b) << 8); - res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16); - res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24); - - return res; - } - - __host__ __device__ __forceinline__ Mul_8uc4_32f() {} - __host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {} - }; +#include "opencv2/cudev.hpp" - struct Mul_16sc4_32f : binary_function - { - __device__ __forceinline__ short4 operator ()(short4 a, float b) const - { - return make_short4(saturate_cast(a.x * b), saturate_cast(a.y * b), - saturate_cast(a.z * b), saturate_cast(a.w * b)); - } +using namespace cv::cudev; - __host__ __device__ __forceinline__ Mul_16sc4_32f() {} - __host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {} - }; +void mulMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int); +void mulMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream); +void mulMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream); - template struct Mul : binary_function +namespace +{ + template struct MulOp : binary_function { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(a * b); } - - __host__ __device__ __forceinline__ Mul() {} - __host__ __device__ __forceinline__ Mul(const Mul&) {} }; - template struct MulScale : binary_function + template struct MulScaleOp : binary_function { S scale; - __host__ explicit MulScale(S scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(scale * a * b); } }; -} - -namespace cv { namespace cuda { namespace device -{ - template <> struct TransformFunctorTraits : arithm::ArithmFuncTraits - { - }; - template struct TransformFunctorTraits< arithm::Mul > : arithm::ArithmFuncTraits + template struct TransformPolicy : DefaultTransformPolicy { }; - - template struct TransformFunctorTraits< arithm::MulScale > : arithm::ArithmFuncTraits + template <> struct TransformPolicy : DefaultTransformPolicy { + enum { + shift = 1 + }; }; -}}} - -namespace arithm -{ - void mulMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) - { - device::transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream); - } - - void mulMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) - { - device::transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream); - } template - void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + void mulMatImpl(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream) { if (scale == 1) { - Mul op; - device::transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + MulOp op; + gridTransformBinary_< TransformPolicy >(globPtr(src1), globPtr(src2), globPtr(dst), op, stream); } else { - MulScale op(static_cast(scale)); - device::transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + MulScaleOp op; + op.scale = static_cast(scale); + gridTransformBinary_< TransformPolicy >(globPtr(src1), globPtr(src2), globPtr(dst), op, stream); } } +} + +void mulMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int) +{ + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, const GpuMat& dst, double scale, Stream& stream); + static const func_t funcs[7][7] = + { + { + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl + }, + { + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl + }, + { + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl + }, + { + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl, + mulMatImpl + }, + { + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + mulMatImpl, + mulMatImpl, + mulMatImpl + }, + { + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + mulMatImpl, + mulMatImpl + }, + { + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + 0 /*mulMatImpl*/, + mulMatImpl + } + }; + + const int sdepth = src1.depth(); + const int ddepth = dst.depth(); + + CV_DbgAssert( sdepth < 7 && ddepth < 7 ); + + GpuMat src1_ = src1.reshape(1); + GpuMat src2_ = src2.reshape(1); + GpuMat dst_ = dst.reshape(1); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, scale, stream); +} + +namespace +{ + template + struct MulOpSpecial : binary_function + { + __device__ __forceinline__ T operator ()(const T& a, float b) const + { + typedef typename VecTraits::elem_type elem_type; + + T res; + + res.x = saturate_cast(a.x * b); + res.y = saturate_cast(a.y * b); + res.z = saturate_cast(a.z * b); + res.w = saturate_cast(a.w * b); + + return res; + } + }; +} - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +void mulMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) +{ + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), MulOpSpecial(), stream); +} + +void mulMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) +{ + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), MulOpSpecial(), stream); } -#endif // CUDA_DISABLER +#endif diff --git a/modules/cudaarithm/src/cuda/mul_scalar.cu b/modules/cudaarithm/src/cuda/mul_scalar.cu index 341cd9b..4700d30 100644 --- a/modules/cudaarithm/src/cuda/mul_scalar.cu +++ b/modules/cudaarithm/src/cuda/mul_scalar.cu @@ -40,105 +40,143 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "opencv2/core/cuda/common.hpp" -#include "opencv2/core/cuda/functional.hpp" -#include "opencv2/core/cuda/transform.hpp" -#include "opencv2/core/cuda/saturate_cast.hpp" -#include "opencv2/core/cuda/simd_functions.hpp" +#ifndef HAVE_OPENCV_CUDEV -#include "arithm_func_traits.hpp" +#error "opencv_cudev is required" -using namespace cv::cuda; -using namespace cv::cuda::device; +#else -namespace arithm +#include "opencv2/cudev.hpp" + +using namespace cv::cudev; + +void mulScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int); + +namespace { - template struct MulScalar : unary_function + template struct MulScalarOp : unary_function { - S val; - - __host__ explicit MulScalar(S val_) : val(val_) {} + ScalarType val; - __device__ __forceinline__ D operator ()(T a) const + __device__ __forceinline__ DstType operator ()(SrcType a) const { - return saturate_cast(a * val); + return saturate_cast(saturate_cast(a) * val); } }; -} -namespace cv { namespace cuda { namespace device -{ - template struct TransformFunctorTraits< arithm::MulScalar > : arithm::ArithmFuncTraits + template struct TransformPolicy : DefaultTransformPolicy + { + }; + template <> struct TransformPolicy : DefaultTransformPolicy { + enum { + shift = 1 + }; }; -}}} -namespace arithm -{ - template - void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) + template + void mulScalarImpl(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream) { - MulScalar op(static_cast(val)); - device::transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + typedef typename MakeVec::cn>::type ScalarType; + + cv::Scalar_ value_ = value; + + MulScalarOp op; + op.val = VecTraits::make(value_.val); + + gridTransformUnary_< TransformPolicy >(globPtr(src), globPtr(dst), op, stream); } +} + +void mulScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int) +{ + typedef void (*func_t)(const GpuMat& src, cv::Scalar val, GpuMat& dst, Stream& stream); + static const func_t funcs[7][7][4] = + { + { + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl} + }, + { + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl} + }, + { + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl} + }, + { + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl} + }, + { + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl} + }, + { + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl} + }, + { + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/, 0 /*mulScalarImpl*/}, + {mulScalarImpl, mulScalarImpl, mulScalarImpl, mulScalarImpl} + } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + CV_DbgAssert( sdepth < 7 && ddepth < 7 && cn <= 4 ); + + val[0] *= scale; + val[1] *= scale; + val[2] *= scale; + val[3] *= scale; + + const func_t func = funcs[sdepth][ddepth][cn - 1]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + func(src, val, dst, stream); } -#endif // CUDA_DISABLER +#endif diff --git a/modules/cudaarithm/src/element_operations.cpp b/modules/cudaarithm/src/element_operations.cpp index 071c793..d170166 100644 --- a/modules/cudaarithm/src/element_operations.cpp +++ b/modules/cudaarithm/src/element_operations.cpp @@ -360,219 +360,11 @@ void cv::cuda::subtract(InputArray src1, InputArray src2, OutputArray dst, Input //////////////////////////////////////////////////////////////////////// // multiply -namespace arithm -{ - void mulMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); - - void mulMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); - - template - void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); -} - -static void mulMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) -{ - typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - static const func_t funcs[7][7] = - { - { - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat - }, - { - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat - }, - { - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat - }, - { - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat - }, - { - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - arithm::mulMat, - arithm::mulMat, - arithm::mulMat - }, - { - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - arithm::mulMat, - arithm::mulMat - }, - { - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - 0 /*arithm::mulMat*/, - arithm::mulMat - } - }; - - const int sdepth = src1.depth(); - const int ddepth = dst.depth(); - const int cn = src1.channels(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - - PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); - PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); - PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); - - const func_t func = funcs[sdepth][ddepth]; - - if (!func) - CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); - - func(src1_, src2_, dst_, scale, stream); -} - -namespace arithm -{ - template - void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); -} +void mulMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& stream, int); +void mulMat_8uc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream); +void mulMat_16sc4_32f(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream); -static void mulScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) -{ - typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - static const func_t funcs[7][7] = - { - { - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar - }, - { - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar - }, - { - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar - }, - { - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar - }, - { - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - arithm::mulScalar, - arithm::mulScalar, - arithm::mulScalar - }, - { - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - arithm::mulScalar, - arithm::mulScalar - }, - { - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - 0 /*arithm::mulScalar*/, - arithm::mulScalar - } - }; - - typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); - static const npp_func_t npp_funcs[7][4] = - { - {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, - {0 , 0, 0 , 0 }, - {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, - {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, - {NppArithmScalar::call, 0, NppArithmScalar::call, 0 }, - {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, - {0 , 0, 0 , 0 } - }; - - const int sdepth = src.depth(); - const int ddepth = dst.depth(); - const int cn = src.channels(); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - - val[0] *= scale; - val[1] *= scale; - val[2] *= scale; - val[3] *= scale; - - const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; - if (ddepth == sdepth && cn > 1 && npp_func != 0) - { - npp_func(src, val, dst, stream); - return; - } - - CV_Assert( cn == 1 ); - - const func_t func = funcs[sdepth][ddepth]; - - if (!func) - CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); - - func(src, val[0], dst, stream); -} +void mulScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int); void cv::cuda::multiply(InputArray _src1, InputArray _src2, OutputArray _dst, double scale, int dtype, Stream& stream) { @@ -586,7 +378,7 @@ void cv::cuda::multiply(InputArray _src1, InputArray _src2, OutputArray _dst, do _dst.create(src1.size(), src1.type()); GpuMat dst = _dst.getGpuMat(); - arithm::mulMat_8uc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + mulMat_8uc4_32f(src1, src2, dst, stream); } else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1) { @@ -598,7 +390,7 @@ void cv::cuda::multiply(InputArray _src1, InputArray _src2, OutputArray _dst, do _dst.create(src1.size(), src1.type()); GpuMat dst = _dst.getGpuMat(); - arithm::mulMat_16sc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + mulMat_16sc4_32f(src1, src2, dst, stream); } else {