//////////////////////////////////////////////////////////////////////////\r
// multiply\r
\r
- // TODO implement more efficient version\r
- template <typename TSrc1, typename TSrc2, typename TDst, int cn>\r
- void __global__ multiplyKernel(const PtrStep src1, const PtrStep src2, int rows, int cols,\r
- PtrStep dst)\r
+ struct multiply_8uc4_32f : binary_function<uint, float, uint>\r
{\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
- if (x < cols && y < rows)\r
+ __device__ __forceinline__ uint operator ()(uint a, float b) const\r
{\r
- ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc1*)src1.ptr(y))[x] * ((TSrc2*)src2.ptr(y))[x / cn]);\r
- }\r
- }\r
+ uint res = 0;\r
\r
+ res |= 0xffu & (saturate_cast<uchar>((0xffu & (a )) * b) );\r
+ res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);\r
+ res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);\r
+ res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);\r
\r
- template <typename TSrc1, typename TSrc2, typename TDst, int cn>\r
- void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream)\r
- {\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+ return res;\r
+ }\r
+ };\r
\r
- multiplyKernel<TSrc1, TSrc2, TDst, cn><<<grid, threads>>>(src1, src2, rows, cols, dst);\r
- cudaSafeCall(cudaGetLastError());\r
+ template <> struct TransformFunctorTraits<multiply_8uc4_32f> : DefaultTransformFunctorTraits<multiply_8uc4_32f>\r
+ {\r
+ enum { smart_block_dim_x = 8 };\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 8 };\r
+ };\r
\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
+ void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)\r
+ {\r
+ transform(static_cast< DevMem2D_<uint> >(src1), src2, static_cast< DevMem2D_<uint> >(dst), multiply_8uc4_32f(), stream);\r
}\r
\r
-\r
- template void multiplyCaller<uchar, float, uchar, 4>(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);\r
-\r
-\r
//////////////////////////////////////////////////////////////////////////\r
// multiply (by scalar)\r
\r
- // TODO implement more efficient version\r
- template <typename TSrc, typename TDst>\r
- void __global__ multiplyScalarKernel(const PtrStep src1, float scale, int rows, int cols, PtrStep dst)\r
+ template <typename T, typename D, typename S> struct MultiplyScalar : unary_function<T, D>\r
{\r
- int x = blockIdx.x * blockDim.x + threadIdx.x;\r
- int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+ __host__ __device__ __forceinline__ MultiplyScalar(typename TypeTraits<S>::ParameterType scale_) : scale(scale_) {}\r
\r
- if (x < cols && y < rows)\r
+ __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType a) const\r
{\r
- ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc*)src1.ptr(y))[x] * scale);\r
+ return saturate_cast<D>(a * scale);\r
}\r
- }\r
\r
+ const S scale;\r
+ };\r
\r
- template <typename TSrc, typename TDst>\r
- void multiplyScalarCaller(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream)\r
+ template <> struct TransformFunctorTraits< MultiplyScalar<uchar, uchar, float> > : DefaultTransformFunctorTraits< MultiplyScalar<uchar, uchar, float> >\r
{\r
- dim3 threads(32, 8);\r
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
-\r
- multiplyScalarKernel<TSrc, TDst><<<grid, threads>>>(src, scale, rows, cols, dst);\r
- cudaSafeCall(cudaGetLastError());\r
+ enum { smart_block_dim_y = 8 };\r
+ enum { smart_shift = 8 };\r
+ };\r
\r
- if (stream == 0)\r
- cudaSafeCall(cudaDeviceSynchronize());\r
+ template <typename T, typename D>\r
+ void multiplyScalar_gpu(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream)\r
+ {\r
+ transform(static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), MultiplyScalar<T, D, float>(scale), stream);\r
}\r
\r
-\r
- template void multiplyScalarCaller<uchar, uchar>(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream);\r
+ template void multiplyScalar_gpu<uchar, uchar>(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream);\r
}}}\r
\r
namespace cv { namespace gpu { namespace device\r
{\r
- template <typename TSrc1, typename TSrc2, typename TDst, int cn>\r
- void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);\r
+ void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);\r
\r
- template <typename TSrc, typename TDst>\r
- void multiplyScalarCaller(const PtrStep src, float scalar, int rows, int cols, PtrStep dst, cudaStream_t stream);\r
+ template <typename T, typename D>\r
+ void multiplyScalar_gpu(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream);\r
}}}\r
\r
void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)\r
{\r
- if (src1.type() == CV_8UC4 && src2.type() == CV_32F)\r
+ if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)\r
{\r
CV_Assert(src1.size() == src2.size());\r
+\r
dst.create(src1.size(), src1.type());\r
- device::multiplyCaller<uchar, float, uchar, 4>(static_cast<DevMem2D>(src1), static_cast<DevMem2D>(src2),\r
- src1.rows, src1.cols * 4, static_cast<DevMem2D>(dst),\r
- StreamAccessor::getStream(stream));\r
+\r
+ device::multiply_gpu(src1, src2, dst, StreamAccessor::getStream(stream));\r
}\r
else\r
nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream));\r
if (src.depth() == CV_8U)\r
{\r
dst.create(src.size(), src.type());\r
- device::multiplyScalarCaller<uchar, uchar>(static_cast<DevMem2D>(src), (float)(sc[0]), src.rows, src.cols * src.channels(),\r
- static_cast<DevMem2D>(dst), StreamAccessor::getStream(stream));\r
+\r
+ device::multiplyScalar_gpu<uchar, uchar>(src.reshape(1), (float)(sc[0]), dst, StreamAccessor::getStream(stream));\r
}\r
else\r
{\r