template void pow_caller<ushort>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);\r
template void pow_caller<int>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);\r
template void pow_caller<float>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);\r
+\r
+\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // multiply\r
+\r
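+ // Each thread handles one channel value: src1 is addressed with the\r
+ // channel-flattened x (cols already includes the channel count), while\r
+ // src2 holds one value per pixel, shared by the cn channels (x / cn).\r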
+ template <typename TSrc1, typename TSrc2, typename TDst, int cn>\r
+ __global__ void multiplyKernel(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst)\r
+ {\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < cols && y < rows)\r
+ {\r
+ ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc1*)src1.ptr(y))[x] * ((TSrc2*)src2.ptr(y))[x / cn]);\r
+ }\r
+ }\r
+\r
+\r
+ template <typename TSrc1, typename TSrc2, typename TDst, int cn>\r
+ void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream)\r
+ {\r
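+ // 256-thread blocks (32x8); the grid covers the channel-flattened image\r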
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+\r
+ multiplyKernel<TSrc1, TSrc2, TDst, cn><<<grid, threads, 0, stream>>>(src1, src2, rows, cols, dst);\r
+ cudaSafeCall(cudaGetLastError());\r
+\r
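+ // Keep the call synchronous when no user stream was supplied\r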
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
+\r
+\r
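+ // The only instantiation used by cv::gpu::multiply (8UC4 image, 32FC1 weights)\r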
+ template void multiplyCaller<uchar, float, uchar, 4>(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);\r
+\r
+\r
+ //////////////////////////////////////////////////////////////////////////\r
+ // multiply (by scalar)\r
+\r
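+ // Per-element multiplication by a single scalar; operates on the\r
+ // channel-flattened image, so any channel count is handled uniformly\r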
+ template <typename TSrc, typename TDst>\r
+ __global__ void multiplyScalarKernel(const PtrStep src, float scale, int rows, int cols, PtrStep dst)\r
+ {\r
+ int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < cols && y < rows)\r
+ {\r
+ ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc*)src.ptr(y))[x] * scale);\r
+ }\r
+ }\r
+\r
+\r
+ template <typename TSrc, typename TDst>\r
+ void multiplyScalarCaller(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream)\r
+ {\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+\r
+ multiplyScalarKernel<TSrc, TDst><<<grid, threads, 0, stream>>>(src, scale, rows, cols, dst);\r
+ cudaSafeCall(cudaGetLastError());\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall(cudaDeviceSynchronize());\r
+ }\r
+\r
+\r
+ template void multiplyScalarCaller<uchar, uchar>(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream);\r
}}}\r
nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, StreamAccessor::getStream(stream));\r
}\r
\r
+namespace cv { namespace gpu { namespace device\r
+{\r
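+ // Forward declarations for the callers defined in the device code\r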
+ template <typename TSrc1, typename TSrc2, typename TDst, int cn>\r
+ void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);\r
+\r
+ template <typename TSrc, typename TDst>\r
+ void multiplyScalarCaller(const PtrStep src, float scalar, int rows, int cols, PtrStep dst, cudaStream_t stream);\r
+}}}\r
+\r
void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)\r
{\r
- nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream));\r
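+ // Fast path: weight an 8UC4 image by a per-pixel 32FC1 map with the custom kernel\r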
+ if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)\r
+ {\r
+ CV_Assert(src1.size() == src2.size());\r
+ dst.create(src1.size(), src1.type());\r
+ device::multiplyCaller<uchar, float, uchar, 4>(static_cast<DevMem2D>(src1), static_cast<DevMem2D>(src2),\r
+ src1.rows, src1.cols * 4, static_cast<DevMem2D>(dst),\r
+ StreamAccessor::getStream(stream));\r
+ }\r
+ else\r
+ nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream));\r
}\r
\r
+void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)\r
+{\r
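+ // 8-bit images go through the custom kernel; 32FC1 falls back to NPP\r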
+ if (src.depth() == CV_8U)\r
+ {\r
+ dst.create(src.size(), src.type());\r
+ device::multiplyScalarCaller<uchar, uchar>(static_cast<DevMem2D>(src), (float)(sc[0]), src.rows, src.cols * src.channels(),\r
+ static_cast<DevMem2D>(dst), StreamAccessor::getStream(stream));\r
+ }\r
+ else\r
+ {\r
+ CV_Assert(src.type() == CV_32FC1);\r
+\r
+ dst.create(src.size(), src.type());\r
+\r
+ NppiSize sz;\r
+ sz.width = src.cols;\r
+ sz.height = src.rows;\r
+\r
+ cudaStream_t cudaStream = StreamAccessor::getStream(stream);\r
+\r
+ NppStreamHandler h(cudaStream);\r
+\r
+ nppSafeCall( nppiMulC_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), (Npp32f)sc[0], dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );\r
+\r
+ if (cudaStream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
+ }\r
+}\r
+\r
+\r
void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)\r
{\r
nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R, StreamAccessor::getStream(stream));\r
}\r
\r
-void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)\r
-{\r
- CV_Assert(src.type() == CV_32FC1);\r
-\r
- dst.create(src.size(), src.type());\r
-\r
- NppiSize sz;\r
- sz.width = src.cols;\r
- sz.height = src.rows;\r
-\r
- cudaStream_t cudaStream = StreamAccessor::getStream(stream);\r
-\r
- NppStreamHandler h(cudaStream);\r
-\r
- nppSafeCall( nppiMulC_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), (Npp32f)sc[0], dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );\r
-\r
- if (cudaStream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
-}\r
-\r
void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)\r
{\r
CV_Assert(src.type() == CV_32FC1);\r