From e820c5c65fd225a1d642d9fe44bc6c16ecb56afb Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 26 Aug 2013 10:50:04 +0400 Subject: [PATCH] used new device layer for cv::gpu::mulSpectrums --- modules/cudaarithm/src/arithm.cpp | 89 -------------- modules/cudaarithm/src/cuda/mul_spectrums.cu | 174 +++++++++++++-------------- 2 files changed, 84 insertions(+), 179 deletions(-) diff --git a/modules/cudaarithm/src/arithm.cpp b/modules/cudaarithm/src/arithm.cpp index 6f7417a..4fe38e9 100644 --- a/modules/cudaarithm/src/arithm.cpp +++ b/modules/cudaarithm/src/arithm.cpp @@ -293,95 +293,6 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray } ////////////////////////////////////////////////////////////////////////////// -// mulSpectrums - -#ifdef HAVE_CUFFT - -namespace cv { namespace cuda { namespace device -{ - void mulSpectrums(const PtrStep a, const PtrStep b, PtrStepSz c, cudaStream_t stream); - - void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, PtrStepSz c, cudaStream_t stream); -}}} - -#endif - -void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream) -{ -#ifndef HAVE_CUFFT - (void) _src1; - (void) _src2; - (void) _dst; - (void) flags; - (void) conjB; - (void) stream; - throw_no_cuda(); -#else - (void) flags; - - typedef void (*Caller)(const PtrStep, const PtrStep, PtrStepSz, cudaStream_t stream); - static Caller callers[] = { device::mulSpectrums, device::mulSpectrums_CONJ }; - - GpuMat src1 = _src1.getGpuMat(); - GpuMat src2 = _src2.getGpuMat(); - - CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 ); - CV_Assert( src1.size() == src2.size() ); - - _dst.create(src1.size(), CV_32FC2); - GpuMat dst = _dst.getGpuMat(); - - Caller caller = callers[(int)conjB]; - caller(src1, src2, dst, StreamAccessor::getStream(stream)); -#endif -} - -////////////////////////////////////////////////////////////////////////////// -// mulAndScaleSpectrums - -#ifdef HAVE_CUFFT - -namespace cv { namespace cuda { namespace device -{ - void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, float scale, PtrStepSz c, cudaStream_t stream); - - void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, float scale, PtrStepSz c, cudaStream_t stream); -}}} - -#endif - -void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream) -{ -#ifndef HAVE_CUFFT - (void) _src1; - (void) _src2; - (void) _dst; - (void) flags; - (void) scale; - (void) conjB; - (void) stream; - throw_no_cuda(); -#else - (void)flags; - - typedef void (*Caller)(const PtrStep, const PtrStep, float scale, PtrStepSz, cudaStream_t stream); - static Caller callers[] = { device::mulAndScaleSpectrums, device::mulAndScaleSpectrums_CONJ }; - - GpuMat src1 = _src1.getGpuMat(); - GpuMat src2 = _src2.getGpuMat(); - - CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2); - CV_Assert( src1.size() == src2.size() ); - - _dst.create(src1.size(), CV_32FC2); - GpuMat dst = _dst.getGpuMat(); - - Caller caller = callers[(int)conjB]; - caller(src1, src2, scale, dst, StreamAccessor::getStream(stream)); -#endif -} - -////////////////////////////////////////////////////////////////////////////// // dft void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream) diff --git a/modules/cudaarithm/src/cuda/mul_spectrums.cu b/modules/cudaarithm/src/cuda/mul_spectrums.cu index 42b766d..b060904 100644 --- a/modules/cudaarithm/src/cuda/mul_spectrums.cu +++ b/modules/cudaarithm/src/cuda/mul_spectrums.cu @@ -40,132 +40,126 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "cvconfig.h" +#ifndef HAVE_OPENCV_CUDEV -#ifdef HAVE_CUFFT +#error "opencv_cudev is required" -#include +#else -#include "opencv2/core/cuda/common.hpp" +#include "opencv2/cudaarithm.hpp" +#include "opencv2/cudev.hpp" -namespace cv { namespace cuda { namespace device -{ - ////////////////////////////////////////////////////////////////////////// - // mulSpectrums +using namespace cv::cudev; - __global__ void mulSpectrumsKernel(const PtrStep a, const PtrStep b, PtrStepSz c) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; +////////////////////////////////////////////////////////////////////////////// +// mulSpectrums - if (x < c.cols && y < c.rows) - { - c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]); - } +namespace +{ + __device__ __forceinline__ float real(const float2& val) + { + return val.x; } - - void mulSpectrums(const PtrStep a, const PtrStep b, PtrStepSz c, cudaStream_t stream) + __device__ __forceinline__ float imag(const float2& val) { - dim3 threads(256); - dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - - mulSpectrumsKernel<<>>(a, b, c); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + return val.y; } + __device__ __forceinline__ float2 cmul(const float2& a, const float2& b) + { + return make_float2((real(a) * real(b)) - (imag(a) * imag(b)), + (real(a) * imag(b)) + (imag(a) * real(b))); + } - ////////////////////////////////////////////////////////////////////////// - // mulSpectrums_CONJ - - __global__ void mulSpectrumsKernel_CONJ(const PtrStep a, const PtrStep b, PtrStepSz c) + __device__ __forceinline__ float2 conj(const float2& a) { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; + return make_float2(real(a), -imag(a)); + } - if (x < c.cols && y < c.rows) + struct comlex_mul : binary_function + { + __device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const { - c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x])); + return cmul(a, b); } - } + }; - - void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, PtrStepSz c, cudaStream_t stream) + struct comlex_mul_conj : binary_function { - dim3 threads(256); - dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - - mulSpectrumsKernel_CONJ<<>>(a, b, c); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + __device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const + { + return cmul(a, conj(b)); + } + }; + struct comlex_mul_scale : binary_function + { + float scale; - ////////////////////////////////////////////////////////////////////////// - // mulAndScaleSpectrums + __device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const + { + return scale * cmul(a, b); + } + }; - __global__ void mulAndScaleSpectrumsKernel(const PtrStep a, const PtrStep b, float scale, PtrStepSz c) + struct comlex_mul_conj_scale : binary_function { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; + float scale; - if (x < c.cols && y < c.rows) + __device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const { - cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]); - c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale); + return scale * cmul(a, conj(b)); } - } + }; +} +void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream) +{ + (void) flags; - void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, float scale, PtrStepSz c, cudaStream_t stream) - { - dim3 threads(256); - dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); + GpuMat src1 = _src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); - mulAndScaleSpectrumsKernel<<>>(a, b, scale, c); - cudaSafeCall( cudaGetLastError() ); + CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 ); + CV_Assert( src1.size() == src2.size() ); - if (stream) - cudaSafeCall( cudaDeviceSynchronize() ); - } + _dst.create(src1.size(), CV_32FC2); + GpuMat dst = _dst.getGpuMat(); + if (conjB) + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), comlex_mul_conj(), stream); + else + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), comlex_mul(), stream); +} - ////////////////////////////////////////////////////////////////////////// - // mulAndScaleSpectrums_CONJ +void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream) +{ + (void) flags; - __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep a, const PtrStep b, float scale, PtrStepSz c) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; + GpuMat src1 = _src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); - if (x < c.cols && y < c.rows) - { - cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x])); - c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale); - } - } + CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2); + CV_Assert( src1.size() == src2.size() ); + _dst.create(src1.size(), CV_32FC2); + GpuMat dst = _dst.getGpuMat(); - void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, float scale, PtrStepSz c, cudaStream_t stream) + if (conjB) { - dim3 threads(256); - dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - - mulAndScaleSpectrumsKernel_CONJ<<>>(a, b, scale, c); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + comlex_mul_conj_scale op; + op.scale = scale; + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), op, stream); } -}}} // namespace cv { namespace cuda { namespace cudev - -#endif // HAVE_CUFFT + else + { + comlex_mul_scale op; + op.scale = scale; + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), op, stream); + } +} -#endif /* CUDA_DISABLER */ +#endif -- 2.7.4