From d569e72ad43ecd31c855f0c0d04b75ebed2ff84f Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 9 Apr 2013 15:49:56 +0400 Subject: [PATCH] moved mulSpectrums, dft and convolve to gpuarithm --- modules/gpu/CMakeLists.txt | 6 - modules/gpu/include/opencv2/gpu.hpp | 43 --- modules/gpu/perf/perf_imgproc.cpp | 150 --------- modules/gpu/src/cuda/imgproc.cu | 113 ------- modules/gpu/src/cuda/safe_call.hpp | 15 - modules/gpu/src/error.cpp | 62 ---- modules/gpu/src/imgproc.cpp | 299 ------------------ modules/gpu/test/test_imgproc.cpp | 272 ---------------- modules/gpuarithm/CMakeLists.txt | 4 + modules/gpuarithm/include/opencv2/gpuarithm.hpp | 43 +++ modules/gpuarithm/perf/perf_core.cpp | 150 +++++++++ modules/gpuarithm/src/arithm.cpp | 393 ++++++++++++++++++++++-- modules/gpuarithm/src/cuda/mul_spectrums.cu | 171 +++++++++++ modules/gpuarithm/src/precomp.hpp | 6 +- modules/gpuarithm/test/test_core.cpp | 272 ++++++++++++++++ 15 files changed, 1010 insertions(+), 989 deletions(-) create mode 100644 modules/gpuarithm/src/cuda/mul_spectrums.cu diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index ecad9e4..55fc100 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -48,12 +48,6 @@ ocv_set_module_sources( ocv_create_module(${cuda_link_libs}) -if(HAVE_CUDA) - if(HAVE_CUFFT) - CUDA_ADD_CUFFT_TO_TARGET(${the_module}) - endif() -endif() - ocv_add_precompiled_headers(${the_module}) ################################################################################################################ diff --git a/modules/gpu/include/opencv2/gpu.hpp b/modules/gpu/include/opencv2/gpu.hpp index ead3ab3..19fd7c9 100644 --- a/modules/gpu/include/opencv2/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu.hpp @@ -180,49 +180,6 @@ CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, Gp CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null()); -//! performs per-element multiplication of two full (not packed) Fourier spectrums -//! supports 32FC2 matrixes only (interleaved format) -CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null()); - -//! performs per-element multiplication of two full (not packed) Fourier spectrums -//! supports 32FC2 matrixes only (interleaved format) -CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null()); - -//! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix. -//! Param dft_size is the size of DFT transform. -//! -//! If the source matrix is not continous, then additional copy will be done, -//! so to avoid copying ensure the source matrix is continous one. If you want to use -//! preallocated output ensure it is continuous too, otherwise it will be reallocated. -//! -//! Being implemented via CUFFT real-to-complex transform result contains only non-redundant values -//! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved. -//! -//! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format. 
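//
// A minimal usage sketch of the declaration below (an illustration, not part
// of the patch), assuming a CUFFT-enabled build: a forward real-to-complex
// transform of a rows x cols CV_32F matrix yields a packed
// rows x (cols/2 + 1) CV_32FC2 spectrum, and the inverse call recovers the
// real matrix. Per the continuity note above, createContinuous avoids the
// internal copy.
//
//     cv::gpu::GpuMat src = cv::gpu::createContinuous(rows, cols, CV_32F);
//     cv::gpu::GpuMat spect, restored;
//     cv::gpu::dft(src, spect, src.size());                // R2C, packed output
//     cv::gpu::dft(spect, restored, src.size(),
//                  cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);   // C2R, scaled back
//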
-CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null()); - -struct CV_EXPORTS ConvolveBuf -{ - Size result_size; - Size block_size; - Size user_block_size; - Size dft_size; - int spect_len; - - GpuMat image_spect, templ_spect, result_spect; - GpuMat image_block, templ_block, result_data; - - void create(Size image_size, Size templ_size); - static Size estimateBlockSize(Size result_size, Size templ_size); -}; - - -//! computes convolution (or cross-correlation) of two images using discrete Fourier transform -//! supports source images of 32FC1 type only -//! result matrix will have 32FC1 type -CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false); -CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()); - struct CV_EXPORTS MatchTemplateBuf { Size user_block_size; diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index d26bb84..5f8e9b2 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -718,54 +718,6 @@ PERF_TEST_P(Sz_Depth_Cn, ImgProc_BlendLinear, } } -////////////////////////////////////////////////////////////////////// -// Convolve - -DEF_PARAM_TEST(Sz_KernelSz_Ccorr, cv::Size, int, bool); - -PERF_TEST_P(Sz_KernelSz_Ccorr, ImgProc_Convolve, - Combine(GPU_TYPICAL_MAT_SIZES, - Values(17, 27, 32, 64), - Bool())) -{ - declare.time(10.0); - - const cv::Size size = GET_PARAM(0); - const int templ_size = GET_PARAM(1); - const bool ccorr = GET_PARAM(2); - - const cv::Mat image(size, CV_32FC1); - const cv::Mat templ(templ_size, templ_size, CV_32FC1); - declare.in(image, templ, WARMUP_RNG); - - if (PERF_RUN_GPU()) - { - cv::gpu::GpuMat d_image = cv::gpu::createContinuous(size, CV_32FC1); - d_image.upload(image); - - cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1); - d_templ.upload(templ); - - cv::gpu::GpuMat dst; - cv::gpu::ConvolveBuf d_buf; - - TEST_CYCLE() cv::gpu::convolve(d_image, d_templ, dst, ccorr, d_buf); - - GPU_SANITY_CHECK(dst); - } - else - { - if (ccorr) - FAIL_NO_CPU(); - - cv::Mat dst; - - TEST_CYCLE() cv::filter2D(image, dst, image.depth(), templ); - - CPU_SANITY_CHECK(dst); - } -} - //////////////////////////////////////////////////////////////////////////////// // MatchTemplate8U @@ -848,108 +800,6 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate32F, CPU_SANITY_CHECK(dst); } -}; - -////////////////////////////////////////////////////////////////////// -// MulSpectrums - -CV_FLAGS(DftFlags, 0, DFT_INVERSE, DFT_SCALE, DFT_ROWS, DFT_COMPLEX_OUTPUT, DFT_REAL_OUTPUT) - -DEF_PARAM_TEST(Sz_Flags, cv::Size, DftFlags); - -PERF_TEST_P(Sz_Flags, ImgProc_MulSpectrums, - Combine(GPU_TYPICAL_MAT_SIZES, - Values(0, DftFlags(cv::DFT_ROWS)))) -{ - const cv::Size size = GET_PARAM(0); - const int flag = GET_PARAM(1); - - cv::Mat a(size, CV_32FC2); - cv::Mat b(size, CV_32FC2); - declare.in(a, b, WARMUP_RNG); - - if (PERF_RUN_GPU()) - { - const cv::gpu::GpuMat d_a(a); - const cv::gpu::GpuMat d_b(b); - cv::gpu::GpuMat dst; - - TEST_CYCLE() cv::gpu::mulSpectrums(d_a, d_b, dst, flag); - - GPU_SANITY_CHECK(dst); - } - else - { - cv::Mat dst; - - TEST_CYCLE() cv::mulSpectrums(a, b, dst, flag); - - CPU_SANITY_CHECK(dst); - } -} - -////////////////////////////////////////////////////////////////////// -// MulAndScaleSpectrums - -PERF_TEST_P(Sz, ImgProc_MulAndScaleSpectrums, - 
GPU_TYPICAL_MAT_SIZES) -{ - const cv::Size size = GetParam(); - - const float scale = 1.f / size.area(); - - cv::Mat src1(size, CV_32FC2); - cv::Mat src2(size, CV_32FC2); - declare.in(src1,src2, WARMUP_RNG); - - if (PERF_RUN_GPU()) - { - const cv::gpu::GpuMat d_src1(src1); - const cv::gpu::GpuMat d_src2(src2); - cv::gpu::GpuMat dst; - - TEST_CYCLE() cv::gpu::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false); - - GPU_SANITY_CHECK(dst); - } - else - { - FAIL_NO_CPU(); - } -} - -////////////////////////////////////////////////////////////////////// -// Dft - -PERF_TEST_P(Sz_Flags, ImgProc_Dft, - Combine(GPU_TYPICAL_MAT_SIZES, - Values(0, DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE)))) -{ - declare.time(10.0); - - const cv::Size size = GET_PARAM(0); - const int flag = GET_PARAM(1); - - cv::Mat src(size, CV_32FC2); - declare.in(src, WARMUP_RNG); - - if (PERF_RUN_GPU()) - { - const cv::gpu::GpuMat d_src(src); - cv::gpu::GpuMat dst; - - TEST_CYCLE() cv::gpu::dft(d_src, dst, size, flag); - - GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); - } - else - { - cv::Mat dst; - - TEST_CYCLE() cv::dft(src, dst, flag); - - CPU_SANITY_CHECK(dst); - } } ////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index f209ab6..01cfae4 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -583,119 +583,6 @@ namespace cv { namespace gpu { namespace cudev } ////////////////////////////////////////////////////////////////////////// - // mulSpectrums - - __global__ void mulSpectrumsKernel(const PtrStep a, const PtrStep b, PtrStepSz c) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x < c.cols && y < c.rows) - { - c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]); - } - } - - - void mulSpectrums(const PtrStep a, const PtrStep b, PtrStepSz c, cudaStream_t stream) - { - dim3 threads(256); - dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - - mulSpectrumsKernel<<>>(a, b, c); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - - ////////////////////////////////////////////////////////////////////////// - // mulSpectrums_CONJ - - __global__ void mulSpectrumsKernel_CONJ(const PtrStep a, const PtrStep b, PtrStepSz c) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x < c.cols && y < c.rows) - { - c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x])); - } - } - - - void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, PtrStepSz c, cudaStream_t stream) - { - dim3 threads(256); - dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - - mulSpectrumsKernel_CONJ<<>>(a, b, c); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - - ////////////////////////////////////////////////////////////////////////// - // mulAndScaleSpectrums - - __global__ void mulAndScaleSpectrumsKernel(const PtrStep a, const PtrStep b, float scale, PtrStepSz c) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x < c.cols && y < c.rows) - { - cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]); - c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale); - } - } - - - void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, float scale, 
PtrStepSz c, cudaStream_t stream) - { - dim3 threads(256); - dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - - mulAndScaleSpectrumsKernel<<>>(a, b, scale, c); - cudaSafeCall( cudaGetLastError() ); - - if (stream) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - - ////////////////////////////////////////////////////////////////////////// - // mulAndScaleSpectrums_CONJ - - __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep a, const PtrStep b, float scale, PtrStepSz c) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x < c.cols && y < c.rows) - { - cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x])); - c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale); - } - } - - - void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, float scale, PtrStepSz c, cudaStream_t stream) - { - dim3 threads(256); - dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); - - mulAndScaleSpectrumsKernel_CONJ<<>>(a, b, scale, c); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - ////////////////////////////////////////////////////////////////////////// // buildWarpMaps // TODO use intrinsics like __sinf and so on diff --git a/modules/gpu/src/cuda/safe_call.hpp b/modules/gpu/src/cuda/safe_call.hpp index 1d4a437..35530be 100644 --- a/modules/gpu/src/cuda/safe_call.hpp +++ b/modules/gpu/src/cuda/safe_call.hpp @@ -45,21 +45,6 @@ #include -#if defined(__GNUC__) - #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__) -#else /* defined(__CUDACC__) || defined(__MSVC__) */ - #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__) -#endif -namespace cv { namespace gpu -{ - void cufftError(int err, const char *file, const int line, const char *func = ""); -}} - -static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "") -{ - if (CUFFT_SUCCESS != err) - cv::gpu::cufftError(err, file, line, func); -} #endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */ diff --git a/modules/gpu/src/error.cpp b/modules/gpu/src/error.cpp index 3b8b6b3..d0a621d 100644 --- a/modules/gpu/src/error.cpp +++ b/modules/gpu/src/error.cpp @@ -43,65 +43,3 @@ using namespace cv; using namespace cv::gpu; - -#ifdef HAVE_CUDA - -namespace -{ - #define error_entry(entry) { entry, #entry } - - struct ErrorEntry - { - int code; - const char* str; - }; - - struct ErrorEntryComparer - { - int code; - ErrorEntryComparer(int code_) : code(code_) {} - bool operator()(const ErrorEntry& e) const { return e.code == code; } - }; - - String getErrorString(int code, const ErrorEntry* errors, size_t n) - { - size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors; - - const char* msg = (idx != n) ? 
errors[idx].str : "Unknown error code"; - String str = cv::format("%s [Code = %d]", msg, code); - - return str; - } - - ////////////////////////////////////////////////////////////////////////// - // CUFFT errors - - const ErrorEntry cufft_errors[] = - { - error_entry( CUFFT_INVALID_PLAN ), - error_entry( CUFFT_ALLOC_FAILED ), - error_entry( CUFFT_INVALID_TYPE ), - error_entry( CUFFT_INVALID_VALUE ), - error_entry( CUFFT_INTERNAL_ERROR ), - error_entry( CUFFT_EXEC_FAILED ), - error_entry( CUFFT_SETUP_FAILED ), - error_entry( CUFFT_INVALID_SIZE ), - error_entry( CUFFT_UNALIGNED_DATA ) - }; - - const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]); -} - -namespace cv -{ - namespace gpu - { - void cufftError(int code, const char* file, const int line, const char* func) - { - String msg = getErrorString(code, cufft_errors, cufft_error_num); - cv::error(cv::Error::GpuApiCallError, msg, func, file, line); - } - } -} - -#endif diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 3a967fb..c21a7b8 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -73,12 +73,6 @@ void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, in void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_no_cuda(); } void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int) { throw_no_cuda(); } void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); } -void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_no_cuda(); } -void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_no_cuda(); } -void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_no_cuda(); } -void cv::gpu::ConvolveBuf::create(Size, Size) { throw_no_cuda(); } -void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_no_cuda(); } -void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_no_cuda(); } void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_no_cuda(); } void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_no_cuda(); } void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_no_cuda(); } @@ -848,299 +842,6 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM cornerMinEigenVal_gpu(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream)); } -////////////////////////////////////////////////////////////////////////////// -// mulSpectrums - -namespace cv { namespace gpu { namespace cudev -{ - namespace imgproc - { - void mulSpectrums(const PtrStep a, const PtrStep b, PtrStepSz c, cudaStream_t stream); - - void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, PtrStepSz c, cudaStream_t stream); - } -}}} - -void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream) -{ - (void)flags; - using namespace ::cv::gpu::cudev::imgproc; - - typedef void (*Caller)(const PtrStep, const PtrStep, PtrStepSz, cudaStream_t stream); - - static Caller callers[] = { cudev::imgproc::mulSpectrums, cudev::imgproc::mulSpectrums_CONJ }; - - CV_Assert(a.type() == b.type() && a.type() == CV_32FC2); - CV_Assert(a.size() == b.size()); - - c.create(a.size(), CV_32FC2); - - Caller caller = callers[(int)conjB]; - 
caller(a, b, c, StreamAccessor::getStream(stream)); -} - -////////////////////////////////////////////////////////////////////////////// -// mulAndScaleSpectrums - -namespace cv { namespace gpu { namespace cudev -{ - namespace imgproc - { - void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, float scale, PtrStepSz c, cudaStream_t stream); - - void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, float scale, PtrStepSz c, cudaStream_t stream); - } -}}} - -void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream) -{ - (void)flags; - using namespace ::cv::gpu::cudev::imgproc; - - typedef void (*Caller)(const PtrStep, const PtrStep, float scale, PtrStepSz, cudaStream_t stream); - static Caller callers[] = { cudev::imgproc::mulAndScaleSpectrums, cudev::imgproc::mulAndScaleSpectrums_CONJ }; - - CV_Assert(a.type() == b.type() && a.type() == CV_32FC2); - CV_Assert(a.size() == b.size()); - - c.create(a.size(), CV_32FC2); - - Caller caller = callers[(int)conjB]; - caller(a, b, scale, c, StreamAccessor::getStream(stream)); -} - -////////////////////////////////////////////////////////////////////////////// -// dft - -void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream) -{ -#ifndef HAVE_CUFFT - - OPENCV_GPU_UNUSED(src); - OPENCV_GPU_UNUSED(dst); - OPENCV_GPU_UNUSED(dft_size); - OPENCV_GPU_UNUSED(flags); - OPENCV_GPU_UNUSED(stream); - - throw_no_cuda(); - -#else - - CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2); - - // We don't support unpacked output (in the case of real input) - CV_Assert(!(flags & DFT_COMPLEX_OUTPUT)); - - bool is_1d_input = (dft_size.height == 1) || (dft_size.width == 1); - int is_row_dft = flags & DFT_ROWS; - int is_scaled_dft = flags & DFT_SCALE; - int is_inverse = flags & DFT_INVERSE; - bool is_complex_input = src.channels() == 2; - bool is_complex_output = !(flags & DFT_REAL_OUTPUT); - - // We don't support real-to-real transform - CV_Assert(is_complex_input || is_complex_output); - - GpuMat src_data; - - // Make sure here we work with the continuous input, - // as CUFFT can't handle gaps - src_data = src; - createContinuous(src.rows, src.cols, src.type(), src_data); - if (src_data.data != src.data) - src.copyTo(src_data); - - Size dft_size_opt = dft_size; - if (is_1d_input && !is_row_dft) - { - // If the source matrix is single column handle it as single row - dft_size_opt.width = std::max(dft_size.width, dft_size.height); - dft_size_opt.height = std::min(dft_size.width, dft_size.height); - } - - cufftType dft_type = CUFFT_R2C; - if (is_complex_input) - dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R; - - CV_Assert(dft_size_opt.width > 1); - - cufftHandle plan; - if (is_1d_input || is_row_dft) - cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height); - else - cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type); - - cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) ); - - if (is_complex_input) - { - if (is_complex_output) - { - createContinuous(dft_size, CV_32FC2, dst); - cufftSafeCall(cufftExecC2C( - plan, src_data.ptr(), dst.ptr(), - is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD)); - } - else - { - createContinuous(dft_size, CV_32F, dst); - cufftSafeCall(cufftExecC2R( - plan, src_data.ptr(), dst.ptr())); - } - } - else - { - // We could swap dft_size for efficiency. 
Here we must reflect it - if (dft_size == dft_size_opt) - createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, dst); - else - createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, dst); - - cufftSafeCall(cufftExecR2C( - plan, src_data.ptr(), dst.ptr())); - } - - cufftSafeCall(cufftDestroy(plan)); - - if (is_scaled_dft) - multiply(dst, Scalar::all(1. / dft_size.area()), dst, 1, -1, stream); - -#endif -} - -////////////////////////////////////////////////////////////////////////////// -// convolve - -void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size) -{ - result_size = Size(image_size.width - templ_size.width + 1, - image_size.height - templ_size.height + 1); - - block_size = user_block_size; - if (user_block_size.width == 0 || user_block_size.height == 0) - block_size = estimateBlockSize(result_size, templ_size); - - dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.))); - dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.))); - - // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192), - // see CUDA Toolkit 4.1 CUFFT Library Programming Guide - if (dft_size.width > 8192) - dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1); - if (dft_size.height > 8192) - dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1); - - // To avoid wasting time doing small DFTs - dft_size.width = std::max(dft_size.width, 512); - dft_size.height = std::max(dft_size.height, 512); - - createContinuous(dft_size, CV_32F, image_block); - createContinuous(dft_size, CV_32F, templ_block); - createContinuous(dft_size, CV_32F, result_data); - - spect_len = dft_size.height * (dft_size.width / 2 + 1); - createContinuous(1, spect_len, CV_32FC2, image_spect); - createContinuous(1, spect_len, CV_32FC2, templ_spect); - createContinuous(1, spect_len, CV_32FC2, result_spect); - - // Use maximum result matrix block size for the estimated DFT block size - block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width); - block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height); -} - - -Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/) -{ - int width = (result_size.width + 2) / 3; - int height = (result_size.height + 2) / 3; - width = std::min(width, result_size.width); - height = std::min(height, result_size.height); - return Size(width, height); -} - - -void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr) -{ - ConvolveBuf buf; - convolve(image, templ, result, ccorr, buf); -} - -void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream) -{ - using namespace ::cv::gpu::cudev::imgproc; - -#ifndef HAVE_CUFFT - throw_no_cuda(); -#else - CV_Assert(image.type() == CV_32F); - CV_Assert(templ.type() == CV_32F); - - buf.create(image.size(), templ.size()); - result.create(buf.result_size, CV_32F); - - Size& block_size = buf.block_size; - Size& dft_size = buf.dft_size; - - GpuMat& image_block = buf.image_block; - GpuMat& templ_block = buf.templ_block; - GpuMat& result_data = buf.result_data; - - GpuMat& image_spect = buf.image_spect; - GpuMat& templ_spect = buf.templ_spect; - GpuMat& result_spect = buf.result_spect; - - cufftHandle planR2C, planC2R; - cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R)); - 
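        // cufftPlan2d takes the transform size as (nx, ny) with nx the
        // slowest-varying dimension, which for row-major GpuMat data means
        // rows first; hence dft_size.height precedes dft_size.width in both
        // plan calls. Both plans are built once per convolve() call and
        // reused for every block of the result.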
cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); - - cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) ); - cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) ); - - GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step); - copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, - templ_block.cols - templ_roi.cols, 0, Scalar(), stream); - - cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr(), - templ_spect.ptr())); - - // Process all blocks of the result matrix - for (int y = 0; y < result.rows; y += block_size.height) - { - for (int x = 0; x < result.cols; x += block_size.width) - { - Size image_roi_size(std::min(x + dft_size.width, image.cols) - x, - std::min(y + dft_size.height, image.rows) - y); - GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr(y) + x), - image.step); - copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, - 0, image_block.cols - image_roi.cols, 0, Scalar(), stream); - - cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr(), - image_spect.ptr())); - mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0, - 1.f / dft_size.area(), ccorr, stream); - cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr(), - result_data.ptr())); - - Size result_roi_size(std::min(x + block_size.width, result.cols) - x, - std::min(y + block_size.height, result.rows) - y); - GpuMat result_roi(result_roi_size, result.type(), - (void*)(result.ptr(y) + x), result.step); - GpuMat result_block(result_roi_size, result_data.type(), - result_data.ptr(), result_data.step); - - if (stream) - stream.enqueueCopy(result_block, result_roi); - else - result_block.copyTo(result_roi); - } - } - - cufftSafeCall(cufftDestroy(planR2C)); - cufftSafeCall(cufftDestroy(planC2R)); -#endif -} - ////////////////////////////////////////////////////////////////////////////// // Canny diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp index ffc413e..6957f54 100644 --- a/modules/gpu/test/test_imgproc.cpp +++ b/modules/gpu/test/test_imgproc.cpp @@ -489,92 +489,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, testing::Combine( testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)), WHOLE_SUBMAT)); -//////////////////////////////////////////////////////// -// Convolve - -namespace -{ - void convolveDFT(const cv::Mat& A, const cv::Mat& B, cv::Mat& C, bool ccorr = false) - { - // reallocate the output array if needed - C.create(std::abs(A.rows - B.rows) + 1, std::abs(A.cols - B.cols) + 1, A.type()); - cv::Size dftSize; - - // compute the size of DFT transform - dftSize.width = cv::getOptimalDFTSize(A.cols + B.cols - 1); - dftSize.height = cv::getOptimalDFTSize(A.rows + B.rows - 1); - - // allocate temporary buffers and initialize them with 0s - cv::Mat tempA(dftSize, A.type(), cv::Scalar::all(0)); - cv::Mat tempB(dftSize, B.type(), cv::Scalar::all(0)); - - // copy A and B to the top-left corners of tempA and tempB, respectively - cv::Mat roiA(tempA, cv::Rect(0, 0, A.cols, A.rows)); - A.copyTo(roiA); - cv::Mat roiB(tempB, cv::Rect(0, 0, B.cols, B.rows)); - B.copyTo(roiB); - - // now transform the padded A & B in-place; - // use "nonzeroRows" hint for faster processing - cv::dft(tempA, tempA, 0, A.rows); - cv::dft(tempB, tempB, 0, B.rows); - - // multiply the spectrums; - // the function handles packed spectrum representations well - cv::mulSpectrums(tempA, tempB, tempA, 0, 
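        // The ccorr flag is forwarded as conjB: multiplying by the conjugate
        // spectrum turns convolution into cross-correlation, since
        // F(corr(A,B)) = F(A) * conj(F(B)) while F(conv(A,B)) = F(A) * F(B).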
ccorr); - - // transform the product back from the frequency domain. - // Even though all the result rows will be non-zero, - // you need only the first C.rows of them, and thus you - // pass nonzeroRows == C.rows - cv::dft(tempA, tempA, cv::DFT_INVERSE + cv::DFT_SCALE, C.rows); - - // now copy the result back to C. - tempA(cv::Rect(0, 0, C.cols, C.rows)).copyTo(C); - } - - IMPLEMENT_PARAM_CLASS(KSize, int); - IMPLEMENT_PARAM_CLASS(Ccorr, bool); -} - -PARAM_TEST_CASE(Convolve, cv::gpu::DeviceInfo, cv::Size, KSize, Ccorr) -{ - cv::gpu::DeviceInfo devInfo; - cv::Size size; - int ksize; - bool ccorr; - - virtual void SetUp() - { - devInfo = GET_PARAM(0); - size = GET_PARAM(1); - ksize = GET_PARAM(2); - ccorr = GET_PARAM(3); - - cv::gpu::setDevice(devInfo.deviceID()); - } -}; - -GPU_TEST_P(Convolve, Accuracy) -{ - cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0); - cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0); - - cv::gpu::GpuMat dst; - cv::gpu::convolve(loadMat(src), loadMat(kernel), dst, ccorr); - - cv::Mat dst_gold; - convolveDFT(src, kernel, dst_gold, ccorr); - - EXPECT_MAT_NEAR(dst, dst_gold, 1e-1); -} - -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Convolve, testing::Combine( - ALL_DEVICES, - DIFFERENT_SIZES, - testing::Values(KSize(3), KSize(7), KSize(11), KSize(17), KSize(19), KSize(23), KSize(45)), - testing::Values(Ccorr(false), Ccorr(true)))); - //////////////////////////////////////////////////////////////////////////////// // MatchTemplate8U @@ -830,192 +744,6 @@ GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF) INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate_CanFindBigTemplate, ALL_DEVICES); -//////////////////////////////////////////////////////////////////////////// -// MulSpectrums - -CV_FLAGS(DftFlags, 0, DFT_INVERSE, DFT_SCALE, DFT_ROWS, DFT_COMPLEX_OUTPUT, DFT_REAL_OUTPUT) - -PARAM_TEST_CASE(MulSpectrums, cv::gpu::DeviceInfo, cv::Size, DftFlags) -{ - cv::gpu::DeviceInfo devInfo; - cv::Size size; - int flag; - - cv::Mat a, b; - - virtual void SetUp() - { - devInfo = GET_PARAM(0); - size = GET_PARAM(1); - flag = GET_PARAM(2); - - cv::gpu::setDevice(devInfo.deviceID()); - - a = randomMat(size, CV_32FC2); - b = randomMat(size, CV_32FC2); - } -}; - -GPU_TEST_P(MulSpectrums, Simple) -{ - cv::gpu::GpuMat c; - cv::gpu::mulSpectrums(loadMat(a), loadMat(b), c, flag, false); - - cv::Mat c_gold; - cv::mulSpectrums(a, b, c_gold, flag, false); - - EXPECT_MAT_NEAR(c_gold, c, 1e-2); -} - -GPU_TEST_P(MulSpectrums, Scaled) -{ - float scale = 1.f / size.area(); - - cv::gpu::GpuMat c; - cv::gpu::mulAndScaleSpectrums(loadMat(a), loadMat(b), c, flag, scale, false); - - cv::Mat c_gold; - cv::mulSpectrums(a, b, c_gold, flag, false); - c_gold.convertTo(c_gold, c_gold.type(), scale); - - EXPECT_MAT_NEAR(c_gold, c, 1e-2); -} - -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MulSpectrums, testing::Combine( - ALL_DEVICES, - DIFFERENT_SIZES, - testing::Values(DftFlags(0), DftFlags(cv::DFT_ROWS)))); - -//////////////////////////////////////////////////////////////////////////// -// Dft - -struct Dft : testing::TestWithParam -{ - cv::gpu::DeviceInfo devInfo; - - virtual void SetUp() - { - devInfo = GetParam(); - - cv::gpu::setDevice(devInfo.deviceID()); - } -}; - -namespace -{ - void testC2C(const std::string& hint, int cols, int rows, int flags, bool inplace) - { - SCOPED_TRACE(hint); - - cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC2, 0.0, 10.0); - - cv::Mat b_gold; - cv::dft(a, b_gold, flags); - - cv::gpu::GpuMat d_b; - cv::gpu::GpuMat d_b_data; - if (inplace) - { - 
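            // The in-place variant preallocates a continuous buffer and wraps
            // it in a GpuMat header of the expected shape; the later
            // d_b.ptr() == d_b_data.ptr() check verifies that gpu::dft wrote
            // into this memory instead of reallocating the destination.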
d_b_data.create(1, a.size().area(), CV_32FC2); - d_b = cv::gpu::GpuMat(a.rows, a.cols, CV_32FC2, d_b_data.ptr(), a.cols * d_b_data.elemSize()); - } - cv::gpu::dft(loadMat(a), d_b, cv::Size(cols, rows), flags); - - EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr()); - ASSERT_EQ(CV_32F, d_b.depth()); - ASSERT_EQ(2, d_b.channels()); - EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4); - } -} - -GPU_TEST_P(Dft, C2C) -{ - int cols = randomInt(2, 100); - int rows = randomInt(2, 100); - - for (int i = 0; i < 2; ++i) - { - bool inplace = i != 0; - - testC2C("no flags", cols, rows, 0, inplace); - testC2C("no flags 0 1", cols, rows + 1, 0, inplace); - testC2C("no flags 1 0", cols, rows + 1, 0, inplace); - testC2C("no flags 1 1", cols + 1, rows, 0, inplace); - testC2C("DFT_INVERSE", cols, rows, cv::DFT_INVERSE, inplace); - testC2C("DFT_ROWS", cols, rows, cv::DFT_ROWS, inplace); - testC2C("single col", 1, rows, 0, inplace); - testC2C("single row", cols, 1, 0, inplace); - testC2C("single col inversed", 1, rows, cv::DFT_INVERSE, inplace); - testC2C("single row inversed", cols, 1, cv::DFT_INVERSE, inplace); - testC2C("single row DFT_ROWS", cols, 1, cv::DFT_ROWS, inplace); - testC2C("size 1 2", 1, 2, 0, inplace); - testC2C("size 2 1", 2, 1, 0, inplace); - } -} - -namespace -{ - void testR2CThenC2R(const std::string& hint, int cols, int rows, bool inplace) - { - SCOPED_TRACE(hint); - - cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC1, 0.0, 10.0); - - cv::gpu::GpuMat d_b, d_c; - cv::gpu::GpuMat d_b_data, d_c_data; - if (inplace) - { - if (a.cols == 1) - { - d_b_data.create(1, (a.rows / 2 + 1) * a.cols, CV_32FC2); - d_b = cv::gpu::GpuMat(a.rows / 2 + 1, a.cols, CV_32FC2, d_b_data.ptr(), a.cols * d_b_data.elemSize()); - } - else - { - d_b_data.create(1, a.rows * (a.cols / 2 + 1), CV_32FC2); - d_b = cv::gpu::GpuMat(a.rows, a.cols / 2 + 1, CV_32FC2, d_b_data.ptr(), (a.cols / 2 + 1) * d_b_data.elemSize()); - } - d_c_data.create(1, a.size().area(), CV_32F); - d_c = cv::gpu::GpuMat(a.rows, a.cols, CV_32F, d_c_data.ptr(), a.cols * d_c_data.elemSize()); - } - - cv::gpu::dft(loadMat(a), d_b, cv::Size(cols, rows), 0); - cv::gpu::dft(d_b, d_c, cv::Size(cols, rows), cv::DFT_REAL_OUTPUT | cv::DFT_SCALE); - - EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr()); - EXPECT_TRUE(!inplace || d_c.ptr() == d_c_data.ptr()); - ASSERT_EQ(CV_32F, d_c.depth()); - ASSERT_EQ(1, d_c.channels()); - - cv::Mat c(d_c); - EXPECT_MAT_NEAR(a, c, rows * cols * 1e-5); - } -} - -GPU_TEST_P(Dft, R2CThenC2R) -{ - int cols = randomInt(2, 100); - int rows = randomInt(2, 100); - - testR2CThenC2R("sanity", cols, rows, false); - testR2CThenC2R("sanity 0 1", cols, rows + 1, false); - testR2CThenC2R("sanity 1 0", cols + 1, rows, false); - testR2CThenC2R("sanity 1 1", cols + 1, rows + 1, false); - testR2CThenC2R("single col", 1, rows, false); - testR2CThenC2R("single col 1", 1, rows + 1, false); - testR2CThenC2R("single row", cols, 1, false); - testR2CThenC2R("single row 1", cols + 1, 1, false); - - testR2CThenC2R("sanity", cols, rows, true); - testR2CThenC2R("sanity 0 1", cols, rows + 1, true); - testR2CThenC2R("sanity 1 0", cols + 1, rows, true); - testR2CThenC2R("sanity 1 1", cols + 1, rows + 1, true); - testR2CThenC2R("single row", cols, 1, true); - testR2CThenC2R("single row 1", cols + 1, 1, true); -} - -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Dft, ALL_DEVICES); - /////////////////////////////////////////////////////////////////////////////////////////////////////// // CornerHarris diff --git a/modules/gpuarithm/CMakeLists.txt 
b/modules/gpuarithm/CMakeLists.txt
index 75cab4b..4be25dd 100644
--- a/modules/gpuarithm/CMakeLists.txt
+++ b/modules/gpuarithm/CMakeLists.txt
@@ -11,3 +11,7 @@ ocv_define_module(gpuarithm opencv_core OPTIONAL opencv_gpunvidia opencv_imgproc
 if(HAVE_CUBLAS)
   CUDA_ADD_CUBLAS_TO_TARGET(${the_module})
 endif()
+
+if(HAVE_CUFFT)
+  CUDA_ADD_CUFFT_TO_TARGET(${the_module})
+endif()
diff --git a/modules/gpuarithm/include/opencv2/gpuarithm.hpp b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
index 8829e43..f65d2ec 100644
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
@@ -295,6 +295,49 @@ CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer,
 //! supports source images of 8UC1 type only
 CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null());
 
+//! performs per-element multiplication of two full (not packed) Fourier spectrums
+//! supports 32FC2 matrices only (interleaved format)
+CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null());
+
+//! performs per-element multiplication of two full (not packed) Fourier spectrums
+//! supports 32FC2 matrices only (interleaved format)
+CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());
+
+//! Performs a forward or inverse discrete Fourier transform (1D or 2D) of a floating point matrix.
+//! Param dft_size is the size of the DFT transform.
+//!
+//! If the source matrix is not continuous, an additional copy will be made,
+//! so to avoid copying ensure the source matrix is continuous. If you want to use
+//! a preallocated output, ensure it is continuous too; otherwise it will be reallocated.
+//!
+//! Because the forward transform is implemented via CUFFT's real-to-complex transform, the result
+//! contains only the non-redundant values, in CUFFT's packed format. The result of such a
+//! transform cannot be retrieved as a full complex matrix.
+//!
+//! For the complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.
+CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
+
+struct CV_EXPORTS ConvolveBuf
+{
+    Size result_size;
+    Size block_size;
+    Size user_block_size;
+    Size dft_size;
+    int spect_len;
+
+    GpuMat image_spect, templ_spect, result_spect;
+    GpuMat image_block, templ_block, result_data;
+
+    void create(Size image_size, Size templ_size);
+    static Size estimateBlockSize(Size result_size, Size templ_size);
+};
+
+
+//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
+//! supports source images of 32FC1 type only
+//!
result matrix will have 32FC1 type +CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false); +CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()); + }} // namespace cv { namespace gpu { #endif /* __OPENCV_GPUARITHM_HPP__ */ diff --git a/modules/gpuarithm/perf/perf_core.cpp b/modules/gpuarithm/perf/perf_core.cpp index 603d244..fd388ed 100644 --- a/modules/gpuarithm/perf/perf_core.cpp +++ b/modules/gpuarithm/perf/perf_core.cpp @@ -2156,6 +2156,108 @@ PERF_TEST_P(Sz_Depth_NormType, Core_Normalize, } } +////////////////////////////////////////////////////////////////////// +// MulSpectrums + +CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT) + +DEF_PARAM_TEST(Sz_Flags, cv::Size, DftFlags); + +PERF_TEST_P(Sz_Flags, ImgProc_MulSpectrums, + Combine(GPU_TYPICAL_MAT_SIZES, + Values(0, DftFlags(cv::DFT_ROWS)))) +{ + const cv::Size size = GET_PARAM(0); + const int flag = GET_PARAM(1); + + cv::Mat a(size, CV_32FC2); + cv::Mat b(size, CV_32FC2); + declare.in(a, b, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_a(a); + const cv::gpu::GpuMat d_b(b); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::mulSpectrums(d_a, d_b, dst, flag); + + GPU_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::mulSpectrums(a, b, dst, flag); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MulAndScaleSpectrums + +PERF_TEST_P(Sz, ImgProc_MulAndScaleSpectrums, + GPU_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + const float scale = 1.f / size.area(); + + cv::Mat src1(size, CV_32FC2); + cv::Mat src2(size, CV_32FC2); + declare.in(src1,src2, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src1(src1); + const cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false); + + GPU_SANITY_CHECK(dst); + } + else + { + FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// Dft + +PERF_TEST_P(Sz_Flags, ImgProc_Dft, + Combine(GPU_TYPICAL_MAT_SIZES, + Values(0, DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE)))) +{ + declare.time(10.0); + + const cv::Size size = GET_PARAM(0); + const int flag = GET_PARAM(1); + + cv::Mat src(size, CV_32FC2); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::dft(d_src, dst, size, flag); + + GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::dft(src, dst, flag); + + CPU_SANITY_CHECK(dst); + } +} + #ifdef HAVE_OPENCV_IMGPROC ////////////////////////////////////////////////////////////////////// @@ -2255,4 +2357,52 @@ PERF_TEST_P(Sz, ImgProc_IntegralSqr, } } +////////////////////////////////////////////////////////////////////// +// Convolve + +DEF_PARAM_TEST(Sz_KernelSz_Ccorr, cv::Size, int, bool); + +PERF_TEST_P(Sz_KernelSz_Ccorr, ImgProc_Convolve, + Combine(GPU_TYPICAL_MAT_SIZES, + Values(17, 27, 32, 64), + Bool())) +{ + declare.time(10.0); + + const cv::Size size = GET_PARAM(0); + const int templ_size = GET_PARAM(1); + const bool ccorr = GET_PARAM(2); + + const cv::Mat image(size, CV_32FC1); + const cv::Mat templ(templ_size, templ_size, CV_32FC1); + declare.in(image, templ, WARMUP_RNG); + + if 
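    // Both inputs are uploaded into createContinuous buffers: convolve() is
    // CUFFT-based, and per the dft() documentation a non-continuous source
    // would incur an extra copy inside the timed loop.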
(PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_image = cv::gpu::createContinuous(size, CV_32FC1); + d_image.upload(image); + + cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1); + d_templ.upload(templ); + + cv::gpu::GpuMat dst; + cv::gpu::ConvolveBuf d_buf; + + TEST_CYCLE() cv::gpu::convolve(d_image, d_templ, dst, ccorr, d_buf); + + GPU_SANITY_CHECK(dst); + } + else + { + if (ccorr) + FAIL_NO_CPU(); + + cv::Mat dst; + + TEST_CYCLE() cv::filter2D(image, dst, image.depth(), templ); + + CPU_SANITY_CHECK(dst); + } +} + #endif diff --git a/modules/gpuarithm/src/arithm.cpp b/modules/gpuarithm/src/arithm.cpp index baf598d..59fd2ee 100644 --- a/modules/gpuarithm/src/arithm.cpp +++ b/modules/gpuarithm/src/arithm.cpp @@ -64,14 +64,15 @@ void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, co void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); } void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); } void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); } +void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_no_cuda(); } +void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_no_cuda(); } +void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_no_cuda(); } +void cv::gpu::ConvolveBuf::create(Size, Size) { throw_no_cuda(); } +void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_no_cuda(); } +void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_no_cuda(); } #else /* !defined (HAVE_CUDA) */ -//////////////////////////////////////////////////////////////////////// -// gemm - -#ifdef HAVE_CUBLAS - namespace { #define error_entry(entry) { entry, #entry } @@ -89,42 +90,93 @@ namespace bool operator()(const ErrorEntry& e) const { return e.code == code; } }; - const ErrorEntry cublas_errors[] = + String getErrorString(int code, const ErrorEntry* errors, size_t n) { - error_entry( CUBLAS_STATUS_SUCCESS ), - error_entry( CUBLAS_STATUS_NOT_INITIALIZED ), - error_entry( CUBLAS_STATUS_ALLOC_FAILED ), - error_entry( CUBLAS_STATUS_INVALID_VALUE ), - error_entry( CUBLAS_STATUS_ARCH_MISMATCH ), - error_entry( CUBLAS_STATUS_MAPPING_ERROR ), - error_entry( CUBLAS_STATUS_EXECUTION_FAILED ), - error_entry( CUBLAS_STATUS_INTERNAL_ERROR ) - }; + size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors; + + const char* msg = (idx != n) ? errors[idx].str : "Unknown error code"; + String str = cv::format("%s [Code = %d]", msg, code); - const size_t cublas_error_num = sizeof(cublas_errors) / sizeof(cublas_errors[0]); + return str; + } +} - static inline void ___cublasSafeCall(cublasStatus_t err, const char* file, const int line, const char* func) +#ifdef HAVE_CUBLAS + namespace { - if (CUBLAS_STATUS_SUCCESS != err) + const ErrorEntry cublas_errors[] = { - size_t idx = std::find_if(cublas_errors, cublas_errors + cublas_error_num, ErrorEntryComparer(err)) - cublas_errors; + error_entry( CUBLAS_STATUS_SUCCESS ), + error_entry( CUBLAS_STATUS_NOT_INITIALIZED ), + error_entry( CUBLAS_STATUS_ALLOC_FAILED ), + error_entry( CUBLAS_STATUS_INVALID_VALUE ), + error_entry( CUBLAS_STATUS_ARCH_MISMATCH ), + error_entry( CUBLAS_STATUS_MAPPING_ERROR ), + error_entry( CUBLAS_STATUS_EXECUTION_FAILED ), + error_entry( CUBLAS_STATUS_INTERNAL_ERROR ) + }; - const char* msg = (idx != cublas_error_num) ? 
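        // error_entry(e) stringifies the enum with the preprocessor, e.g.
        // error_entry( CUBLAS_STATUS_ALLOC_FAILED ) expands to
        // { CUBLAS_STATUS_ALLOC_FAILED, "CUBLAS_STATUS_ALLOC_FAILED" },
        // and find_if returning the one-past-the-end index selects the
        // "Unknown error code" fallback.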
cublas_errors[idx].str : "Unknown error code"; - String str = cv::format("%s [Code = %d]", msg, err); + const size_t cublas_error_num = sizeof(cublas_errors) / sizeof(cublas_errors[0]); - cv::error(cv::Error::GpuApiCallError, str, func, file, line); + static inline void ___cublasSafeCall(cublasStatus_t err, const char* file, const int line, const char* func) + { + if (CUBLAS_STATUS_SUCCESS != err) + { + String msg = getErrorString(err, cublas_errors, cublas_error_num); + cv::error(cv::Error::GpuApiCallError, msg, func, file, line); + } } } -} -#if defined(__GNUC__) - #define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, __func__) -#else /* defined(__CUDACC__) || defined(__MSVC__) */ - #define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, "") -#endif + #if defined(__GNUC__) + #define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, __func__) + #else /* defined(__CUDACC__) || defined(__MSVC__) */ + #define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, "") + #endif +#endif // HAVE_CUBLAS + +#ifdef HAVE_CUFFT + namespace + { + ////////////////////////////////////////////////////////////////////////// + // CUFFT errors + + const ErrorEntry cufft_errors[] = + { + error_entry( CUFFT_INVALID_PLAN ), + error_entry( CUFFT_ALLOC_FAILED ), + error_entry( CUFFT_INVALID_TYPE ), + error_entry( CUFFT_INVALID_VALUE ), + error_entry( CUFFT_INTERNAL_ERROR ), + error_entry( CUFFT_EXEC_FAILED ), + error_entry( CUFFT_SETUP_FAILED ), + error_entry( CUFFT_INVALID_SIZE ), + error_entry( CUFFT_UNALIGNED_DATA ) + }; + + const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]); + + void ___cufftSafeCall(int err, const char* file, const int line, const char* func) + { + if (CUFFT_SUCCESS != err) + { + String msg = getErrorString(err, cufft_errors, cufft_error_num); + cv::error(cv::Error::GpuApiCallError, msg, func, file, line); + } + } + } + + #if defined(__GNUC__) + #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__) + #else /* defined(__CUDACC__) || defined(__MSVC__) */ + #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, "") + #endif #endif +//////////////////////////////////////////////////////////////////////// +// gemm + void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream) { #ifndef HAVE_CUBLAS @@ -836,4 +888,289 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s) #endif } +////////////////////////////////////////////////////////////////////////////// +// mulSpectrums + +namespace cv { namespace gpu { namespace cudev +{ + void mulSpectrums(const PtrStep a, const PtrStep b, PtrStepSz c, cudaStream_t stream); + + void mulSpectrums_CONJ(const PtrStep a, const PtrStep b, PtrStepSz c, cudaStream_t stream); +}}} + +void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream) +{ + (void)flags; + + typedef void (*Caller)(const PtrStep, const PtrStep, PtrStepSz, cudaStream_t stream); + + static Caller callers[] = { cudev::mulSpectrums, cudev::mulSpectrums_CONJ }; + + CV_Assert(a.type() == b.type() && a.type() == CV_32FC2); + CV_Assert(a.size() == b.size()); + + c.create(a.size(), CV_32FC2); + + Caller caller = callers[(int)conjB]; + caller(a, b, c, StreamAccessor::getStream(stream)); +} + +////////////////////////////////////////////////////////////////////////////// +// mulAndScaleSpectrums + +namespace cv { namespace 
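// As in the mulSpectrums wrapper above, conjB is mapped to an index into a
// two-entry function-pointer table (callers[0] = mulAndScaleSpectrums,
// callers[1] = mulAndScaleSpectrums_CONJ), which keeps each kernel a separate
// specialization rather than branching on conjB per element.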
gpu { namespace cudev +{ + void mulAndScaleSpectrums(const PtrStep a, const PtrStep b, float scale, PtrStepSz c, cudaStream_t stream); + + void mulAndScaleSpectrums_CONJ(const PtrStep a, const PtrStep b, float scale, PtrStepSz c, cudaStream_t stream); +}}} + +void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream) +{ + (void)flags; + + typedef void (*Caller)(const PtrStep, const PtrStep, float scale, PtrStepSz, cudaStream_t stream); + static Caller callers[] = { cudev::mulAndScaleSpectrums, cudev::mulAndScaleSpectrums_CONJ }; + + CV_Assert(a.type() == b.type() && a.type() == CV_32FC2); + CV_Assert(a.size() == b.size()); + + c.create(a.size(), CV_32FC2); + + Caller caller = callers[(int)conjB]; + caller(a, b, scale, c, StreamAccessor::getStream(stream)); +} + +////////////////////////////////////////////////////////////////////////////// +// dft + +void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream) +{ +#ifndef HAVE_CUFFT + + OPENCV_GPU_UNUSED(src); + OPENCV_GPU_UNUSED(dst); + OPENCV_GPU_UNUSED(dft_size); + OPENCV_GPU_UNUSED(flags); + OPENCV_GPU_UNUSED(stream); + + throw_no_cuda(); + +#else + + CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2); + + // We don't support unpacked output (in the case of real input) + CV_Assert(!(flags & DFT_COMPLEX_OUTPUT)); + + bool is_1d_input = (dft_size.height == 1) || (dft_size.width == 1); + int is_row_dft = flags & DFT_ROWS; + int is_scaled_dft = flags & DFT_SCALE; + int is_inverse = flags & DFT_INVERSE; + bool is_complex_input = src.channels() == 2; + bool is_complex_output = !(flags & DFT_REAL_OUTPUT); + + // We don't support real-to-real transform + CV_Assert(is_complex_input || is_complex_output); + + GpuMat src_data; + + // Make sure here we work with the continuous input, + // as CUFFT can't handle gaps + src_data = src; + createContinuous(src.rows, src.cols, src.type(), src_data); + if (src_data.data != src.data) + src.copyTo(src_data); + + Size dft_size_opt = dft_size; + if (is_1d_input && !is_row_dft) + { + // If the source matrix is single column handle it as single row + dft_size_opt.width = std::max(dft_size.width, dft_size.height); + dft_size_opt.height = std::min(dft_size.width, dft_size.height); + } + + cufftType dft_type = CUFFT_R2C; + if (is_complex_input) + dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R; + + CV_Assert(dft_size_opt.width > 1); + + cufftHandle plan; + if (is_1d_input || is_row_dft) + cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height); + else + cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type); + + cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) ); + + if (is_complex_input) + { + if (is_complex_output) + { + createContinuous(dft_size, CV_32FC2, dst); + cufftSafeCall(cufftExecC2C( + plan, src_data.ptr(), dst.ptr(), + is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD)); + } + else + { + createContinuous(dft_size, CV_32F, dst); + cufftSafeCall(cufftExecC2R( + plan, src_data.ptr(), dst.ptr())); + } + } + else + { + // We could swap dft_size for efficiency. Here we must reflect it + if (dft_size == dft_size_opt) + createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, dst); + else + createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, dst); + + cufftSafeCall(cufftExecR2C( + plan, src_data.ptr(), dst.ptr())); + } + + cufftSafeCall(cufftDestroy(plan)); + + if (is_scaled_dft) + multiply(dst, Scalar::all(1. 
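    // CUFFT, unlike cv::dft, never normalizes its transforms, so DFT_SCALE is
    // emulated here with an explicit per-element multiply by
    // 1 / (dft_size.width * dft_size.height) after the transform.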
/ dft_size.area()), dst, 1, -1, stream); + +#endif +} + +////////////////////////////////////////////////////////////////////////////// +// convolve + +void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size) +{ + result_size = Size(image_size.width - templ_size.width + 1, + image_size.height - templ_size.height + 1); + + block_size = user_block_size; + if (user_block_size.width == 0 || user_block_size.height == 0) + block_size = estimateBlockSize(result_size, templ_size); + + dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.))); + dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.))); + + // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192), + // see CUDA Toolkit 4.1 CUFFT Library Programming Guide + if (dft_size.width > 8192) + dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1); + if (dft_size.height > 8192) + dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1); + + // To avoid wasting time doing small DFTs + dft_size.width = std::max(dft_size.width, 512); + dft_size.height = std::max(dft_size.height, 512); + + createContinuous(dft_size, CV_32F, image_block); + createContinuous(dft_size, CV_32F, templ_block); + createContinuous(dft_size, CV_32F, result_data); + + spect_len = dft_size.height * (dft_size.width / 2 + 1); + createContinuous(1, spect_len, CV_32FC2, image_spect); + createContinuous(1, spect_len, CV_32FC2, templ_spect); + createContinuous(1, spect_len, CV_32FC2, result_spect); + + // Use maximum result matrix block size for the estimated DFT block size + block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width); + block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height); +} + + +Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/) +{ + int width = (result_size.width + 2) / 3; + int height = (result_size.height + 2) / 3; + width = std::min(width, result_size.width); + height = std::min(height, result_size.height); + return Size(width, height); +} + + +void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr) +{ + ConvolveBuf buf; + convolve(image, templ, result, ccorr, buf); +} + +void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream) +{ + using namespace ::cv::gpu::cudev::imgproc; + +#ifndef HAVE_CUFFT + throw_no_cuda(); +#else + CV_Assert(image.type() == CV_32F); + CV_Assert(templ.type() == CV_32F); + + buf.create(image.size(), templ.size()); + result.create(buf.result_size, CV_32F); + + Size& block_size = buf.block_size; + Size& dft_size = buf.dft_size; + + GpuMat& image_block = buf.image_block; + GpuMat& templ_block = buf.templ_block; + GpuMat& result_data = buf.result_data; + + GpuMat& image_spect = buf.image_spect; + GpuMat& templ_spect = buf.templ_spect; + GpuMat& result_spect = buf.result_spect; + + cufftHandle planR2C, planC2R; + cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R)); + cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); + + cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) ); + cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) ); + + GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step); + copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, + 
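    // Block decomposition at work (a worked example, not from the patch): for
    // a 1024x1024 image and a 27x27 template, result_size is 998x998,
    // estimateBlockSize proposes 333x333, dft_size rounds 333 + 27 - 1 = 359
    // up to the power of two 512, and the final block grows to
    // 512 - 27 + 1 = 486. The template is padded to dft_size and transformed
    // once here; the loop below then pads, transforms, multiplies (with
    // conjugation when ccorr is set) and inverse-transforms each image block,
    // copying only the valid result ROI back.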
templ_block.cols - templ_roi.cols, 0, Scalar(), stream); + + cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr(), + templ_spect.ptr())); + + // Process all blocks of the result matrix + for (int y = 0; y < result.rows; y += block_size.height) + { + for (int x = 0; x < result.cols; x += block_size.width) + { + Size image_roi_size(std::min(x + dft_size.width, image.cols) - x, + std::min(y + dft_size.height, image.rows) - y); + GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr(y) + x), + image.step); + copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, + 0, image_block.cols - image_roi.cols, 0, Scalar(), stream); + + cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr(), + image_spect.ptr())); + mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0, + 1.f / dft_size.area(), ccorr, stream); + cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr(), + result_data.ptr())); + + Size result_roi_size(std::min(x + block_size.width, result.cols) - x, + std::min(y + block_size.height, result.rows) - y); + GpuMat result_roi(result_roi_size, result.type(), + (void*)(result.ptr(y) + x), result.step); + GpuMat result_block(result_roi_size, result_data.type(), + result_data.ptr(), result_data.step); + + if (stream) + stream.enqueueCopy(result_block, result_roi); + else + result_block.copyTo(result_roi); + } + } + + cufftSafeCall(cufftDestroy(planR2C)); + cufftSafeCall(cufftDestroy(planC2R)); +#endif +} + #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpuarithm/src/cuda/mul_spectrums.cu b/modules/gpuarithm/src/cuda/mul_spectrums.cu new file mode 100644 index 0000000..1b58b8c --- /dev/null +++ b/modules/gpuarithm/src/cuda/mul_spectrums.cu @@ -0,0 +1,171 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
diff --git a/modules/gpuarithm/src/cuda/mul_spectrums.cu b/modules/gpuarithm/src/cuda/mul_spectrums.cu
new file mode 100644
index 0000000..1b58b8c
--- /dev/null
+++ b/modules/gpuarithm/src/cuda/mul_spectrums.cu
@@ -0,0 +1,171 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "cvconfig.h"
+
+#ifdef HAVE_CUFFT
+
+#include <cufft.h>
+
+#include "opencv2/core/cuda/common.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    //////////////////////////////////////////////////////////////////////////
+    // mulSpectrums
+
+    __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x < c.cols && y < c.rows)
+        {
+            c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
+        }
+    }
+
+    void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
+    {
+        dim3 threads(256);
+        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
+
+        mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // mulSpectrums_CONJ
+
+    __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x < c.cols && y < c.rows)
+        {
+            c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
+        }
+    }
+
+    void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
+    {
+        dim3 threads(256);
+        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
+
+        mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
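The `_CONJ` variants differ from the plain kernels only in conjugating the second operand before the complex multiply. As a host-side reference for the element-wise semantics (a sketch using std::complex; the helper name is illustrative and not part of the patch):

    #include <complex>

    // Element-wise reference for the kernels above:
    //   mulSpectrums:      c[i] = a[i] * b[i]       = (ax*bx - ay*by, ax*by + ay*bx)
    //   mulSpectrums_CONJ: c[i] = a[i] * conj(b[i]) = (ax*bx + ay*by, ay*bx - ax*by)
    static void mulSpectrumsRef(const std::complex<float>* a, const std::complex<float>* b,
                                std::complex<float>* c, int n, bool conjB)
    {
        for (int i = 0; i < n; ++i)
            c[i] = a[i] * (conjB ? std::conj(b[i]) : b[i]);
    }

Conjugating the second spectrum is what turns a frequency-domain product into cross-correlation rather than convolution; this is how the `conjB`/`ccorr` flags are wired through `mulAndScaleSpectrums` below.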
+    //////////////////////////////////////////////////////////////////////////
+    // mulAndScaleSpectrums
+
+    __global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x < c.cols && y < c.rows)
+        {
+            cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
+            c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
+        }
+    }
+
+    void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
+    {
+        dim3 threads(256);
+        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
+
+        mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // mulAndScaleSpectrums_CONJ
+
+    __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x < c.cols && y < c.rows)
+        {
+            cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
+            c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
+        }
+    }
+
+    void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
+    {
+        dim3 threads(256);
+        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
+
+        mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}}} // namespace cv { namespace gpu { namespace cudev
+
+#endif // HAVE_CUFFT
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpuarithm/src/precomp.hpp b/modules/gpuarithm/src/precomp.hpp
index 6e21684..f8e38e8 100644
--- a/modules/gpuarithm/src/precomp.hpp
+++ b/modules/gpuarithm/src/precomp.hpp
@@ -59,7 +59,11 @@
 #endif
 
 #ifdef HAVE_CUBLAS
-    #include <cublas.h>
+#  include <cublas.h>
+#endif
+
+#ifdef HAVE_CUFFT
+#  include <cufft.h>
 #endif
 
 #endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/gpuarithm/test/test_core.cpp b/modules/gpuarithm/test/test_core.cpp
index 36c1554..dd8f854 100644
--- a/modules/gpuarithm/test/test_core.cpp
+++ b/modules/gpuarithm/test/test_core.cpp
@@ -3607,6 +3607,278 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Normalize, testing::Combine(
     testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF), NormCode(cv::NORM_MINMAX)),
     WHOLE_SUBMAT));
 
+////////////////////////////////////////////////////////////////////////////
+// MulSpectrums
+
+CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
+
+PARAM_TEST_CASE(MulSpectrums, cv::gpu::DeviceInfo, cv::Size, DftFlags)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int flag;
+
+    cv::Mat a, b;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        flag = GET_PARAM(2);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        a = randomMat(size, CV_32FC2);
+        b = randomMat(size, CV_32FC2);
+    }
+};
+
+GPU_TEST_P(MulSpectrums, Simple)
+{
+    cv::gpu::GpuMat c;
+    cv::gpu::mulSpectrums(loadMat(a), loadMat(b), c, flag, false);
+
+    cv::Mat c_gold;
+    cv::mulSpectrums(a, b, c_gold, flag, false);
+
+    EXPECT_MAT_NEAR(c_gold, c, 1e-2);
+}
+
+GPU_TEST_P(MulSpectrums, Scaled)
+{
+    float scale = 1.f / size.area();
+
+    cv::gpu::GpuMat c;
+    cv::gpu::mulAndScaleSpectrums(loadMat(a), loadMat(b), c, flag, scale, false);
+
+    cv::Mat c_gold;
+    cv::mulSpectrums(a, b, c_gold, flag, false);
+    c_gold.convertTo(c_gold, c_gold.type(), scale);
+
+    EXPECT_MAT_NEAR(c_gold, c, 1e-2);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MulSpectrums, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(DftFlags(0), DftFlags(cv::DFT_ROWS))));
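The Dft tests below lean on CUFFT's packed real-to-complex layout: an H x W real input yields only the non-redundant H x (W/2 + 1) complex coefficients, or (H/2 + 1) x 1 for a single-column input. A small sketch of that size arithmetic, which the in-place branches of the tests preallocate by hand (the helper is illustrative, not part of the test suite):

    #include <opencv2/core.hpp>

    // Packed R2C spectrum extent for an H x W CV_32FC1 input,
    // matching the d_b_data sizing used by the in-place tests below.
    static cv::Size r2cSpectrumSize(int rows, int cols)
    {
        if (cols == 1)
            return cv::Size(1, rows / 2 + 1); // single column: pack along rows
        return cv::Size(cols / 2 + 1, rows);  // general case: pack along cols
    }
    // e.g. a 64 x 64 input yields a 64 x 33 CV_32FC2 spectrum (64 * 33 complex values).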
+////////////////////////////////////////////////////////////////////////////
+// Dft
+
+struct Dft : testing::TestWithParam<cv::gpu::DeviceInfo>
+{
+    cv::gpu::DeviceInfo devInfo;
+
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+namespace
+{
+    void testC2C(const std::string& hint, int cols, int rows, int flags, bool inplace)
+    {
+        SCOPED_TRACE(hint);
+
+        cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC2, 0.0, 10.0);
+
+        cv::Mat b_gold;
+        cv::dft(a, b_gold, flags);
+
+        cv::gpu::GpuMat d_b;
+        cv::gpu::GpuMat d_b_data;
+        if (inplace)
+        {
+            d_b_data.create(1, a.size().area(), CV_32FC2);
+            d_b = cv::gpu::GpuMat(a.rows, a.cols, CV_32FC2, d_b_data.ptr(), a.cols * d_b_data.elemSize());
+        }
+        cv::gpu::dft(loadMat(a), d_b, cv::Size(cols, rows), flags);
+
+        EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());
+        ASSERT_EQ(CV_32F, d_b.depth());
+        ASSERT_EQ(2, d_b.channels());
+        EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4);
+    }
+}
+
+GPU_TEST_P(Dft, C2C)
+{
+    int cols = randomInt(2, 100);
+    int rows = randomInt(2, 100);
+
+    for (int i = 0; i < 2; ++i)
+    {
+        bool inplace = i != 0;
+
+        testC2C("no flags", cols, rows, 0, inplace);
+        testC2C("no flags 0 1", cols, rows + 1, 0, inplace);
+        testC2C("no flags 1 0", cols + 1, rows, 0, inplace);
+        testC2C("no flags 1 1", cols + 1, rows + 1, 0, inplace);
+        testC2C("DFT_INVERSE", cols, rows, cv::DFT_INVERSE, inplace);
+        testC2C("DFT_ROWS", cols, rows, cv::DFT_ROWS, inplace);
+        testC2C("single col", 1, rows, 0, inplace);
+        testC2C("single row", cols, 1, 0, inplace);
+        testC2C("single col inversed", 1, rows, cv::DFT_INVERSE, inplace);
+        testC2C("single row inversed", cols, 1, cv::DFT_INVERSE, inplace);
+        testC2C("single row DFT_ROWS", cols, 1, cv::DFT_ROWS, inplace);
+        testC2C("size 1 2", 1, 2, 0, inplace);
+        testC2C("size 2 1", 2, 1, 0, inplace);
+    }
+}
+
+namespace
+{
+    void testR2CThenC2R(const std::string& hint, int cols, int rows, bool inplace)
+    {
+        SCOPED_TRACE(hint);
+
+        cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC1, 0.0, 10.0);
+
+        cv::gpu::GpuMat d_b, d_c;
+        cv::gpu::GpuMat d_b_data, d_c_data;
+        if (inplace)
+        {
+            if (a.cols == 1)
+            {
+                d_b_data.create(1, (a.rows / 2 + 1) * a.cols, CV_32FC2);
+                d_b = cv::gpu::GpuMat(a.rows / 2 + 1, a.cols, CV_32FC2, d_b_data.ptr(), a.cols * d_b_data.elemSize());
+            }
+            else
+            {
+                d_b_data.create(1, a.rows * (a.cols / 2 + 1), CV_32FC2);
+                d_b = cv::gpu::GpuMat(a.rows, a.cols / 2 + 1, CV_32FC2, d_b_data.ptr(), (a.cols / 2 + 1) * d_b_data.elemSize());
+            }
+            d_c_data.create(1, a.size().area(), CV_32F);
+            d_c = cv::gpu::GpuMat(a.rows, a.cols, CV_32F, d_c_data.ptr(), a.cols * d_c_data.elemSize());
+        }
+
+        cv::gpu::dft(loadMat(a), d_b, cv::Size(cols, rows), 0);
+        cv::gpu::dft(d_b, d_c, cv::Size(cols, rows), cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
+
+        EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());
+        EXPECT_TRUE(!inplace || d_c.ptr() == d_c_data.ptr());
+        ASSERT_EQ(CV_32F, d_c.depth());
+        ASSERT_EQ(1, d_c.channels());
+
+        cv::Mat c(d_c);
+        EXPECT_MAT_NEAR(a, c, rows * cols * 1e-5);
+    }
+}
+
+GPU_TEST_P(Dft, R2CThenC2R)
+{
+    int cols = randomInt(2, 100);
+    int rows = randomInt(2, 100);
+
+    testR2CThenC2R("sanity", cols, rows, false);
+    testR2CThenC2R("sanity 0 1", cols, rows + 1, false);
+    testR2CThenC2R("sanity 1 0", cols + 1, rows, false);
+    testR2CThenC2R("sanity 1 1", cols + 1, rows + 1, false);
+    testR2CThenC2R("single col", 1, rows, false);
+    testR2CThenC2R("single col 1", 1, rows + 1, false);
+    testR2CThenC2R("single row", cols, 1, false);
+    testR2CThenC2R("single row 1", cols + 1, 1, false);
+
+    testR2CThenC2R("sanity", cols, rows, true);
+    testR2CThenC2R("sanity 0 1", cols, rows + 1, true);
+    testR2CThenC2R("sanity 1 0", cols + 1, rows, true);
+    testR2CThenC2R("sanity 1 1", cols + 1, rows + 1, true);
+    testR2CThenC2R("single row", cols, 1, true);
+    testR2CThenC2R("single row 1", cols + 1, 1, true);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Dft, ALL_DEVICES);
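For the Convolve tests that follow, it helps to spell out what the `ccorr` flag means in the spatial domain; `convolveDFT` below realizes the same thing in the frequency domain via `cv::mulSpectrums` with `conjB = ccorr`. A naive per-pixel reference (illustrative only, far too slow for real use; the helper name is not part of the test suite):

    #include <opencv2/core.hpp>

    // Spatial-domain meaning of the ccorr flag, for one output pixel (y, x)
    // of the "valid"-size result (A.rows - B.rows + 1) x (A.cols - B.cols + 1):
    //   ccorr = true  (cross-correlation): sum_{i,j} A(y+i, x+j) * B(i, j)
    //   ccorr = false (convolution):       sum_{i,j} A(y+i, x+j) * B(rows-1-i, cols-1-j)
    static float convolvePixelRef(const cv::Mat& A, const cv::Mat& B, int y, int x, bool ccorr)
    {
        float s = 0.f;
        for (int i = 0; i < B.rows; ++i)
            for (int j = 0; j < B.cols; ++j)
            {
                float b = ccorr ? B.at<float>(i, j)
                                : B.at<float>(B.rows - 1 - i, B.cols - 1 - j);
                s += A.at<float>(y + i, x + j) * b;
            }
        return s;
    }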
+////////////////////////////////////////////////////////
+// Convolve
+
+namespace
+{
+    void convolveDFT(const cv::Mat& A, const cv::Mat& B, cv::Mat& C, bool ccorr = false)
+    {
+        // reallocate the output array if needed
+        C.create(std::abs(A.rows - B.rows) + 1, std::abs(A.cols - B.cols) + 1, A.type());
+        cv::Size dftSize;
+
+        // compute the size of DFT transform
+        dftSize.width = cv::getOptimalDFTSize(A.cols + B.cols - 1);
+        dftSize.height = cv::getOptimalDFTSize(A.rows + B.rows - 1);
+
+        // allocate temporary buffers and initialize them with 0s
+        cv::Mat tempA(dftSize, A.type(), cv::Scalar::all(0));
+        cv::Mat tempB(dftSize, B.type(), cv::Scalar::all(0));
+
+        // copy A and B to the top-left corners of tempA and tempB, respectively
+        cv::Mat roiA(tempA, cv::Rect(0, 0, A.cols, A.rows));
+        A.copyTo(roiA);
+        cv::Mat roiB(tempB, cv::Rect(0, 0, B.cols, B.rows));
+        B.copyTo(roiB);
+
+        // now transform the padded A & B in-place;
+        // use "nonzeroRows" hint for faster processing
+        cv::dft(tempA, tempA, 0, A.rows);
+        cv::dft(tempB, tempB, 0, B.rows);
+
+        // multiply the spectrums;
+        // the function handles packed spectrum representations well
+        cv::mulSpectrums(tempA, tempB, tempA, 0, ccorr);
+
+        // transform the product back from the frequency domain.
+        // Even though all the result rows will be non-zero,
+        // you need only the first C.rows of them, and thus you
+        // pass nonzeroRows == C.rows
+        cv::dft(tempA, tempA, cv::DFT_INVERSE + cv::DFT_SCALE, C.rows);
+
+        // now copy the result back to C.
+        tempA(cv::Rect(0, 0, C.cols, C.rows)).copyTo(C);
+    }
+
+    IMPLEMENT_PARAM_CLASS(KSize, int)
+    IMPLEMENT_PARAM_CLASS(Ccorr, bool)
+}
+
+PARAM_TEST_CASE(Convolve, cv::gpu::DeviceInfo, cv::Size, KSize, Ccorr)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int ksize;
+    bool ccorr;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        ksize = GET_PARAM(2);
+        ccorr = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Convolve, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0);
+    cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::convolve(loadMat(src), loadMat(kernel), dst, ccorr);
+
+    cv::Mat dst_gold;
+    convolveDFT(src, kernel, dst_gold, ccorr);
+
+    EXPECT_MAT_NEAR(dst, dst_gold, 1e-1);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Convolve, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(KSize(3), KSize(7), KSize(11), KSize(17), KSize(19), KSize(23), KSize(45)),
+    testing::Values(Ccorr(false), Ccorr(true))));
+
 #ifdef HAVE_OPENCV_IMGPROC
 
 //////////////////////////////////////////////////////////////////////////////
-- 
2.7.4