From 21b081deff156578183f94b7ba3b94777d877fd2 Mon Sep 17 00:00:00 2001 From: Alexey Spizhevoy Date: Fri, 24 Dec 2010 09:26:19 +0000 Subject: [PATCH] now single row GPU matrix is continuous one, added aux. functions, updated dft and matchTemplates --- modules/gpu/include/opencv2/gpu/gpu.hpp | 3 + .../gpu/include/opencv2/gpu/matrix_operations.hpp | 20 ++++ modules/gpu/src/imgproc_gpu.cpp | 133 +++++---------------- modules/gpu/src/matrix_operations.cpp | 14 +++ tests/gpu/src/dft_routines.cpp | 3 +- tests/gpu/src/gputest_main.cpp | 1 + 6 files changed, 72 insertions(+), 102 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index c5d1df3..6c11c19 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -246,6 +246,9 @@ namespace cv #include "GpuMat_BetaDeprecated.hpp" #endif + //! creates continuous GPU matrix + CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m); + //////////////////////////////// CudaMem //////////////////////////////// // CudaMem is limited cv::Mat with page locked memory allocation. // Page locked memory is only needed for async and faster coping to GPU. diff --git a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp b/modules/gpu/include/opencv2/gpu/matrix_operations.hpp index 5d4f06a..569eb9a 100644 --- a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp +++ b/modules/gpu/include/opencv2/gpu/matrix_operations.hpp @@ -345,6 +345,26 @@ inline GpuMat GpuMat::t() const static inline void swap( GpuMat& a, GpuMat& b ) { a.swap(b); } +inline GpuMat createContinuous(int rows, int cols, int type) +{ + GpuMat m; + createContinuous(rows, cols, type, m); + return m; +} + +inline void createContinuous(Size size, int type, GpuMat& m) +{ + createContinuous(size.height, size.width, type, m); +} + +inline GpuMat createContinuous(Size size, int type) +{ + GpuMat m; + createContinuous(size, type, m); + return m; +} + + /////////////////////////////////////////////////////////////////////// //////////////////////////////// CudaMem //////////////////////////////// diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp index 76e079a..651565c 100644 --- a/modules/gpu/src/imgproc_gpu.cpp +++ b/modules/gpu/src/imgproc_gpu.cpp @@ -1147,38 +1147,27 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, int flags, int nonZeroRows, bo // We don't support real-to-real transform CV_Assert(is_complex_input || is_complex_output); - GpuMat src_data, src_aux; + GpuMat src_data; // Make sure here we work with the continuous input, // as CUFFT can't handle gaps - if (src.isContinuous()) - src_data = src_aux = src; - else - { - src_data = GpuMat(1, src.size().area(), src.type()); - src_aux = GpuMat(src.rows, src.cols, src.type(), src_data.ptr(), - src.cols * src.elemSize()); - src.copyTo(src_aux); + src_data = src; + createContinuous(src.rows, src.cols, src.type(), src_data); + if (src_data.data != src.data) + src.copyTo(src_data); - if (is_1d_input && !is_row_dft) - { - // If the source matrix is the single column - // reshape it into single row - int rows = std::min(src.rows, src.cols); - int cols = src.size().area() / rows; - src_aux = GpuMat(rows, cols, src.type(), src_data.ptr(), - cols * src.elemSize()); - } - } + if (is_1d_input && !is_row_dft) + // If the source matrix is single column reshape it into single row + src_data = src_data.reshape(0, std::min(src.rows, src.cols)); cufftType dft_type = CUFFT_R2C; if (is_complex_input) dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R; - int dft_rows = src_aux.rows; - int dft_cols = src_aux.cols; + int dft_rows = src_data.rows; + int dft_cols = src_data.cols; if (is_complex_input && !is_complex_output) - dft_cols = (src_aux.cols - 1) * 2 + (int)odd; + dft_cols = (src_data.cols - 1) * 2 + (int)odd; CV_Assert(dft_cols > 1); cufftHandle plan; @@ -1187,99 +1176,45 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, int flags, int nonZeroRows, bo else cufftPlan2d(&plan, dft_rows, dft_cols, dft_type); - GpuMat dst_data, dst_aux; int dst_cols, dst_rows; - bool is_dst_mem_good; if (is_complex_input) { if (is_complex_output) { - is_dst_mem_good = dst.isContinuous() && dst.type() == CV_32FC2 - && dst.cols >= src.cols && dst.rows >= src.rows; - - if (is_dst_mem_good) - dst_data = dst; - else - { - dst_data.create(1, src.size().area(), CV_32FC2); - dst_aux = GpuMat(src.rows, src.cols, dst_data.type(), dst_data.ptr(), - src.cols * dst_data.elemSize()); - } - + createContinuous(src.rows, src.cols, CV_32FC2, dst); cufftSafeCall(cufftExecC2C( - plan, src_data.ptr(), - dst_data.ptr(), + plan, src_data.ptr(), dst.ptr(), is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD)); - - if (!is_dst_mem_good) - { - dst.create(dst_aux.size(), dst_aux.type()); - dst_aux.copyTo(dst); - } } else { dst_rows = src.rows; dst_cols = (src.cols - 1) * 2 + (int)odd; - if (src_aux.size() != src.size()) + if (src_data.size() != src.size()) { dst_rows = (src.rows - 1) * 2 + (int)odd; dst_cols = src.cols; } - is_dst_mem_good = dst.isContinuous() && dst.type() == CV_32F - && dst.cols >= dst_cols && dst.rows >= dst_rows; - - if (is_dst_mem_good) - dst_data = dst; - else - { - dst_data.create(1, dst_rows * dst_cols, CV_32F); - dst_aux = GpuMat(dst_rows, dst_cols, dst_data.type(), dst_data.ptr(), - dst_cols * dst_data.elemSize()); - } - + createContinuous(dst_rows, dst_cols, CV_32F, dst); cufftSafeCall(cufftExecC2R( - plan, src_data.ptr(), dst_data.ptr())); - - if (!is_dst_mem_good) - { - dst.create(dst_aux.size(), dst_aux.type()); - dst_aux.copyTo(dst); - } + plan, src_data.ptr(), dst.ptr())); } } else { dst_rows = src.rows; dst_cols = src.cols / 2 + 1; - if (src_aux.size() != src.size()) + if (src_data.size() != src.size()) { dst_rows = src.rows / 2 + 1; dst_cols = src.cols; } - is_dst_mem_good = dst.isContinuous() && dst.type() == CV_32FC2 - && dst.cols >= dst_cols && dst.rows >= dst_rows; - - if (is_dst_mem_good) - dst_data = dst; - else - { - dst_data.create(1, dst_rows * dst_cols, CV_32FC2); - dst_aux = GpuMat(dst_rows, dst_cols, dst_data.type(), dst_data.ptr(), - dst_cols * dst_data.elemSize()); - } - + createContinuous(dst_rows, dst_cols, CV_32FC2, dst); cufftSafeCall(cufftExecR2C( - plan, src_data.ptr(), dst_data.ptr())); - - if (!is_dst_mem_good) - { - dst.create(dst_aux.size(), dst_aux.type()); - dst_aux.copyTo(dst); - } + plan, src_data.ptr(), dst.ptr())); } cufftSafeCall(cufftDestroy(plan)); @@ -1340,28 +1275,26 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, block_size.width = std::min(dft_size.width - templ.cols + 1, result.cols); block_size.height = std::min(dft_size.height - templ.rows + 1, result.rows); - GpuMat image_data(1, dft_size.area(), CV_32F); - GpuMat templ_data(1, dft_size.area(), CV_32F); - GpuMat result_data(1, dft_size.area(), CV_32F); + GpuMat result_data = createContinuous(dft_size, CV_32F); int spect_len = dft_size.height * (dft_size.width / 2 + 1); - GpuMat image_spect(1, spect_len, CV_32FC2); - GpuMat templ_spect(1, spect_len, CV_32FC2); - GpuMat result_spect(1, spect_len, CV_32FC2); + GpuMat image_spect = createContinuous(1, spect_len, CV_32FC2); + GpuMat templ_spect = createContinuous(1, spect_len, CV_32FC2); + GpuMat result_spect = createContinuous(1, spect_len, CV_32FC2); cufftHandle planR2C, planC2R; cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R)); cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); + GpuMat templ_block = createContinuous(dft_size, CV_32F); GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step); - GpuMat templ_block(dft_size, CV_32F, templ_data.ptr(), dft_size.width * sizeof(cufftReal)); copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, templ_block.cols - templ_roi.cols, 0); - cufftSafeCall(cufftExecR2C(planR2C, templ_data.ptr(), + cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr(), templ_spect.ptr())); - GpuMat image_block(dft_size, CV_32F, image_data.ptr(), dft_size.width * sizeof(cufftReal)); + GpuMat image_block = createContinuous(dft_size, CV_32F); // Process all blocks of the result matrix for (int y = 0; y < result.rows; y += block_size.height) @@ -1375,15 +1308,15 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, // Locate ROI in the source matrix GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr(y) + x), image.step); - // Make source image block continous + // Make source image block is continuous copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, 0, image_block.cols - image_roi.cols, 0); - cufftSafeCall(cufftExecR2C(planR2C, image_data.ptr(), + cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr(), image_spect.ptr())); mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0, - 1.f / dft_size.area(), ccorr); + 1.f / dft_size.area(), ccorr); cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr(), result_data.ptr())); @@ -1392,12 +1325,10 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, result_roi_size.width = std::min(x + block_size.width, result.cols) - x; result_roi_size.height = std::min(y + block_size.height, result.rows) - y; - GpuMat result_roi(result_roi_size, CV_32F, (void*)(result.ptr(y) + x), result.step); - GpuMat result_block(result_roi_size, CV_32F, result_data.ptr(), dft_size.width * sizeof(cufftReal)); + GpuMat result_roi(result_roi_size, result.type(), (void*)(result.ptr(y) + x), result.step); + GpuMat result_block(result_roi_size, result_data.type(), result_data.ptr(), result_data.step); - // Copy result block into appropriate part of the result matrix. - // We can't compute it inplace as the result of the CUFFT transforms - // is always continous, while the result matrix and its blocks can have gaps. + // Copy block into appropriate part of the result matrix result_block.copyTo(result_roi); } } diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp index 56377a2..fcedee8 100644 --- a/modules/gpu/src/matrix_operations.cpp +++ b/modules/gpu/src/matrix_operations.cpp @@ -67,6 +67,8 @@ namespace cv void GpuMat::create(int /*_rows*/, int /*_cols*/, int /*_type*/) { throw_nogpu(); } void GpuMat::release() { throw_nogpu(); } + void createContinuous(int /*rows*/, int /*cols*/, int /*type*/, GpuMat& /*m*/) { throw_nogpu(); } + void CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/) { throw_nogpu(); } bool CudaMem::canMapHostMemory() { throw_nogpu(); return false; } void CudaMem::release() { throw_nogpu(); } @@ -511,6 +513,10 @@ void cv::gpu::GpuMat::create(int _rows, int _cols, int _type) void *dev_ptr; cudaSafeCall( cudaMallocPitch(&dev_ptr, &step, esz * cols, rows) ); + // Single row must be continuous + if (rows == 1) + step = esz * cols; + if (esz * cols == step) flags |= Mat::CONTINUOUS_FLAG; @@ -537,6 +543,14 @@ void cv::gpu::GpuMat::release() refcount = 0; } +void cv::gpu::createContinuous(int rows, int cols, int type, GpuMat& m) +{ + int area = rows * cols; + if (!m.isContinuous() || m.type() != type || m.size().area() != area) + m.create(1, area, type); + m = m.reshape(0, rows); +} + /////////////////////////////////////////////////////////////////////// //////////////////////////////// CudaMem ////////////////////////////// diff --git a/tests/gpu/src/dft_routines.cpp b/tests/gpu/src/dft_routines.cpp index bdfea44..6a5ad38 100644 --- a/tests/gpu/src/dft_routines.cpp +++ b/tests/gpu/src/dft_routines.cpp @@ -411,6 +411,7 @@ struct CV_GpuDftTest: CvTest } if (ok) ok = cmp(a, Mat(d_c), rows * cols * 1e-5f); if (!ok) - ts->printf(CvTS::CONSOLE, "testR2CThenC2R failed: hint=%s, cols=%d, rows=%d\n", hint.c_str(), cols, rows); + ts->printf(CvTS::CONSOLE, "testR2CThenC2R failed: hint=%s, cols=%d, rows=%d, inplace=%d\n", + hint.c_str(), cols, rows, inplace); } } CV_GpuDftTest_inst; \ No newline at end of file diff --git a/tests/gpu/src/gputest_main.cpp b/tests/gpu/src/gputest_main.cpp index cbeb0d0..366d2ee 100644 --- a/tests/gpu/src/gputest_main.cpp +++ b/tests/gpu/src/gputest_main.cpp @@ -47,6 +47,7 @@ const char* blacklist[] = { "GPU-AsyncGpuMatOperator", // crash "GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR + "GPU-BruteForceMatcher", // often crashes when seed=000001af5a11badd 0 }; -- 2.7.4