From 21b081deff156578183f94b7ba3b94777d877fd2 Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy <no@email>
Date: Fri, 24 Dec 2010 09:26:19 +0000
Subject: [PATCH] now single row GPU matrix is continuous one, added aux.
 functions, updated dft and matchTemplates

---
 modules/gpu/include/opencv2/gpu/gpu.hpp            |   3 +
 .../gpu/include/opencv2/gpu/matrix_operations.hpp  |  20 ++++
 modules/gpu/src/imgproc_gpu.cpp                    | 133 +++++----------------
 modules/gpu/src/matrix_operations.cpp              |  14 +++
 tests/gpu/src/dft_routines.cpp                     |   3 +-
 tests/gpu/src/gputest_main.cpp                     |   1 +
 6 files changed, 72 insertions(+), 102 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index c5d1df3..6c11c19 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -246,6 +246,9 @@ namespace cv
     #include "GpuMat_BetaDeprecated.hpp"
 #endif
 
+        //! creates continuous GPU matrix
+        CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m);
+
         //////////////////////////////// CudaMem ////////////////////////////////
         // CudaMem is limited cv::Mat with page locked memory allocation.
         // Page locked memory is only needed for async and faster coping to GPU.
diff --git a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp b/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
index 5d4f06a..569eb9a 100644
--- a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
+++ b/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
@@ -345,6 +345,26 @@ inline GpuMat GpuMat::t() const
 
 static inline void swap( GpuMat& a, GpuMat& b ) { a.swap(b); }
 
+inline GpuMat createContinuous(int rows, int cols, int type)
+{
+    GpuMat m;
+    createContinuous(rows, cols, type, m);
+    return m;
+}
+
+inline void createContinuous(Size size, int type, GpuMat& m)
+{
+    createContinuous(size.height, size.width, type, m);
+}
+
+inline GpuMat createContinuous(Size size, int type)
+{
+    GpuMat m;
+    createContinuous(size, type, m);
+    return m;
+}
+
+
 
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////////// CudaMem ////////////////////////////////
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 76e079a..651565c 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -1147,38 +1147,27 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, int flags, int nonZeroRows, bo
     // We don't support real-to-real transform
     CV_Assert(is_complex_input || is_complex_output);
 
-    GpuMat src_data, src_aux;
+    GpuMat src_data;
 
     // Make sure here we work with the continuous input, 
     // as CUFFT can't handle gaps
-    if (src.isContinuous())
-        src_data = src_aux = src;
-    else
-    {
-        src_data = GpuMat(1, src.size().area(), src.type());
-        src_aux = GpuMat(src.rows, src.cols, src.type(), src_data.ptr(),
-                         src.cols * src.elemSize());
-        src.copyTo(src_aux);
+    src_data = src;
+    createContinuous(src.rows, src.cols, src.type(), src_data);
+    if (src_data.data != src.data)
+        src.copyTo(src_data);
 
-        if (is_1d_input && !is_row_dft)
-        {
-            // If the source matrix is the single column
-            // reshape it into single row
-            int rows = std::min(src.rows, src.cols);
-            int cols = src.size().area() / rows;
-            src_aux = GpuMat(rows, cols, src.type(), src_data.ptr(), 
-                             cols * src.elemSize());
-        }
-    }
+    if (is_1d_input && !is_row_dft)
+        // If the source matrix is single column reshape it into single row
+        src_data = src_data.reshape(0, std::min(src.rows, src.cols));
 
     cufftType dft_type = CUFFT_R2C;
     if (is_complex_input) 
         dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;
 
-    int dft_rows = src_aux.rows;
-    int dft_cols = src_aux.cols;
+    int dft_rows = src_data.rows;
+    int dft_cols = src_data.cols;
     if (is_complex_input && !is_complex_output)
-        dft_cols = (src_aux.cols - 1) * 2 + (int)odd;
+        dft_cols = (src_data.cols - 1) * 2 + (int)odd;
     CV_Assert(dft_cols > 1);
 
     cufftHandle plan;
@@ -1187,99 +1176,45 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, int flags, int nonZeroRows, bo
     else
         cufftPlan2d(&plan, dft_rows, dft_cols, dft_type);
 
-    GpuMat dst_data, dst_aux;
     int dst_cols, dst_rows;
-    bool is_dst_mem_good;
 
     if (is_complex_input)
     {
         if (is_complex_output)
         {
-            is_dst_mem_good = dst.isContinuous() && dst.type() == CV_32FC2 
-                              && dst.cols >= src.cols && dst.rows >= src.rows;
-
-            if (is_dst_mem_good)
-                dst_data = dst;
-            else
-            {
-                dst_data.create(1, src.size().area(), CV_32FC2);
-                dst_aux = GpuMat(src.rows, src.cols, dst_data.type(), dst_data.ptr(),
-                                 src.cols * dst_data.elemSize());
-            }
-
+            createContinuous(src.rows, src.cols, CV_32FC2, dst);
             cufftSafeCall(cufftExecC2C(
-                    plan, src_data.ptr<cufftComplex>(),
-                    dst_data.ptr<cufftComplex>(),
+                    plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
                     is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
-
-            if (!is_dst_mem_good)
-            {
-                dst.create(dst_aux.size(), dst_aux.type());
-                dst_aux.copyTo(dst);
-            }
         }
         else
         {
             dst_rows = src.rows;
             dst_cols = (src.cols - 1) * 2 + (int)odd;
-            if (src_aux.size() != src.size())
+            if (src_data.size() != src.size())
             {
                 dst_rows = (src.rows - 1) * 2 + (int)odd;
                 dst_cols = src.cols;
             }
 
-            is_dst_mem_good = dst.isContinuous() && dst.type() == CV_32F
-                              && dst.cols >= dst_cols && dst.rows >= dst_rows;
-
-            if (is_dst_mem_good)
-                dst_data = dst;
-            else
-            {
-                dst_data.create(1, dst_rows * dst_cols, CV_32F);
-                dst_aux = GpuMat(dst_rows, dst_cols, dst_data.type(), dst_data.ptr(), 
-                                 dst_cols * dst_data.elemSize());
-            }
-
+            createContinuous(dst_rows, dst_cols, CV_32F, dst);
             cufftSafeCall(cufftExecC2R(
-                    plan, src_data.ptr<cufftComplex>(), dst_data.ptr<cufftReal>()));
-
-            if (!is_dst_mem_good)
-            {
-                dst.create(dst_aux.size(), dst_aux.type());
-                dst_aux.copyTo(dst);
-            }
+                    plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
         }
     }
     else
     {
         dst_rows = src.rows;
         dst_cols = src.cols / 2 + 1;
-        if (src_aux.size() != src.size())
+        if (src_data.size() != src.size())
         {
             dst_rows = src.rows / 2 + 1;
             dst_cols = src.cols;
         }
 
-        is_dst_mem_good = dst.isContinuous() && dst.type() == CV_32FC2 
-                          && dst.cols >= dst_cols && dst.rows >= dst_rows;
-
-        if (is_dst_mem_good)
-            dst_data = dst;
-        else
-        {
-            dst_data.create(1, dst_rows * dst_cols, CV_32FC2);
-            dst_aux = GpuMat(dst_rows, dst_cols, dst_data.type(), dst_data.ptr(), 
-                             dst_cols * dst_data.elemSize());
-        }
-
+        createContinuous(dst_rows, dst_cols, CV_32FC2, dst);
         cufftSafeCall(cufftExecR2C(
-                plan, src_data.ptr<cufftReal>(), dst_data.ptr<cufftComplex>()));
-
-        if (!is_dst_mem_good)
-        {
-            dst.create(dst_aux.size(), dst_aux.type());
-            dst_aux.copyTo(dst);
-        }
+                plan, src_data.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
     }
 
     cufftSafeCall(cufftDestroy(plan));
@@ -1340,28 +1275,26 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
     block_size.width = std::min(dft_size.width - templ.cols + 1, result.cols);
     block_size.height = std::min(dft_size.height - templ.rows + 1, result.rows);
 
-    GpuMat image_data(1, dft_size.area(), CV_32F);
-    GpuMat templ_data(1, dft_size.area(), CV_32F);
-    GpuMat result_data(1, dft_size.area(), CV_32F);
+    GpuMat result_data = createContinuous(dft_size, CV_32F);
 
     int spect_len = dft_size.height * (dft_size.width / 2 + 1);
-    GpuMat image_spect(1, spect_len, CV_32FC2);
-    GpuMat templ_spect(1, spect_len, CV_32FC2);
-    GpuMat result_spect(1, spect_len, CV_32FC2);
+    GpuMat image_spect = createContinuous(1, spect_len, CV_32FC2);
+    GpuMat templ_spect = createContinuous(1, spect_len, CV_32FC2);
+    GpuMat result_spect = createContinuous(1, spect_len, CV_32FC2);
 
     cufftHandle planR2C, planC2R;
     cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
     cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
 
+    GpuMat templ_block = createContinuous(dft_size, CV_32F);
     GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
-    GpuMat templ_block(dft_size, CV_32F, templ_data.ptr(), dft_size.width * sizeof(cufftReal));
     copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, 
                    templ_block.cols - templ_roi.cols, 0);
 
-    cufftSafeCall(cufftExecR2C(planR2C, templ_data.ptr<cufftReal>(), 
+    cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(), 
                                templ_spect.ptr<cufftComplex>()));
 
-    GpuMat image_block(dft_size, CV_32F, image_data.ptr(), dft_size.width * sizeof(cufftReal));
+    GpuMat image_block = createContinuous(dft_size, CV_32F);
 
     // Process all blocks of the result matrix
     for (int y = 0; y < result.rows; y += block_size.height)
@@ -1375,15 +1308,15 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
             // Locate ROI in the source matrix
             GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x), image.step);
 
-            // Make source image block continous
+            // Make source image block is continuous
             copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, 0, 
                            image_block.cols - image_roi.cols, 0);
 
-            cufftSafeCall(cufftExecR2C(planR2C, image_data.ptr<cufftReal>(), 
+            cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(), 
                                        image_spect.ptr<cufftComplex>()));
 
             mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
-                                1.f / dft_size.area(), ccorr);
+                                 1.f / dft_size.area(), ccorr);
 
             cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(), 
                                        result_data.ptr<cufftReal>()));
@@ -1392,12 +1325,10 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
             result_roi_size.width = std::min(x + block_size.width, result.cols) - x;
             result_roi_size.height = std::min(y + block_size.height, result.rows) - y;
 
-            GpuMat result_roi(result_roi_size, CV_32F, (void*)(result.ptr<float>(y) + x), result.step);
-            GpuMat result_block(result_roi_size, CV_32F, result_data.ptr(), dft_size.width * sizeof(cufftReal));
+            GpuMat result_roi(result_roi_size, result.type(), (void*)(result.ptr<float>(y) + x), result.step);
+            GpuMat result_block(result_roi_size, result_data.type(), result_data.ptr(), result_data.step);
 
-            // Copy result block into appropriate part of the result matrix.
-            // We can't compute it inplace as the result of the CUFFT transforms
-            // is always continous, while the result matrix and its blocks can have gaps.
+            // Copy block into appropriate part of the result matrix
             result_block.copyTo(result_roi);
         }
     }
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index 56377a2..fcedee8 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -67,6 +67,8 @@ namespace cv
         void GpuMat::create(int /*_rows*/, int /*_cols*/, int /*_type*/) { throw_nogpu(); }
         void GpuMat::release() { throw_nogpu(); }
 
+        void createContinuous(int /*rows*/, int /*cols*/, int /*type*/, GpuMat& /*m*/) { throw_nogpu(); }
+
         void CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/) { throw_nogpu(); }
         bool CudaMem::canMapHostMemory() { throw_nogpu(); return false; }
         void CudaMem::release() { throw_nogpu(); }
@@ -511,6 +513,10 @@ void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
         void *dev_ptr;
         cudaSafeCall( cudaMallocPitch(&dev_ptr, &step, esz * cols, rows) );
 
+        // Single row must be continuous
+        if (rows == 1)
+            step = esz * cols;
+
         if (esz * cols == step)
             flags |= Mat::CONTINUOUS_FLAG;
 
@@ -537,6 +543,14 @@ void cv::gpu::GpuMat::release()
     refcount = 0;
 }
 
+void cv::gpu::createContinuous(int rows, int cols, int type, GpuMat& m)
+{
+    int area = rows * cols;
+    if (!m.isContinuous() || m.type() != type || m.size().area() != area)
+        m.create(1, area, type);
+    m = m.reshape(0, rows);
+}
+
 
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////////// CudaMem //////////////////////////////
diff --git a/tests/gpu/src/dft_routines.cpp b/tests/gpu/src/dft_routines.cpp
index bdfea44..6a5ad38 100644
--- a/tests/gpu/src/dft_routines.cpp
+++ b/tests/gpu/src/dft_routines.cpp
@@ -411,6 +411,7 @@ struct CV_GpuDftTest: CvTest
         }
         if (ok) ok = cmp(a, Mat(d_c), rows * cols * 1e-5f);
         if (!ok) 
-            ts->printf(CvTS::CONSOLE, "testR2CThenC2R failed: hint=%s, cols=%d, rows=%d\n", hint.c_str(), cols, rows);
+            ts->printf(CvTS::CONSOLE, "testR2CThenC2R failed: hint=%s, cols=%d, rows=%d, inplace=%d\n", 
+                       hint.c_str(), cols, rows, inplace);
     }
 } CV_GpuDftTest_inst;
\ No newline at end of file
diff --git a/tests/gpu/src/gputest_main.cpp b/tests/gpu/src/gputest_main.cpp
index cbeb0d0..366d2ee 100644
--- a/tests/gpu/src/gputest_main.cpp
+++ b/tests/gpu/src/gputest_main.cpp
@@ -47,6 +47,7 @@ const char* blacklist[] =
 {
     "GPU-AsyncGpuMatOperator",     // crash
     "GPU-NppImageCanny",            // NPP_TEXTURE_BIND_ERROR
+    "GPU-BruteForceMatcher", // often crashes when seed=000001af5a11badd
     0
 };
 
-- 
2.7.4