From 50b72197ab672d808e6883bd10294dc5a0edab69 Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy <no@email>
Date: Mon, 29 Aug 2011 08:06:45 +0000
Subject: [PATCH] fixed bug in gpu::matchTemplate (added normalization routine
 to make the GPU version consistent with the CPU one), added test cases from
 the ticket #1341

---
 modules/gpu/src/cuda/match_template.cu | 84 ++++++++++++++++++++++------------
 modules/gpu/test/test_imgproc.cpp      | 81 ++++++++++++++++++++++++--------
 2 files changed, 117 insertions(+), 48 deletions(-)
diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu
index 301a0b6..3f8e92a 100644
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@@ -313,6 +313,29 @@ void matchTemplatePrepared_SQDIFF_8U(
 }
 
 
+// normAcc* are accurate normalization routines which make GPU matchTemplate
+// consistent with CPU one
+
+__device__ float normAcc(float num, float denum)
+{
+    if (fabs(num) < denum)
+        return num / denum;
+    if (fabs(num) < denum * 1.125f)
+        return num > 0 ? 1 : -1;
+    return 0;
+}
+
+
+__device__ float normAcc_SQDIFF(float num, float denum)
+{
+    if (fabs(num) < denum)
+        return num / denum;
+    if (fabs(num) < denum * 1.125f)
+        return num > 0 ? 1 : -1;
+    return 1;
+}
+
+
 template <int cn>
 __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
         int w, int h, const PtrStep_<unsigned long long> image_sqsum, 
@@ -327,8 +350,8 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
                 (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                 (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
         float ccorr = result.ptr(y)[x];
-        result.ptr(y)[x] = min(1.f, (image_sqsum_ - 2.f * ccorr + templ_sqsum) * 
-                           rsqrtf(image_sqsum_ * templ_sqsum));
+        result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
+                                          sqrtf(image_sqsum_ * templ_sqsum));
     }
 }
 
@@ -440,7 +463,7 @@ void matchTemplatePrepared_CCOFF_8UC2(
 
 __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
         int w, int h, 
-        float templ_sum_scale_r, 
+        float templ_sum_scale_r,
         float templ_sum_scale_g,
         float templ_sum_scale_b,
         const PtrStep_<unsigned int> image_sum_r,
@@ -463,7 +486,7 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
                 (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                 (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
         float ccorr = result.ptr(y)[x];
-        result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r 
+        result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                  - image_sum_g_ * templ_sum_scale_g
                                  - image_sum_b_ * templ_sum_scale_b;
     }
@@ -484,8 +507,8 @@ void matchTemplatePrepared_CCOFF_8UC3(
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
     matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads>>>(
             w, h, 
-            (float)templ_sum_r / (w * h), 
-            (float)templ_sum_g / (w * h), 
+            (float)templ_sum_r / (w * h),
+            (float)templ_sum_g / (w * h),
             (float)templ_sum_b / (w * h),
             image_sum_r, image_sum_g, image_sum_b, result);
     cudaSafeCall( cudaGetLastError() );
@@ -579,8 +602,8 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
         float image_sqsum_ = (float)(
                 (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
                 (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
-        result.ptr(y)[x] = (ccorr - image_sum_ * templ_sum_scale) * 
-                           rsqrtf(templ_sqsum_scale * max(1e-3f, image_sqsum_ - weight * image_sum_ * image_sum_));
+        result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
+                                   sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
     }
 }
 
@@ -631,11 +654,12 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
         float image_sqsum_g_ = (float)(
                 (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                 (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
-        float ccorr = result.ptr(y)[x];
-        float rdenom = rsqrtf(templ_sqsum_scale * max(1e-3f, image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_ 
-                                                             + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
-        result.ptr(y)[x] = (ccorr - image_sum_r_ * templ_sum_scale_r
-                                  - image_sum_g_ * templ_sum_scale_g) * rdenom;
+
+        float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
+                                     - image_sum_g_ * templ_sum_scale_g;
+        float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+                                                 + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
+        result.ptr(y)[x] = normAcc(num, denum);
     }
 }
 
@@ -701,13 +725,14 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
         float image_sqsum_b_ = (float)(
                 (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                 (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
-        float ccorr = result.ptr(y)[x];
-        float rdenom = rsqrtf(templ_sqsum_scale * max(1e-3f, image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_ 
-                                                             + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
-                                                             + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
-        result.ptr(y)[x] = (ccorr - image_sum_r_ * templ_sum_scale_r
-                                  - image_sum_g_ * templ_sum_scale_g
-                                  - image_sum_b_ * templ_sum_scale_b) * rdenom;
+
+        float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
+                                     - image_sum_g_ * templ_sum_scale_g
+                                     - image_sum_b_ * templ_sum_scale_b;
+        float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+                                                 + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
+                                                 + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
+        result.ptr(y)[x] = normAcc(num, denum);
     }
 }
 
@@ -785,15 +810,14 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
         float image_sqsum_a_ = (float)(
                 (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
                 (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));
-        float ccorr = result.ptr(y)[x];
-        float rdenom = rsqrtf(templ_sqsum_scale * max(1e-3f, image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
-                                                             + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
-                                                             + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
-                                                             + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
-        result.ptr(y)[x] = (ccorr - image_sum_r_ * templ_sum_scale_r
-                                  - image_sum_g_ * templ_sum_scale_g
-                                  - image_sum_b_ * templ_sum_scale_b
-                                  - image_sum_a_ * templ_sum_scale_a) * rdenom;
+
+        float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
+                                     - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
+        float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+                                                 + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
+                                                 + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
+                                                 + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
+        result.ptr(y)[x] = normAcc(num, denum);
     }
 }
 
@@ -850,7 +874,7 @@ __global__ void normalizeKernel_8U(
         float image_sqsum_ = (float)(
                 (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                 (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
-        result.ptr(y)[x] = result.ptr(y)[x] * rsqrtf(max(1.f, image_sqsum_) * templ_sqsum);
+        result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
     }
 }
 
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index ab01142..169a10b 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -3594,13 +3594,8 @@ INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate32F, testing::Combine(
                         testing::Range(1, 5), 
                         testing::Values((int)CV_TM_SQDIFF, (int)CV_TM_CCORR)));
 
-struct MatchTemplate : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int> >
+struct MatchTemplateBlackSource : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int> >
 {
-    cv::Mat image;
-    cv::Mat pattern;
-
-    cv::Point maxLocGold;
-
     cv::gpu::DeviceInfo devInfo;
     int method;
 
@@ -3608,26 +3603,25 @@ struct MatchTemplate : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceIn
     {
         devInfo = std::tr1::get<0>(GetParam());
         method = std::tr1::get<1>(GetParam());
-
         cv::gpu::setDevice(devInfo.deviceID());
-        
-        image = readImage("matchtemplate/black.png");
-        ASSERT_FALSE(image.empty());
-        
-        pattern = readImage("matchtemplate/cat.png");
-        ASSERT_FALSE(pattern.empty());
-
-        maxLocGold = cv::Point(284, 12);
     }
 };
 
-TEST_P(MatchTemplate, FindPatternInBlack)
+TEST_P(MatchTemplateBlackSource, Accuracy)
 {
     const char* matchTemplateMethodStr = matchTemplateMethods[method];
 
     PRINT_PARAM(devInfo);
     PRINT_PARAM(matchTemplateMethodStr);
 
+    cv::Mat image = readImage("matchtemplate/black.png");
+    ASSERT_FALSE(image.empty());
+
+    cv::Mat pattern = readImage("matchtemplate/cat.png");
+    ASSERT_FALSE(pattern.empty());
+
+    cv::Point maxLocGold = cv::Point(284, 12);
+
     cv::Mat dst;
 
     ASSERT_NO_THROW(
@@ -3643,10 +3637,61 @@ TEST_P(MatchTemplate, FindPatternInBlack)
     ASSERT_EQ(maxLocGold, maxLoc);
 }
 
-INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate, testing::Combine(
-                        testing::ValuesIn(devices()), 
+INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplateBlackSource, testing::Combine(
+                        testing::ValuesIn(devices()),
                         testing::Values((int)CV_TM_CCOEFF_NORMED, (int)CV_TM_CCORR_NORMED)));
 
+
+struct MatchTemplate_CCOEF_NORMED : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, std::tr1::tuple<const char*, const char*> > >
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Mat image, pattern;
+
+    virtual void SetUp()
+    {
+        devInfo = std::tr1::get<0>(GetParam());
+
+        image = readImage(std::tr1::get<0>(std::tr1::get<1>(GetParam())));
+        ASSERT_FALSE(image.empty());
+
+        pattern = readImage(std::tr1::get<1>(std::tr1::get<1>(GetParam())));
+        ASSERT_FALSE(pattern.empty());
+    }
+};
+
+TEST_P(MatchTemplate_CCOEF_NORMED, Accuracy)
+{
+    PRINT_PARAM(devInfo);
+
+    cv::Mat dstGold;
+    cv::matchTemplate(image, pattern, dstGold, CV_TM_CCOEFF_NORMED);
+    cv::Point minLocGold, maxLocGold;
+    cv::minMaxLoc(dstGold, NULL, NULL, &minLocGold, &maxLocGold);
+
+    cv::Mat dst;
+    ASSERT_NO_THROW(
+        cv::gpu::GpuMat dev_dst;
+        cv::gpu::matchTemplate(cv::gpu::GpuMat(image), cv::gpu::GpuMat(pattern), dev_dst, CV_TM_CCOEFF_NORMED);
+        dev_dst.download(dst);
+    );
+
+    cv::Point minLoc, maxLoc;
+    double minVal, maxVal;
+    cv::minMaxLoc(dst, &minVal, &maxVal, &minLoc, &maxLoc);
+
+    ASSERT_EQ(minLocGold, minLoc);
+    ASSERT_EQ(maxLocGold, maxLoc);
+    ASSERT_LE(maxVal, 1.);
+    ASSERT_GE(minVal, -1.);
+}
+
+
+INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate_CCOEF_NORMED, testing::Combine(
+                        testing::ValuesIn(devices()),
+                        testing::Values(std::tr1::make_tuple("matchtemplate/source-0.png", "matchtemplate/target-0.png"),
+                                        std::tr1::make_tuple("matchtemplate/source-1.png", "matchtemplate/target-1.png"))));
+
+
 ////////////////////////////////////////////////////////////////////////////
 // MulSpectrums
 
-- 
2.7.4