From e5eec31be14e7c3a8090015fc299a727d9b3f84a Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy <no@email>
Date: Wed, 15 Dec 2010 12:10:30 +0000
Subject: [PATCH] fixed minor bugs in gpu module

---
 modules/gpu/src/cuda/match_template.cu | 44 ++++++++++++++++++++++++++++++++--
 tests/gpu/src/arithm.cpp               |  2 +-
 2 files changed, 43 insertions(+), 3 deletions(-)
diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu
index 575f16c..4e388a5 100644
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@@ -42,7 +42,7 @@
 
 #include <cufft.h>
 #include "internal_shared.hpp"
-#include "../opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/vecmath.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -386,10 +386,10 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8U(
 
     if (x < result.cols && y < result.rows)
     {
-        float ccorr = result.ptr(y)[x];
         float image_sum_ = (float)(
                 (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                 (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
+        float ccorr = result.ptr(y)[x];
         result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
     }
 }
@@ -407,6 +407,46 @@ void matchTemplatePrepared_CCOFF_8U(
 }
 
 
+// Per result pixel: subtracts the scaled four-corner difference of each image_sum_*.
+__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
+        int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
+        const PtrStep_<unsigned int> image_sum_r,
+        const PtrStep_<unsigned int> image_sum_g,
+        DevMem2Df result)
+{
+    const int x = threadIdx.x + blockDim.x * blockIdx.x;
+    const int y = threadIdx.y + blockDim.y * blockIdx.y;
+    if (x >= result.cols || y >= result.rows)
+        return;  // no result element for this thread
+
+    // Reads rows y and y + h, columns x and x + w of each table.
+    const float win_r = static_cast<float>(
+            (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
+            (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
+    const float win_g = static_cast<float>(
+            (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
+            (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
+    const float prev = result.ptr(y)[x];
+    result.ptr(y)[x] = prev - win_r * templ_sum_scale_r - win_g * templ_sum_scale_g;
+}
+
+
+// Launches the 8UC2 CCOFF kernel on 32x8 blocks, then calls cudaThreadSynchronize.
+void matchTemplatePrepared_CCOFF_8UC2(
+        int w, int h, const DevMem2D_<unsigned int> image_sum_r,
+        const DevMem2D_<unsigned int> image_sum_g,
+        unsigned int templ_sum_r, unsigned int templ_sum_g, DevMem2Df result)
+{
+    const dim3 block(32, 8);
+    const dim3 blocks(divUp(result.cols, block.x), divUp(result.rows, block.y));
+    const float scale_r = (float)templ_sum_r / (w * h);  // template sum over w * h
+    const float scale_g = (float)templ_sum_g / (w * h);
+    matchTemplatePreparedKernel_CCOFF_8UC2<<<blocks, block>>>(
+            w, h, scale_r, scale_g, image_sum_r, image_sum_g, result);
+    cudaSafeCall(cudaThreadSynchronize());
+}
+
+
 __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
         int w, int h, float weight, 
         float templ_sum_scale, float templ_sqsum_scale,
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp
index abe76a1..944ec09 100644
--- a/tests/gpu/src/arithm.cpp
+++ b/tests/gpu/src/arithm.cpp
@@ -945,7 +945,7 @@ struct CV_GpuSumTest: CvTest
             int typemax = hasNativeDoubleSupport(getDevice()) ? CV_64F : CV_32F;
             for (int type = CV_8U; type <= typemax; ++type) 
             {
-                gen(1 + rand() % 1000, 1 + rand() % 1000, type, src);
+                gen(1 + rand() % 500, 1 + rand() % 500, type, src);
                 a = sum(src);
                 b = sum(GpuMat(src));
                 if (abs(a[0] - b[0]) > src.size().area() * max_err)
-- 
2.7.4