From 39700c5d541338d3c6108444d0961eed6f85634a Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy
Date: Mon, 13 Dec 2010 16:48:34 +0000
Subject: [PATCH] added some gpu::matchTemplate kernels (other parts after NPP
 Staging integration)

---
 modules/gpu/src/cuda/match_template.cu | 72 ++++++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 8 deletions(-)

diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu
index f048775..abfd8c3 100644
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@@ -175,7 +175,7 @@ void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a,
 
 
 __global__ void matchTemplatePreparedKernel_8U_SQDIFF(
-        int w, int h, const PtrStepf image_sumsq, float templ_sumsq,
+        int w, int h, const PtrStep_<unsigned long long> image_sqsum, float templ_sqsum,
         DevMem2Df result)
 {
     const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -183,24 +183,80 @@ __global__ void matchTemplatePreparedKernel_8U_SQDIFF(
 
     if (x < result.cols && y < result.rows)
     {
-        float image_sq = image_sumsq.ptr(y + h)[x + w]
-                         - image_sumsq.ptr(y)[x + w]
-                         - image_sumsq.ptr(y + h)[x]
-                         + image_sumsq.ptr(y)[x];
+        float image_sq = (float)(
+                (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
+                (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
         float ccorr = result.ptr(y)[x];
-        result.ptr(y)[x] = image_sq - 2.f * ccorr + templ_sumsq;
+        result.ptr(y)[x] = image_sq - 2.f * ccorr + templ_sqsum;
     }
 }
 
 
 void matchTemplatePrepared_8U_SQDIFF(
-        int w, int h, const DevMem2Df image_sumsq, float templ_sumsq,
+        int w, int h, const DevMem2D_<unsigned long long> image_sqsum, float templ_sqsum,
         DevMem2Df result)
 {
     dim3 threads(32, 8);
     dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
     matchTemplatePreparedKernel_8U_SQDIFF<<<grid, threads>>>(
-            w, h, image_sumsq, templ_sumsq, result);
+            w, h, image_sqsum, templ_sqsum, result);
+    cudaSafeCall(cudaThreadSynchronize());
+}
+
+
+__global__ void matchTemplatePreparedKernel_8U_SQDIFF_NORMED(
+        int w, int h, const PtrStep_<unsigned long long> image_sqsum, float templ_sqsum,
+        DevMem2Df result)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < result.cols && y < result.rows)
+    {
+        float image_sq = (float)(
+                (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
+                (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
+        float ccorr = result.ptr(y)[x];
+        result.ptr(y)[x] = (image_sq - 2.f * ccorr + templ_sqsum) *
+                           rsqrtf(image_sq * templ_sqsum);
+    }
+}
+
+
+void matchTemplatePrepared_8U_SQDIFF_NORMED(
+        int w, int h, const DevMem2D_<unsigned long long> image_sqsum, float templ_sqsum,
+        DevMem2Df result)
+{
+    dim3 threads(32, 8);
+    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
+    matchTemplatePreparedKernel_8U_SQDIFF_NORMED<<<grid, threads>>>(
+            w, h, image_sqsum, templ_sqsum, result);
+    cudaSafeCall(cudaThreadSynchronize());
+}
+
+
+__global__ void normalizeKernel_8U(int w, int h, const PtrStep_<unsigned long long> image_sqsum,
+                                   float templ_sqsum, DevMem2Df result)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < result.cols && y < result.rows)
+    {
+        float image_sq = (float)(
+                (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
+                (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
+        result.ptr(y)[x] *= rsqrtf(image_sq * templ_sqsum);
+    }
+}
+
+
+void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
+                  float templ_sqsum, DevMem2Df result)
+{
+    dim3 threads(32, 8);
+    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
+    normalizeKernel_8U<<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
     cudaSafeCall(cudaThreadSynchronize());
 }
 
-- 
2.7.4
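
Note for reviewers (not part of the patch): the prepared SQDIFF kernels above rely on the
identity sum((I - T)^2) = sum(I^2) - 2*sum(I*T) + sum(T^2), where sum(I*T) is the
cross-correlation value already stored in result and sum(I^2) is read from the integral
image of squared pixel values passed in as image_sqsum. Below is a minimal CPU-side sketch
of that arithmetic; the function name and the raw-pointer layout are illustrative only and
are not part of the patch or the OpenCV API.

    // Reference (CPU) version of the arithmetic in matchTemplatePreparedKernel_8U_SQDIFF.
    // sqsum points to a (rows + 1) x (cols + 1) integral image of squared pixel values,
    // with row stride `step` given in elements (hypothetical layout, for illustration).
    float sqdiffPreparedCPU(const unsigned long long* sqsum, int step,
                            int x, int y, int w, int h,
                            float ccorr, float templ_sqsum)
    {
        // Sum of squared image values over the w x h window at (x, y),
        // read from the integral image exactly as the kernel does.
        float image_sq = (float)(
            (sqsum[(y + h) * step + (x + w)] - sqsum[y * step + (x + w)]) -
            (sqsum[(y + h) * step + x]       - sqsum[y * step + x]));

        // sum((I - T)^2) = sum(I^2) - 2 * sum(I*T) + sum(T^2).
        // The SQDIFF_NORMED kernel additionally multiplies this value by
        // rsqrtf(image_sq * templ_sqsum), i.e. divides by sqrt(sum(I^2) * sum(T^2)),
        // and normalizeKernel_8U applies the same scaling to an existing ccorr map.
        return image_sq - 2.f * ccorr + templ_sqsum;
    }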