more fix of mismatch

author yao <bitwangyaoyao@gmail.com>

Tue, 26 Mar 2013 06:10:29 +0000 (14:10 +0800)

committer yao <bitwangyaoyao@gmail.com>

Tue, 26 Mar 2013 06:10:29 +0000 (14:10 +0800)
author yao <bitwangyaoyao@gmail.com>
Tue, 26 Mar 2013 06:10:29 +0000 (14:10 +0800)
committer yao <bitwangyaoyao@gmail.com>
Tue, 26 Mar 2013 06:10:29 +0000 (14:10 +0800)
diff --git a/modules/ocl/src/match_template.cpp b/modules/ocl/src/match_template.cpp

index ab867d4..1f76d63 100644 (file)
--- a/modules/ocl/src/match_template.cpp
+++ b/modules/ocl/src/match_template.cpp
@@ -71,6 +71,9 @@ namespace cv
          void matchTemplate_SQDIFF_NORMED(
              const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  
+        void convolve_32F(
+            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
+
          void matchTemplate_CCORR(
              const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  
@@ -90,41 +93,65 @@ namespace cv
          void matchTemplateNaive_CCORR(
              const oclMat &image, const oclMat &templ, oclMat &result, int cn);
  
+        void extractFirstChannel_32F(
+            const oclMat &image, oclMat &result);
+
          // Evaluates optimal template's area threshold. If
          // template's area is less  than the threshold, we use naive match
          // template version, otherwise FFT-based (if available)
-        static int getTemplateThreshold(int method, int depth)
+        static bool useNaive(int , int , Size ) // all three parameters are unnamed and unused in this stub
          {
-            switch (method)
-            {
-            case CV_TM_CCORR:
-                if (depth == CV_32F) return 250;
-                if (depth == CV_8U) return 300;
-                break;
-            case CV_TM_SQDIFF:
-                if (depth == CV_32F) return 0x7fffffff; // do naive SQDIFF for CV_32F
-                if (depth == CV_8U) return 300;
-                break;
-            }
-            CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode");
-            return 0;
+            // FIXME: the arguments are ignored and the naive path is always
+            //   selected until convolve is imported
+            return true; 
          }
  
          //////////////////////////////////////////////////////////////////////
          // SQDIFF
          void matchTemplate_SQDIFF(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &)
+            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf) // buf is only touched on the non-naive path
          {
              result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-            if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
+            if (useNaive(CV_TM_SQDIFF, image.depth(), templ.size())) // useNaive currently always returns true, so the else branch is not reached
              {
                  matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
                  return;
              }
              else
              {
-                // TODO
-                CV_Error(CV_StsBadArg, "Not supported yet for this size template");
+                buf.image_sqsums.resize(1);
+                
+                // TODO: add double support to ocl::integral;
+                // until then the integral images are computed on the CPU
+                Mat sums, sqsums; // sums is filled by cv::integral but not used below
+                cv::integral(Mat(image.reshape(1)), sums, sqsums);
+                buf.image_sqsums[0] = sqsums;
+
+                unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0]; // squared-sum of the template, truncated to an integer
+                matchTemplate_CCORR(image, templ, result, buf); // the kernel below reads and overwrites result in place
+
+                // port of CUDA's matchTemplatePrepared_SQDIFF_8U
+                Context *clCxt = image.clCxt;
+                string kernelName = "matchTemplate_Prepared_SQDIFF";
+                vector< pair<size_t, const void *> > args; // stores pointers only; the pointed-to objects must stay alive until the launch below
+
+                args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
+                args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+                args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+                size_t globalThreads[3] = {result.cols, result.rows, 1}; // one work-item per result element
+                size_t localThreads[3]  = {16, 16, 1};
+
+                const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : ""; // NOTE(review): the .cl file #defines CN4 again when it is already defined - confirm this does not trip a macro redefinition
+                openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
              }
          }
  
@@ -134,7 +161,6 @@ namespace cv
              matchTemplate_CCORR(image, templ, result, buf);
              buf.image_sums.resize(1);
  
-
              integral(image.reshape(1), buf.image_sums[0]);
  
              unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
@@ -156,7 +182,7 @@ namespace cv
              args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
  
              size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};
              openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
          }
  
@@ -191,33 +217,39 @@ namespace cv
              args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
  
              size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};
              openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
          }
  
          //////////////////////////////////////////////////////////////////////
          // CCORR
+        void convolve_32F(
+            const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &)
+        { // stub: no convolution is performed and none of the arguments are used
+            CV_Error(-1, "convolve is not fully implemented yet"); // raised unconditionally on every call
+        }
+
          void matchTemplate_CCORR(
              const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
          {
              result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-            if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
+            if (useNaive(CV_TM_CCORR, image.depth(), templ.size())) // useNaive currently always returns true, so the else branch is not reached
              {
                  matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
                  return;
              }
              else
              {
-                CV_Error(CV_StsBadArg, "Not supported yet for this size template");
                  if(image.depth() == CV_8U && templ.depth() == CV_8U)
                  {
                      image.convertTo(buf.imagef, CV_32F);
                      templ.convertTo(buf.templf, CV_32F);
+                    convolve_32F(buf.imagef, buf.templf, result, buf); // 8-bit inputs: pass the CV_32F copies held in buf
+                }
+                else
+                {
+                    convolve_32F(image, templ, result, buf); // any other depth combination: inputs are passed through unconverted
                  }
-                CV_Assert(image.oclchannels() == 1);
-                oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.oclchannels()));
-                filter2D(buf.imagef, o_result, CV_32F, buf.templf, Point(0, 0));
-                result = o_result(Rect(0, 0, image.rows - templ.rows + 1, image.cols - templ.cols + 1));
              }
          }
  
@@ -249,7 +281,7 @@ namespace cv
              args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
  
              size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};
              openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
          }
  
@@ -284,7 +316,7 @@ namespace cv
              args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
  
              size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};
              openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
          }
          //////////////////////////////////////////////////////////////////////
@@ -301,7 +333,7 @@ namespace cv
  
              kernelName = "matchTemplate_Prepared_CCOFF";
              size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};
  
              vector< pair<size_t, const void *> > args;
              args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
@@ -313,22 +345,22 @@ namespace cv
              args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
              args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
              args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+            Vec4f templ_sum = Vec4f::all(0);
              // to be continued in the following section
              if(image.oclchannels() == 1)
              {
                  buf.image_sums.resize(1);
                  integral(image, buf.image_sums[0]);
  
-                float templ_sum = 0;
-                templ_sum = (float)sum(templ)[0] / templ.size().area();
+                templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
                  args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
                  args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
                  args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
-                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) );
+                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
              }
              else
              {
-                Vec4f templ_sum = Vec4f::all(0);
+                
                  split(image, buf.images);
                  templ_sum = sum(templ) / templ.size().area();
                  buf.image_sums.resize(buf.images.size());
@@ -374,7 +406,7 @@ namespace cv
  
              kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
              size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};
  
              vector< pair<size_t, const void *> > args;
              args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
@@ -387,20 +419,22 @@ namespace cv
              args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
              args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
              args.push_back( make_pair( sizeof(cl_float), (void *)&scale) );
+
+            Vec4f templ_sum   = Vec4f::all(0);
+            Vec4f templ_sqsum = Vec4f::all(0);
              // to be continued in the following section
              if(image.oclchannels() == 1)
              {
                  buf.image_sums.resize(1);
                  buf.image_sqsums.resize(1);
                  integral(image, buf.image_sums[0], buf.image_sqsums[0]);
-                float templ_sum = 0;
-                float templ_sqsum = 0;
-                templ_sum   = (float)sum(templ)[0];
  
-                templ_sqsum = sqrSum(templ)[0];
+                templ_sum[0]   = (float)sum(templ)[0];
  
-                templ_sqsum -= scale * templ_sum * templ_sum;
-                templ_sum   *= scale;
+                templ_sqsum[0] = sqrSum(templ)[0];
+
+                templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
+                templ_sum[0]   *= scale;
  
                  args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
                  args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
@@ -408,13 +442,11 @@ namespace cv
                  args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
                  args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
                  args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
-                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) );
-                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum) );
+                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
+                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
              }
              else
              {
-                Vec4f templ_sum   = Vec4f::all(0);
-                Vec4f templ_sqsum = Vec4f::all(0);
  
                  split(image, buf.images);
                  templ_sum   = sum(templ);
@@ -465,7 +497,27 @@ namespace cv
              }
              openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
          }
+        void extractFirstChannel_32F(const oclMat &image, oclMat &result)
+        { // launches the "extractFirstChannel" kernel; result is not allocated here, so presumably the caller sizes it - confirm
+            Context *clCxt = image.clCxt;
+            string kernelName;
+
+            kernelName = "extractFirstChannel";
+            size_t globalThreads[3] = {result.cols, result.rows, 1}; // one work-item per result element
+            size_t localThreads[3]  = {16, 16, 1};
  
+            vector< pair<size_t, const void *> > args; // pushed in the same order as the kernel's parameter list
+            args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data) );
+            args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
+            args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
+            args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
+            args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
+            args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+            args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
+            args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1); // NOTE(review): -1, -1 presumably means no channel/depth suffix is appended to the kernel name - confirm
+        }
      }/*ocl*/
  } /*cv*/
  
diff --git a/modules/ocl/src/opencl/match_template.cl b/modules/ocl/src/opencl/match_template.cl

index 3133e62..857f891 100644 (file)
--- a/modules/ocl/src/opencl/match_template.cl
+++ b/modules/ocl/src/opencl/match_template.cl
@@ -45,22 +45,28 @@
  
  #pragma OPENCL EXTENSION cl_amd_printf : enable
  
-#if defined (__ATI__)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#if defined (DOUBLE_SUPPORT)
  
-#elif defined (__NVIDIA__)
+#ifdef cl_khr_fp64
  #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
  #endif
  
-#if !defined(USE_SQR_INTEGRAL) && (defined (__ATI__) || defined (__NVIDIA__))
  #define TYPE_IMAGE_SQSUM double
  #else
-#define TYPE_IMAGE_SQSUM ulong
+#define TYPE_IMAGE_SQSUM float
+#endif
+
+#ifndef CN4
+#define CN4 1
+#else
+#define CN4 4
  #endif
  
  //////////////////////////////////////////////////
  // utilities
-#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx + img_sqsums_offset + ox)
+#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, (gidx + img_sqsums_offset + ox) * CN4)
  #define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox)
  // normAcc* are accurate normalization routines which make GPU matchTemplate
  // consistent with CPU one
@@ -95,7 +101,7 @@ float normAcc_SQDIFF(float num, float denum)
  __kernel
  void normalizeKernel_C1_D0
  (
-    __global const TYPE_IMAGE_SQSUM * img_sqsums,
+    __global const float * img_sqsums,
      __global float * res,
      ulong tpl_sqsum,
      int res_rows,
@@ -119,8 +125,8 @@ void normalizeKernel_C1_D0
      if(gidx < res_cols && gidy < res_rows)
      {
          float image_sqsum_ = (float)(
-            (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
          res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum));
      }
  }
@@ -152,8 +158,8 @@ void matchTemplate_Prepared_SQDIFF_C1_D0
      if(gidx < res_cols && gidy < res_rows)
      {
          float image_sqsum_ = (float)(
-            (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
          res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum;
      }
  }
@@ -161,7 +167,7 @@ void matchTemplate_Prepared_SQDIFF_C1_D0
  __kernel
  void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
  (
-    __global const TYPE_IMAGE_SQSUM * img_sqsums,
+    __global const float * img_sqsums,
      __global float * res,
      ulong tpl_sqsum,
      int res_rows,
@@ -185,10 +191,10 @@ void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
      if(gidx < res_cols && gidy < res_rows)
      {
          float image_sqsum_ = (float)(
-            (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
          res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum,
-                                        sqrt(image_sqsum_ * tpl_sqsum));
+                                      sqrt(image_sqsum_ * tpl_sqsum));
      }
  }
  
@@ -628,8 +634,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0
      if(gidx < res_cols && gidy < res_rows)
      {
          float sum = (float)(
-            (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
+                        (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
+                        - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
          res[res_idx] -= sum * tpl_sum;
      }
  }
@@ -671,17 +677,17 @@ void matchTemplate_Prepared_CCOFF_C4_D0
      {
          float ccorr = res[res_idx];
          ccorr -= tpl_sum_c0*(float)(
-            (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
+                     (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
+                     - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
          ccorr -= tpl_sum_c1*(float)(
-            (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
+                     (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
+                     - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
          ccorr -= tpl_sum_c2*(float)(
-            (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
+                     (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
+                     - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
          ccorr -= tpl_sum_c3*(float)(
-            (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
+                     (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
+                     - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
          res[res_idx] = ccorr;
      }
  }
@@ -702,7 +708,7 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
      __global const uint * img_sums,
      int img_sums_offset,
      int img_sums_step,
-    __global const TYPE_IMAGE_SQSUM * img_sqsums,
+    __global const float * img_sqsums,
      int img_sqsums_offset,
      int img_sqsums_step,
      float tpl_sum,
@@ -725,12 +731,12 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
      if(gidx < res_cols && gidy < res_rows)
      {
          float image_sum_ =  (float)(
-            (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
+                                (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
+                                - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
  
          float image_sqsum_ = (float)(
-            (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
          res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum,
                                 sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_)));
      }
@@ -754,10 +760,10 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
      __global const uint * img_sums_c3,
      int img_sums_offset,
      int img_sums_step,
-    __global const TYPE_IMAGE_SQSUM * img_sqsums_c0,
-    __global const TYPE_IMAGE_SQSUM * img_sqsums_c1,
-    __global const TYPE_IMAGE_SQSUM * img_sqsums_c2,
-    __global const TYPE_IMAGE_SQSUM * img_sqsums_c3,
+    __global const float * img_sqsums_c0,
+    __global const float * img_sqsums_c1,
+    __global const float * img_sqsums_c2,
+    __global const float * img_sqsums_c3,
      int img_sqsums_offset,
      int img_sqsums_step,
      float tpl_sum_c0,
@@ -782,42 +788,71 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
      if(gidx < res_cols && gidy < res_rows)
      {
          float image_sum_c0 =  (float)(
-            (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
+                                  (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
          float image_sum_c1 =  (float)(
-            (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
+                                  (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
          float image_sum_c2 =  (float)(
-            (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
+                                  (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
          float image_sum_c3 =  (float)(
-            (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
+                                  (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
  
          float image_sqsum_c0 = (float)(
-            (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
+                                   (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
          float image_sqsum_c1 = (float)(
-            (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
+                                   (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
          float image_sqsum_c2 = (float)(
-            (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
+                                   (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
          float image_sqsum_c3 = (float)(
-            (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));
+                                   (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));
  
          float num = res[res_idx] -
-            image_sum_c0 * tpl_sum_c0 -
-            image_sum_c1 * tpl_sum_c1 -
-            image_sum_c2 * tpl_sum_c2 -
-            image_sum_c3 * tpl_sum_c3;
+                    image_sum_c0 * tpl_sum_c0 -
+                    image_sum_c1 * tpl_sum_c1 -
+                    image_sum_c2 * tpl_sum_c2 -
+                    image_sum_c3 * tpl_sum_c3;
          float denum = sqrt( tpl_sqsum * (
-            image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
-            image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
-            image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
-            image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3)
-            );
+                                image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
+                                image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
+                                image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
+                                image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3)
+                          );
          res[res_idx] = normAcc(num, denum);
      }
  }
+
+//////////////////////////////////////////////////////////////////////
+// extractFirstChannel: writes the .x component of each float4 element of img into res
+__kernel
+void extractFirstChannel
+(
+    const __global float4* img,
+    __global float* res,
+    int rows,
+    int cols,
+    int img_offset,
+    int res_offset,
+    int img_step,
+    int res_step
+)
+{
+    img_step   /= sizeof(float4); // steps and offsets are divided by the element size so they can index typed pointers
+    res_step   /= sizeof(float);
+    img_offset /= sizeof(float4);
+    res_offset /= sizeof(float);
+    img += img_offset;
+    res += res_offset;
+    int gidx = get_global_id(0); // column index
+    int gidy = get_global_id(1); // row index
+    if(gidx < cols && gidy < rows) // skip work-items that fall outside the rows x cols area
+    {
+        res[gidx + gidy * res_step] = img[gidx + gidy * img_step].x;
+    }
+}
diff --git a/modules/ocl/test/test_match_template.cpp b/modules/ocl/test/test_match_template.cpp

index 2fc6a10..5da7f01 100644 (file)
--- a/modules/ocl/test/test_match_template.cpp
+++ b/modules/ocl/test/test_match_template.cpp
@@ -75,7 +75,7 @@ PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMetho
      }
  };
  
-TEST_P(MatchTemplate8U, DISABLED_Accuracy)
+TEST_P(MatchTemplate8U, Accuracy)
  {
  
      std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
@@ -138,18 +138,18 @@ TEST_P(MatchTemplate32F, Accuracy)
      EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss);
  }
  
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U,
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate8U,
                          testing::Combine(
                              MTEMP_SIZES,
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
                              testing::Values(Channels(1), Channels(3), Channels(4)),
                              ALL_TEMPLATE_METHODS
                          )
                         );
  
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate32F, testing::Combine(
                              MTEMP_SIZES,
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
                              testing::Values(Channels(1), Channels(3), Channels(4)),
                              testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
  #endif
author	yao <bitwangyaoyao@gmail.com>
	Tue, 26 Mar 2013 06:10:29 +0000 (14:10 +0800)
committer	yao <bitwangyaoyao@gmail.com>
	Tue, 26 Mar 2013 06:10:29 +0000 (14:10 +0800)
modules/ocl/src/match_template.cpp		patch \| blob \| history
modules/ocl/src/opencl/match_template.cl		patch \| blob \| history
modules/ocl/test/test_match_template.cpp		patch \| blob \| history