static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32S)
{
- int compunits = ocl::Device::getDefault().maxComputeUnits();
- size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+ const ocl::Device & dev = ocl::Device::getDefault();
+ int compunits = dev.maxComputeUnits();
+ size_t wgs = dev.maxWorkGroupSize();
Size size = _src.size();
bool use16 = size.width % 16 == 0 && _src.offset() % 16 == 0 && _src.step() % 16 == 0;
+ int kercn = dev.isAMD() && use16 ? 16 : std::min(4, ocl::predictOptimalVectorWidth(_src));
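// each work-item processes kercn pixels per iteration: uchar16 loads on AMD when rows are
// 16-byte aligned, otherwise at most 4 (loaded as a single int when kercn == 4)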
ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc,
- format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D cn=%d",
- BINS, compunits, wgs, use16 ? 16 : 1));
+ format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D kercn=%d -D T=%s%s",
+ BINS, compunits, (int)wgs, kercn,
+ kercn == 4 ? "int" : ocl::typeToStr(CV_8UC(kercn)),
+ _src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
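// build options: BINS bins, one partial histogram per compute unit (HISTS_COUNT), work-group
// size WGS, the per-item vector width kercn and its load type T; HAVE_SRC_CONT lets the kernel
// skip the step/offset arithmetic for continuous sources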
if (k1.empty())
return false;
UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1),
hist = ddepth == CV_32S ? _hist.getUMat() : UMat(BINS, 1, CV_32SC1);
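// ghist holds HISTS_COUNT partial histograms of BINS ints each; hist is a temporary CV_32S
// buffer when the caller requested a different depth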
- k1.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total());
+ k1.args(ocl::KernelArg::ReadOnly(src),
+ ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total());
size_t globalsize = compunits * wgs;
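// launch exactly one work-group of WGS items per compute unit; each group fills its own partial histogram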
if (!k1.run(1, &globalsize, &wgs, false))
return false;
ocl::Kernel k2("merge_histogram", ocl::imgproc::histogram_oclsrc,
- format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, (int)wgs));
+ format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d",
+ BINS, compunits, (int)wgs));
if (k2.empty())
return false;
- k2.args(ocl::KernelArg::PtrReadOnly(ghist), ocl::KernelArg::PtrWriteOnly(hist));
+ k2.args(ocl::KernelArg::PtrReadOnly(ghist),
+ ocl::KernelArg::PtrWriteOnly(hist));
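// a single work-group of the merge kernel sums the partial histograms in ghist into the final hist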
if (!k2.run(1, &wgs, &wgs, false))
return false;

// convert the CV_32S result to the requested depth when needed
if (ddepth != CV_32S)
hist.convertTo(_hist.getUMat(), ddepth);

return true;
}
// ---- OpenCL kernel source (ocl::imgproc::histogram_oclsrc) ----
-#ifndef cn
-#define cn 1
+#ifndef kercn
+#define kercn 1
#endif
-#if cn == 16
-#define T uchar16
-#else
+#ifndef T
#define T uchar
#endif
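// fallbacks when the kernel is compiled without the host-side -D kercn / -D T options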
__kernel void calculate_histogram(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
- __global uchar * hist, int total)
+ __global uchar * histptr, int total)
{
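// histptr points at HISTS_COUNT partial histograms laid out back to back; each work-group
// fills the one selected by its group id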
int lid = get_local_id(0);
- int id = get_global_id(0) * cn;
+ int id = get_global_id(0) * kercn;
int gid = get_group_id(0);
__local int localhist[BINS];
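// cooperatively zero the per-work-group local histogram before accumulation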
+ #pragma unroll
for (int i = lid; i < BINS; i += WGS)
localhist[i] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
- for (int grain = HISTS_COUNT * WGS * cn; id < total; id += grain)
+ int src_index;
+
+ for (int grain = HISTS_COUNT * WGS * kercn; id < total; id += grain)
{
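// locate the current pixel: continuous sources can be indexed directly by id, otherwise
// the offset is rebuilt from the row stride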
- int src_index = mad24(id / src_cols, src_step, src_offset + id % src_cols);
-#if cn == 1
- atomic_inc(localhist + convert_int(src[src_index]));
+#ifdef HAVE_SRC_CONT
+ src_index = id;
#else
+ src_index = mad24(id / src_cols, src_step, src_offset + id % src_cols);
+#endif
+
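// bump the local bins; for kercn == 4 the four pixels arrive packed in one int and are
// unpacked by shifting, for vector types each component is handled separately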
+#if kercn == 1
+ atomic_inc(localhist + convert_int(src[src_index]));
+#elif kercn == 4
+ int value = *(__global const int *)(src + src_index);
+ atomic_inc(localhist + (value & 0xff));
+ atomic_inc(localhist + ((value >> 8) & 0xff));
+ atomic_inc(localhist + ((value >> 16) & 0xff));
+ atomic_inc(localhist + ((value >> 24) & 0xff));
+#elif kercn >= 2
T value = *(__global const T *)(src + src_index);
- atomic_inc(localhist + convert_int(value.s0));
- atomic_inc(localhist + convert_int(value.s1));
- atomic_inc(localhist + convert_int(value.s2));
- atomic_inc(localhist + convert_int(value.s3));
- atomic_inc(localhist + convert_int(value.s4));
- atomic_inc(localhist + convert_int(value.s5));
- atomic_inc(localhist + convert_int(value.s6));
- atomic_inc(localhist + convert_int(value.s7));
- atomic_inc(localhist + convert_int(value.s8));
- atomic_inc(localhist + convert_int(value.s9));
- atomic_inc(localhist + convert_int(value.sA));
- atomic_inc(localhist + convert_int(value.sB));
- atomic_inc(localhist + convert_int(value.sC));
- atomic_inc(localhist + convert_int(value.sD));
- atomic_inc(localhist + convert_int(value.sE));
- atomic_inc(localhist + convert_int(value.sF));
+ atomic_inc(localhist + value.s0);
+ atomic_inc(localhist + value.s1);
+#if kercn >= 4
+ atomic_inc(localhist + value.s2);
+ atomic_inc(localhist + value.s3);
+#if kercn >= 8
+ atomic_inc(localhist + value.s4);
+ atomic_inc(localhist + value.s5);
+ atomic_inc(localhist + value.s6);
+ atomic_inc(localhist + value.s7);
+#if kercn == 16
+ atomic_inc(localhist + value.s8);
+ atomic_inc(localhist + value.s9);
+ atomic_inc(localhist + value.sA);
+ atomic_inc(localhist + value.sB);
+ atomic_inc(localhist + value.sC);
+ atomic_inc(localhist + value.sD);
+ atomic_inc(localhist + value.sE);
+ atomic_inc(localhist + value.sF);
+#endif
+#endif
+#endif
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
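// publish this work-group's local histogram into its slice of the global partial-histogram buffer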
+ __global int * hist = (__global int *)(histptr + gid * BINS * (int)sizeof(int));
+ #pragma unroll
for (int i = lid; i < BINS; i += WGS)
- *(__global int *)(hist + mad24(gid, BINS * (int)sizeof(int), i * (int)sizeof(int))) = localhist[i];
+ hist[i] = localhist[i];
}
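// merge_histogram runs as a single work-group and reduces the HISTS_COUNT partial histograms
// into the final result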
__kernel void merge_histogram(__global const int * ghist, __global int * hist)
{
int lid = get_local_id(0);

#pragma unroll
for (int i = lid; i < BINS; i += WGS)
- hist[i] = 0;
+ hist[i] = ghist[i];
barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
- for (int i = 0; i < HISTS_COUNT; ++i)
+ for (int i = 1; i < HISTS_COUNT; ++i)
{
+ ghist += BINS;
#pragma unroll
for (int j = lid; j < BINS; j += WGS)
- hist[j] += ghist[mad24(i, BINS, j)];
+ hist[j] += ghist[j];
barrier(CLK_LOCAL_MEM_FENCE);
}
}