Refactoring of OpenCL implementation

author Erik Karlsson <erik.r.karlsson@gmail.com>

Fri, 6 Mar 2015 18:07:13 +0000 (19:07 +0100)

committer Erik Karlsson <erik.r.karlsson@gmail.com>

Fri, 6 Mar 2015 18:07:13 +0000 (19:07 +0100)
author Erik Karlsson <erik.r.karlsson@gmail.com>
Fri, 6 Mar 2015 18:07:13 +0000 (19:07 +0100)
committer Erik Karlsson <erik.r.karlsson@gmail.com>
Fri, 6 Mar 2015 18:07:13 +0000 (19:07 +0100)
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp

index 29899f7..30f638d 100644 (file)
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -51,7 +51,8 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
      Size src_size = _src.size();
      CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
                 src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize, false))
+               ocl_fastNlMeansDenoising(_src, _dst, &h, 1,
+                                        templateWindowSize, searchWindowSize, false))
  
      Mat src = _src.getMat();
      _dst.create(src_size, src.type());
@@ -95,7 +96,8 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
      Size src_size = _src.size();
      CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
                 src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize, true))
+               ocl_fastNlMeansDenoising(_src, _dst, &h, 1,
+                                        templateWindowSize, searchWindowSize, true))
  
      Mat src = _src.getMat();
      _dst.create(src_size, src.type());
diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp

index 2fa11a3..a06dc61 100644 (file)
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -29,7 +29,7 @@ static int divUp(int a, int b)
  }
  
  template <typename FT, typename ST, typename WT>
-static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn,
+static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT *h, int hn, int cn,
                                        int & almostTemplateWindowSizeSqBinShift, bool abs)
  {
      const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
@@ -53,24 +53,32 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
      int maxDist = abs ? std::numeric_limits<ST>::max() * cn :
          std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
      int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
-    FT den = 1.0f / (h * h * cn);
+    FT den[4];
+    CV_Assert(hn > 0 && hn <= 4);
+    for (int i=0; i<hn; i++)
+        den[i] = 1.0f / (h[i] * h[i] * cn);
  
-    almostDist2Weight.create(1, almostMaxDist, CV_32SC1);
+    almostDist2Weight.create(1, almostMaxDist, CV_32SC(hn == 3 ? 4 : hn));
  
+    char buf[40];
      ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
-                  format("-D OP_CALC_WEIGHTS -D FT=%s%s%s", ocl::typeToStr(depth),
+                  format("-D OP_CALC_WEIGHTS -D FT=%s -D w_t=%s"
+                         " -D wlut_t=%s -D convert_wlut_t=%s%s%s",
+                         ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)),
+                         ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf),
                           doubleSupport ? " -D DOUBLE_SUPPORT" : "", abs ? " -D ABS" : ""));
      if (k.empty())
          return false;
  
      k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist,
-           almostDist2ActualDistMultiplier, fixedPointMult, den, WEIGHT_THRESHOLD);
+           almostDist2ActualDistMultiplier, fixedPointMult,
+           ocl::KernelArg::Constant(den, (hn == 3 ? 4 : hn)*sizeof(FT)), WEIGHT_THRESHOLD);
  
      size_t globalsize[1] = { almostMaxDist };
      return k.run(1, globalsize, NULL, false);
  }
  
-static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
+static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float *h, int hn,
                                       int templateWindowSize, int searchWindowSize, bool abs)
  {
      int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
@@ -89,18 +97,22 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
  
      char buf[4][40];
      String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
-                         " -D pixel_t=%s -D int_t=%s"
-                         " -D weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
+                         " -D pixel_t=%s -D int_t=%s -D wlut_t=%s"
+                         " -D weight_t=%s -D convert_weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
                           " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                           " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
                           " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
                           templateWindowSize, searchWindowSize,
                           ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
-                         depth == CV_8U ? ocl::typeToStr(CV_32S) : "long",
+                         ocl::typeToStr(CV_32SC(hn)),
+                         depth == CV_8U ? ocl::typeToStr(CV_32SC(hn)) :
+                         format("long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
+                         depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) :
+                         format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
                           depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
-                         (sprintf(buf[0], "long%d", cn), buf[0]),
+                         format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
                           depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
-                         (sprintf(buf[1], "convert_long%d", cn), buf[1]),
+                         format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
                           BLOCK_COLS, BLOCK_ROWS,
                           ctaSize, templateWindowHalfWize, searchWindowHalfSize,
                           ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
@@ -115,13 +127,13 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
      if ((depth == CV_8U &&
           !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
                                                         searchWindowSize, templateWindowSize,
-                                                       h, cn,
+                                                       h, hn, cn,
                                                         almostTemplateWindowSizeSqBinShift,
                                                         abs)) ||
          (depth == CV_16U &&
           !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
                                                            searchWindowSize, templateWindowSize,
-                                                          h, cn,
+                                                          h, hn, cn,
                                                            almostTemplateWindowSizeSqBinShift,
                                                            abs)))
          return false;
diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl

index 11837a5..936aed6 100644 (file)
--- a/modules/photo/src/opencl/nlmeans.cl
+++ b/modules/photo/src/opencl/nlmeans.cl
@@ -20,9 +20,9 @@
  
  #ifdef OP_CALC_WEIGHTS
  
-__kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist,
+__kernel void calcAlmostDist2Weight(__global wlut_t * almostDist2Weight, int almostMaxDist,
                                      FT almostDist2ActualDistMultiplier, int fixedPointMult,
-                                    FT den, FT WEIGHT_THRESHOLD)
+                                    w_t den, FT WEIGHT_THRESHOLD)
  {
      int almostDist = get_global_id(0);
  
@@ -30,14 +30,13 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
      {
          FT dist = almostDist * almostDist2ActualDistMultiplier;
  #ifdef ABS
-        int weight = convert_int_sat_rte(fixedPointMult * exp(-dist*dist * den));
+        w_t w = exp((w_t)(-dist*dist) * den);
  #else
-        int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den));
+        w_t w = exp((w_t)(-dist) * den);
  #endif
-        if (weight < WEIGHT_THRESHOLD * fixedPointMult)
-            weight = 0;
-
-        almostDist2Weight[almostDist] = weight;
+        wlut_t weight = convert_wlut_t(fixedPointMult * (isnan(w) ? (w_t)1.0 : w));
+        almostDist2Weight[almostDist] =
+            weight < WEIGHT_THRESHOLD * fixedPointMult ? (wlut_t)0 : weight;
      }
  }
  
@@ -208,14 +207,14 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
  }
  
  inline void convolveWindow(__global const uchar * src, int src_step, int src_offset,
-                           __local int * dists, __global const int * almostDist2Weight,
+                           __local int * dists, __global const wlut_t * almostDist2Weight,
                             __global uchar * dst, int dst_step, int dst_offset,
                             int y, int x, int id, __local weight_t * weights_local,
                             __local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
  {
      int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
-    weight_t weights = 0;
-    sum_t weighted_sum = (sum_t)(0);
+    weight_t weights = (weight_t)0;
+    sum_t weighted_sum = (sum_t)0;
  
      for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
      {
@@ -223,10 +222,10 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
          sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));
  
          int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
-        int weight = almostDist2Weight[almostAvgDist];
+        weight_t weight = convert_weight_t(almostDist2Weight[almostAvgDist]);
  
-        weights += (weight_t)weight;
-        weighted_sum += (sum_t)(weight) * src_value;
+        weights += weight;
+        weighted_sum += (sum_t)weight * src_value;
      }
  
      weights_local[id] = weights;
@@ -251,13 +250,13 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
              weighted_sum_local[2] + weighted_sum_local[3];
          weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
  
-        *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)(weights_local_0));
+        *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)weights_local_0);
      }
  }
  
  __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,
                                     __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                                   __global const int * almostDist2Weight, __global uchar * buffer,
+                                   __global const wlut_t * almostDist2Weight, __global uchar * buffer,
                                     int almostTemplateWindowSizeSqBinShift)
  {
      int block_x = get_group_id(0), nblocks_x = get_num_groups(0);
author	Erik Karlsson <erik.r.karlsson@gmail.com>
	Fri, 6 Mar 2015 18:07:13 +0000 (19:07 +0100)
committer	Erik Karlsson <erik.r.karlsson@gmail.com>
	Fri, 6 Mar 2015 18:07:13 +0000 (19:07 +0100)
modules/photo/src/denoising.cpp		patch \| blob \| history
modules/photo/src/fast_nlmeans_denoising_opencl.hpp		patch \| blob \| history
modules/photo/src/opencl/nlmeans.cl		patch \| blob \| history