dnn(ocl4dnn): refactor pooling OpenCL calls

author Alexander Alekhin <alexander.a.alekhin@gmail.com>

Thu, 23 Nov 2017 20:10:53 +0000 (20:10 +0000)

committer Alexander Alekhin <alexander.a.alekhin@gmail.com>

Thu, 23 Nov 2017 20:46:44 +0000 (20:46 +0000)
author Alexander Alekhin <alexander.a.alekhin@gmail.com>
Thu, 23 Nov 2017 20:10:53 +0000 (20:10 +0000)
committer Alexander Alekhin <alexander.a.alekhin@gmail.com>
Thu, 23 Nov 2017 20:46:44 +0000 (20:46 +0000)
diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp

index 4a62546..9e202b4 100644 (file)
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -351,8 +351,6 @@ class OCL4DNNPool
                       UMat& top_data,
                       UMat& top_mask);
      private:
-        UMat mask_idx_;
-
          // Pooling parameters
          std::vector<int32_t> pad_;
          std::vector<int32_t> stride_;
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp

index fe8b84b..13434d9 100644 (file)
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
@@ -88,7 +88,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
  template<typename Dtype>
  OCL4DNNPool<Dtype>::~OCL4DNNPool()
  {
-    mask_idx_.release();
+    // nothing
  }
  
  template<typename Dtype>
@@ -99,99 +99,103 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
      bool ret = true;
      size_t global[] = { 128 * 128 };
      size_t local[] = { 128 };
-    cl_uint argIdx = 0;
  
      // support 2D case
      switch (pool_method_)
      {
      case LIBDNN_POOLING_METHOD_MAX:
          {
-            if (top_mask.empty() && mask_idx_.empty())
-            {
-                mask_idx_.create(1, count_, CV_32FC1);
-            }
-            ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+            bool haveMask = !top_mask.empty();
+            ocl::Kernel oclk_max_pool_forward(
+                haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
+                format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                       " -D STRIDE_W=%d -D STRIDE_H=%d"
+                       " -D PAD_W=%d -D PAD_H=%d%s",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_,
+                       pad_w_, pad_h_,
+                       haveMask ? " -D HAVE_MASK=1" : ""
+                ));
  
              if (oclk_max_pool_forward.empty())
                  return false;
  
-            argIdx = 0;
-            oclk_max_pool_forward.set(argIdx++, count_);
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            oclk_max_pool_forward.set(argIdx++, batch_size_);
-            oclk_max_pool_forward.set(argIdx++, channels_);
-            oclk_max_pool_forward.set(argIdx++, height_);
-            oclk_max_pool_forward.set(argIdx++, width_);
-            oclk_max_pool_forward.set(argIdx++, pooled_height_);
-            oclk_max_pool_forward.set(argIdx++, pooled_width_);
-            oclk_max_pool_forward.set(argIdx++, kernel_h_);
-            oclk_max_pool_forward.set(argIdx++, kernel_w_);
-            oclk_max_pool_forward.set(argIdx++, stride_h_);
-            oclk_max_pool_forward.set(argIdx++, stride_w_);
-            oclk_max_pool_forward.set(argIdx++, pad_h_);
-            oclk_max_pool_forward.set(argIdx++, pad_w_);
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
-            oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1);
-            if (mask_idx_.empty())
-                oclk_max_pool_forward.set(argIdx++, (void *)NULL);
-            else
-                oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_));
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask));
+            oclk_max_pool_forward.args(
+                count_,
+                ocl::KernelArg::PtrReadOnly(bottom),
+                batch_size_,
+                channels_,
+                height_,
+                width_,
+                pooled_height_,
+                pooled_width_,
+                ocl::KernelArg::PtrWriteOnly(top),
+                ocl::KernelArg::PtrWriteOnly(top_mask)
+            );
  
              ret = oclk_max_pool_forward.run(1, global, local, false);
          }
          break;
      case LIBDNN_POOLING_METHOD_AVE:
          {
+            CV_Assert(top_mask.empty());
+
              ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
+                format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                       " -D STRIDE_W=%d -D STRIDE_H=%d"
+                       " -D PAD_W=%d -D PAD_H=%d",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_,
+                       pad_w_, pad_h_
+                ));
  
              if (oclk_ave_pool_forward.empty())
                  return false;
  
-            argIdx = 0;
-            oclk_ave_pool_forward.set(argIdx++, count_);
-            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            oclk_ave_pool_forward.set(argIdx++, batch_size_);
-            oclk_ave_pool_forward.set(argIdx++, channels_);
-            oclk_ave_pool_forward.set(argIdx++, height_);
-            oclk_ave_pool_forward.set(argIdx++, width_);
-            oclk_ave_pool_forward.set(argIdx++, pooled_height_);
-            oclk_ave_pool_forward.set(argIdx++, pooled_width_);
-            oclk_ave_pool_forward.set(argIdx++, kernel_h_);
-            oclk_ave_pool_forward.set(argIdx++, kernel_w_);
-            oclk_ave_pool_forward.set(argIdx++, stride_h_);
-            oclk_ave_pool_forward.set(argIdx++, stride_w_);
-            oclk_ave_pool_forward.set(argIdx++, pad_h_);
-            oclk_ave_pool_forward.set(argIdx++, pad_w_);
-            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            oclk_ave_pool_forward.args(
+                count_,
+                ocl::KernelArg::PtrReadOnly(bottom),
+                batch_size_,
+                channels_,
+                height_,
+                width_,
+                pooled_height_,
+                pooled_width_,
+                ocl::KernelArg::PtrWriteOnly(top)
+            );
  
              ret = oclk_ave_pool_forward.run(1, global, local, false);
          }
          break;
      case LIBDNN_POOLING_METHOD_STO:
          {
+            CV_Assert(top_mask.empty());
+
              ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
+                format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                       " -D STRIDE_W=%d -D STRIDE_H=%d",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_
+                ));
+
  
              if (oclk_sto_pool_forward.empty())
                  return false;
  
-            argIdx = 0;
-            oclk_sto_pool_forward.set(argIdx++, count_);
-            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            oclk_sto_pool_forward.set(argIdx++, batch_size_);
-            oclk_sto_pool_forward.set(argIdx++, channels_);
-            oclk_sto_pool_forward.set(argIdx++, height_);
-            oclk_sto_pool_forward.set(argIdx++, width_);
-            oclk_sto_pool_forward.set(argIdx++, pooled_height_);
-            oclk_sto_pool_forward.set(argIdx++, pooled_width_);
-            oclk_sto_pool_forward.set(argIdx++, kernel_h_);
-            oclk_sto_pool_forward.set(argIdx++, kernel_w_);
-            oclk_sto_pool_forward.set(argIdx++, stride_h_);
-            oclk_sto_pool_forward.set(argIdx++, stride_w_);
-            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            oclk_sto_pool_forward.args(
+                count_,
+                ocl::KernelArg::PtrReadOnly(bottom),
+                batch_size_,
+                channels_,
+                height_,
+                width_,
+                pooled_height_,
+                pooled_width_,
+                ocl::KernelArg::PtrWriteOnly(top)
+            );
  
              ret = oclk_sto_pool_forward.run(1, global, local, false);
          }
diff --git a/modules/dnn/src/opencl/ocl4dnn_pooling.cl b/modules/dnn/src/opencl/ocl4dnn_pooling.cl

index 326d5bc..218b6b4 100644 (file)
--- a/modules/dnn/src/opencl/ocl4dnn_pooling.cl
+++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl
@@ -44,14 +44,23 @@
  #define TEMPLATE(name,type) CONCAT(name,type)
  #define Dtype float
  
-void TEMPLATE(max_pool_forward_impl, Dtype)(
+#if defined KERNEL_MAX_POOL
+
+__kernel void
+#ifdef HAVE_MASK
+    TEMPLATE(max_pool_forward_mask, Dtype)
+#else
+    TEMPLATE(max_pool_forward, Dtype)
+#endif
+(
      const int nthreads, __global const Dtype* bottom_data, const int num,
      const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w,
-    __global Dtype* top_data,
-    const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask)
+    const int pooled_height, const int pooled_width,
+    __global Dtype* top_data
+#ifdef HAVE_MASK
+    , __global Dtype* mask
+#endif
+)
  {
    for (int index = get_global_id(0); index < nthreads;
        index += get_global_size(0))
@@ -60,10 +69,10 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
      const int ph = (index / pooled_width) % pooled_height;
      const int c = (index / pooled_width / pooled_height) % channels;
      const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * stride_h - pad_h;
-    int wstart = pw * stride_w - pad_w;
-    const int hend = min(hstart + kernel_h, height);
-    const int wend = min(wstart + kernel_w, width);
+    int hstart = ph * STRIDE_H - PAD_H;
+    int wstart = pw * STRIDE_W - PAD_W;
+    const int hend = min(hstart + KERNEL_H, height);
+    const int wend = min(wstart + KERNEL_W, width);
      hstart = max(hstart, (int)0);
      wstart = max(wstart, (int)0);
      Dtype maxval = -FLT_MAX;
@@ -79,38 +88,19 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
        }
      }
      top_data[index] = maxval;
-    if (!no_mask) {
-      if (use_mask == 1) {
-        mask[index] = maxidx;
-      } else {
-        top_mask[index] = maxidx;
-      }
-    }
+#ifdef HAVE_MASK
+    mask[index] = maxidx;
+#endif
    }
  }
  
-__kernel void TEMPLATE(max_pool_forward, Dtype)(
-    const int nthreads, __global const Dtype* bottom_data, const int num,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w,
-    __global Dtype* top_data,
-    const int use_mask, __global int* mask, __global Dtype* top_mask)
-{
-    TEMPLATE(max_pool_forward_impl, Dtype)(
-      nthreads, bottom_data, num, channels, height, width,
-      pooled_height, pooled_width, kernel_h,
-      kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false
-    );
-}
+#elif defined KERNEL_AVE_POOL
  
  __kernel void TEMPLATE(ave_pool_forward, Dtype)(
      const int nthreads, __global const Dtype* const bottom_data, const int num,
      const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w, __global Dtype* top_data)
+    const int pooled_height, const int pooled_width,
+    __global Dtype* top_data)
  {
    for (int index = get_global_id(0); index < nthreads;
        index += get_global_size(0))
@@ -120,10 +110,10 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
        const int ph = (index / pooled_width) % pooled_height;
        const int c = (index / pooled_width / pooled_height) % channels;
        const int n = index / pooled_width / pooled_height / channels;
-      int hstart = ph * stride_h - pad_h;
-      int wstart = pw * stride_w - pad_w;
-      int hend = min(hstart + kernel_h, height + pad_h);
-      int wend = min(wstart + kernel_w, width + pad_w);
+      int hstart = ph * STRIDE_H - PAD_H;
+      int wstart = pw * STRIDE_W - PAD_W;
+      int hend = min(hstart + KERNEL_H, height + PAD_H);
+      int wend = min(wstart + KERNEL_W, width + PAD_W);
        const int pool_size = (hend - hstart) * (wend - wstart);
        hstart = max(hstart, (int)0);
        wstart = max(wstart, (int)0);
@@ -142,11 +132,12 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
    }
  }
  
+#elif defined KERNEL_STO_POOL
+
  __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
      const int nthreads, __global const Dtype* const bottom_data, const int num,
      const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w,
+    const int pooled_height, const int pooled_width,
      __global Dtype* top_data)
  {
    for (int index = get_global_id(0); index < nthreads;
@@ -156,10 +147,10 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
      const int ph = (index / pooled_width) % pooled_height;
      const int c = (index / pooled_width / pooled_height) % channels;
      const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
-    const int hend = min(hstart + kernel_h, height);
-    const int wstart = pw * stride_w;
-    const int wend = min(wstart + kernel_w, width);
+    const int hstart = ph * STRIDE_H;
+    const int hend = min(hstart + KERNEL_H, height);
+    const int wstart = pw * STRIDE_W;
+    const int wend = min(wstart + KERNEL_W, width);
      // We set cumsum to be 0 to avoid divide-by-zero problems
      Dtype cumsum = FLT_MIN;
      Dtype cumvalues = 0.;
@@ -168,10 +159,13 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
      // First pass: get sum
      for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
-        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
+        Dtype v = bottom_slice[h * width + w];
+        cumsum += v;
+        cumvalues += v * v;
        }
      }
      top_data[index] = cumvalues / cumsum;
    }
  }
+
+#endif // KERNEL_*
author	Alexander Alekhin <alexander.a.alekhin@gmail.com>
	Thu, 23 Nov 2017 20:10:53 +0000 (20:10 +0000)
committer	Alexander Alekhin <alexander.a.alekhin@gmail.com>
	Thu, 23 Nov 2017 20:46:44 +0000 (20:46 +0000)
modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp		patch \| blob \| history
modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp		patch \| blob \| history
modules/dnn/src/opencl/ocl4dnn_pooling.cl		patch \| blob \| history