pooling ocl kernel optimization

author Li Peng <peng.li@intel.com>

Wed, 27 Jun 2018 07:15:56 +0000 (15:15 +0800)

committer Li Peng <peng.li@intel.com>

Fri, 29 Jun 2018 07:22:49 +0000 (15:22 +0800)
author Li Peng <peng.li@intel.com>
Wed, 27 Jun 2018 07:15:56 +0000 (15:15 +0800)
committer Li Peng <peng.li@intel.com>
Fri, 29 Jun 2018 07:22:49 +0000 (15:22 +0800)
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp

index 6a7c9d5..50948f5 100644 (file)
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -1446,7 +1446,7 @@ struct Net::Impl
              // TODO: OpenCL target support more fusion styles.
              if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
                   (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
-                 ld.layerInstance->type != "MVN")) )
+                 ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling")) )
                  continue;
  
              Ptr<Layer>& currLayer = ld.layerInstance;
diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp

index eab1dca..775a044 100644 (file)
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -165,6 +165,7 @@ public:
                                  (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
                                                 LIBDNN_POOLING_METHOD_STO);
              config.avePoolPaddedArea = avePoolPaddedArea;
+            config.computeMaxIdx = computeMaxIdx;
              config.use_half = use_half;
              poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
          }
diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp

index f3a26a3..e0ca5ca 100644 (file)
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -352,6 +352,7 @@ struct OCL4DNNPoolConfig
          pool_method(LIBDNN_POOLING_METHOD_MAX),
          global_pooling(false),
          avePoolPaddedArea(true),
+        computeMaxIdx(true),
          use_half(false)
      {}
      MatShape in_shape;
@@ -365,6 +366,7 @@ struct OCL4DNNPoolConfig
      ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
      bool global_pooling; // = false;
      bool avePoolPaddedArea;
+    bool computeMaxIdx;
      bool use_half;
  };
  
@@ -399,6 +401,7 @@ class OCL4DNNPool
          int32_t pooled_height_;
          int32_t pooled_width_;
          bool avePoolPaddedArea;
+        bool computeMaxIdx;
          bool use_half;
  };
  
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp

index 81238e9..b74bf4d 100644 (file)
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
@@ -56,6 +56,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
      channels_ = config.channels;
      pool_method_ = config.pool_method;
      avePoolPaddedArea = config.avePoolPaddedArea;
+    computeMaxIdx = config.computeMaxIdx;
      use_half = config.use_half;
  
      for (int i = 0; i < spatial_dims; ++i)
@@ -97,7 +98,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
                                   UMat& top_mask)
  {
      bool ret = true;
-    size_t global[] = { 128 * 128 };
+    size_t global[] = { (size_t)count_ };
      size_t local[] = { 128 };
  
      // support 2D case
@@ -105,8 +106,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
      {
      case LIBDNN_POOLING_METHOD_MAX:
          {
-            bool haveMask = !top_mask.empty();
-            String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward";
+            String kname = computeMaxIdx ? "max_pool_forward_mask" : "max_pool_forward";
              kname += (use_half) ? "_half" : "_float";
              ocl::Kernel oclk_max_pool_forward(
                  kname.c_str(),
@@ -118,7 +118,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
                         kernel_w_, kernel_h_,
                         stride_w_, stride_h_,
                         pad_w_, pad_h_,
-                       haveMask ? " -D HAVE_MASK=1" : ""
+                       computeMaxIdx ? " -D HAVE_MASK=1" : ""
                  ));
  
              if (oclk_max_pool_forward.empty())
diff --git a/modules/dnn/src/opencl/ocl4dnn_pooling.cl b/modules/dnn/src/opencl/ocl4dnn_pooling.cl

index e9d1d26..501f5a5 100644 (file)
--- a/modules/dnn/src/opencl/ocl4dnn_pooling.cl
+++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl
@@ -65,36 +65,40 @@ __kernel void
  #endif
  )
  {
-  for (int index = get_global_id(0); index < nthreads;
-      index += get_global_size(0))
+  int index = get_global_id(0);
+  if (index >= nthreads)
+    return;
+
+  const int pw = index % pooled_width;
+  const int xx = index / pooled_width;
+  const int ph = xx % pooled_height;
+  const int ch = xx / pooled_height;
+  int hstart = ph * STRIDE_H - PAD_H;
+  int wstart = pw * STRIDE_W - PAD_W;
+  Dtype maxval = -FLT_MAX;
+  int maxidx = -1;
+  int in_offset = ch * height * width;
+  for (int h = 0; h < KERNEL_H; ++h)
    {
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * STRIDE_H - PAD_H;
-    int wstart = pw * STRIDE_W - PAD_W;
-    const int hend = min(hstart + KERNEL_H, height);
-    const int wend = min(wstart + KERNEL_W, width);
-    hstart = max(hstart, (int)0);
-    wstart = max(wstart, (int)0);
-    Dtype maxval = -FLT_MAX;
-    int maxidx = -1;
-    __global const Dtype* bottom_slice = bottom_data
-        + (n * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        if (bottom_slice[h * width + w] > maxval) {
-          maxidx = h * width + w;
-          maxval = bottom_slice[maxidx];
+    int off_y = hstart + h;
+    if (off_y >= 0 && off_y < height)
+    {
+      for (int w = 0; w < KERNEL_W; ++w)
+      {
+        int off_x = wstart + w;
+        if (off_x >= 0 && off_x < width)
+        {
+          Dtype val = bottom_data[in_offset + off_y * width + off_x];
+          maxidx = (val > maxval) ? (off_y * width + off_x) : maxidx;
+          maxval = fmax(val, maxval);
          }
        }
      }
-    top_data[index] = maxval;
+  }
+  top_data[index] = maxval;
  #ifdef HAVE_MASK
-    mask[index] = maxidx;
+  mask[index] = maxidx;
  #endif
-  }
  }
  
  #elif defined KERNEL_AVE_POOL
@@ -105,43 +109,42 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
      const int pooled_height, const int pooled_width,
      __global Dtype* top_data)
  {
-  for (int index = get_global_id(0); index < nthreads;
-      index += get_global_size(0))
-  {
-    {
-      const int pw = index % pooled_width;
-      const int ph = (index / pooled_width) % pooled_height;
-      const int c = (index / pooled_width / pooled_height) % channels;
-      const int n = index / pooled_width / pooled_height / channels;
-      int hstart = ph * STRIDE_H - PAD_H;
-      int wstart = pw * STRIDE_W - PAD_W;
-      int hend = min(hstart + KERNEL_H, height + PAD_H);
-      int wend = min(wstart + KERNEL_W, width + PAD_W);
-      int pool_size;
+  int index = get_global_id(0);
+  if (index >= nthreads)
+    return;
+
+  const int pw = index % pooled_width;
+  const int xx = index / pooled_width;
+  const int ph = xx % pooled_height;
+  const int ch = xx / pooled_height;
+  int hstart = ph * STRIDE_H - PAD_H;
+  int wstart = pw * STRIDE_W - PAD_W;
+  int hend = min(hstart + KERNEL_H, height + PAD_H);
+  int wend = min(wstart + KERNEL_W, width + PAD_W);
+  int pool_size;
  #ifdef AVE_POOL_PADDING_AREA
-      pool_size = (hend - hstart) * (wend - wstart);
-      hstart = max(hstart, (int)0);
-      wstart = max(wstart, (int)0);
-      hend = min(hend, height);
-      wend = min(wend, width);
+  pool_size = (hend - hstart) * (wend - wstart);
+  hstart = max(hstart, (int)0);
+  wstart = max(wstart, (int)0);
+  hend = min(hend, height);
+  wend = min(wend, width);
  #else
-      hstart = max(hstart, (int)0);
-      wstart = max(wstart, (int)0);
-      hend = min(hend, height);
-      wend = min(wend, width);
-      pool_size = (hend - hstart) * (wend - wstart);
+  hstart = max(hstart, (int)0);
+  wstart = max(wstart, (int)0);
+  hend = min(hend, height);
+  wend = min(wend, width);
+  pool_size = (hend - hstart) * (wend - wstart);
  #endif
-      Dtype aveval = 0;
-      __global const Dtype* bottom_slice = bottom_data
-          + (n * channels + c) * height * width;
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          aveval += bottom_slice[h * width + w];
-        }
-      }
-      top_data[index] = aveval / pool_size;
+  Dtype aveval = 0;
+  int in_offset = ch * height * width;
+  for (int h = hstart; h < hend; ++h)
+  {
+    for (int w = wstart; w < wend; ++w)
+    {
+      aveval += bottom_data[in_offset + h * width + w];
      }
    }
+  top_data[index] = aveval / pool_size;
  }
  
  #elif defined KERNEL_STO_POOL
author	Li Peng <peng.li@intel.com>
	Wed, 27 Jun 2018 07:15:56 +0000 (15:15 +0800)
committer	Li Peng <peng.li@intel.com>
	Fri, 29 Jun 2018 07:22:49 +0000 (15:22 +0800)
modules/dnn/src/dnn.cpp		patch \| blob \| history
modules/dnn/src/layers/pooling_layer.cpp		patch \| blob \| history
modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp		patch \| blob \| history
modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp		patch \| blob \| history
modules/dnn/src/opencl/ocl4dnn_pooling.cl		patch \| blob \| history