From: Li Peng <peng.li@intel.com>
Date: Wed, 27 Jun 2018 07:15:56 +0000 (+0800)
Subject: pooling ocl kernel optimization
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1^2~608^2~6^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=145eae321efe76ebf7a9c425368b48fafefbd1da;p=platform%2Fupstream%2Fopencv.git

pooling ocl kernel optimization

set global size with real output size, also optimize

max pooling index computation if necessary.

Signed-off-by: Li Peng <peng.li@intel.com>
---

diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 6a7c9d5..50948f5 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -1446,7 +1446,7 @@ struct Net::Impl
             // TODO: OpenCL target support more fusion styles.
             if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
                  (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
-                 ld.layerInstance->type != "MVN")) )
+                 ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling")) )
                 continue;
 
             Ptr<Layer>& currLayer = ld.layerInstance;
diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp
index eab1dca..775a044 100644
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -165,6 +165,7 @@ public:
                                 (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
                                                LIBDNN_POOLING_METHOD_STO);
             config.avePoolPaddedArea = avePoolPaddedArea;
+            config.computeMaxIdx = computeMaxIdx;
             config.use_half = use_half;
             poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
         }
diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
index f3a26a3..e0ca5ca 100644
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -352,6 +352,7 @@ struct OCL4DNNPoolConfig
         pool_method(LIBDNN_POOLING_METHOD_MAX),
         global_pooling(false),
         avePoolPaddedArea(true),
+        computeMaxIdx(true),
         use_half(false)
     {}
     MatShape in_shape;
@@ -365,6 +366,7 @@ struct OCL4DNNPoolConfig
     ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
     bool global_pooling; // = false;
     bool avePoolPaddedArea;
+    bool computeMaxIdx;
     bool use_half;
 };
 
@@ -399,6 +401,7 @@ class OCL4DNNPool
         int32_t pooled_height_;
         int32_t pooled_width_;
         bool avePoolPaddedArea;
+        bool computeMaxIdx;
         bool use_half;
 };
 
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
index 81238e9..b74bf4d 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
@@ -56,6 +56,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
     channels_ = config.channels;
     pool_method_ = config.pool_method;
     avePoolPaddedArea = config.avePoolPaddedArea;
+    computeMaxIdx = config.computeMaxIdx;
     use_half = config.use_half;
 
     for (int i = 0; i < spatial_dims; ++i)
@@ -97,7 +98,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
                                  UMat& top_mask)
 {
     bool ret = true;
-    size_t global[] = { 128 * 128 };
+    size_t global[] = { (size_t)count_ };
     size_t local[] = { 128 };
 
     // support 2D case
@@ -105,8 +106,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
     {
     case LIBDNN_POOLING_METHOD_MAX:
         {
-            bool haveMask = !top_mask.empty();
-            String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward";
+            String kname = computeMaxIdx ? "max_pool_forward_mask" : "max_pool_forward";
             kname += (use_half) ? "_half" : "_float";
             ocl::Kernel oclk_max_pool_forward(
                 kname.c_str(),
@@ -118,7 +118,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
                        kernel_w_, kernel_h_,
                        stride_w_, stride_h_,
                        pad_w_, pad_h_,
-                       haveMask ? " -D HAVE_MASK=1" : ""
+                       computeMaxIdx ? " -D HAVE_MASK=1" : ""
                 ));
 
             if (oclk_max_pool_forward.empty())
diff --git a/modules/dnn/src/opencl/ocl4dnn_pooling.cl b/modules/dnn/src/opencl/ocl4dnn_pooling.cl
index e9d1d26..501f5a5 100644
--- a/modules/dnn/src/opencl/ocl4dnn_pooling.cl
+++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl
@@ -65,36 +65,40 @@ __kernel void
 #endif
 )
 {
-  for (int index = get_global_id(0); index < nthreads;
-      index += get_global_size(0))
+  int index = get_global_id(0);
+  if (index >= nthreads)
+    return;
+
+  const int pw = index % pooled_width;
+  const int xx = index / pooled_width;
+  const int ph = xx % pooled_height;
+  const int ch = xx / pooled_height;
+  int hstart = ph * STRIDE_H - PAD_H;
+  int wstart = pw * STRIDE_W - PAD_W;
+  Dtype maxval = -FLT_MAX;
+  int maxidx = -1;
+  int in_offset = ch * height * width;
+  for (int h = 0; h < KERNEL_H; ++h)
   {
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * STRIDE_H - PAD_H;
-    int wstart = pw * STRIDE_W - PAD_W;
-    const int hend = min(hstart + KERNEL_H, height);
-    const int wend = min(wstart + KERNEL_W, width);
-    hstart = max(hstart, (int)0);
-    wstart = max(wstart, (int)0);
-    Dtype maxval = -FLT_MAX;
-    int maxidx = -1;
-    __global const Dtype* bottom_slice = bottom_data
-        + (n * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        if (bottom_slice[h * width + w] > maxval) {
-          maxidx = h * width + w;
-          maxval = bottom_slice[maxidx];
+    int off_y = hstart + h;
+    if (off_y >= 0 && off_y < height)
+    {
+      for (int w = 0; w < KERNEL_W; ++w)
+      {
+        int off_x = wstart + w;
+        if (off_x >= 0 && off_x < width)
+        {
+          Dtype val = bottom_data[in_offset + off_y * width + off_x];
+          maxidx = (val > maxval) ? (off_y * width + off_x) : maxidx;
+          maxval = fmax(val, maxval);
         }
       }
     }
-    top_data[index] = maxval;
+  }
+  top_data[index] = maxval;
 #ifdef HAVE_MASK
-    mask[index] = maxidx;
+  mask[index] = maxidx;
 #endif
-  }
 }
 
 #elif defined KERNEL_AVE_POOL
@@ -105,43 +109,42 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
     const int pooled_height, const int pooled_width,
     __global Dtype* top_data)
 {
-  for (int index = get_global_id(0); index < nthreads;
-      index += get_global_size(0))
-  {
-    {
-      const int pw = index % pooled_width;
-      const int ph = (index / pooled_width) % pooled_height;
-      const int c = (index / pooled_width / pooled_height) % channels;
-      const int n = index / pooled_width / pooled_height / channels;
-      int hstart = ph * STRIDE_H - PAD_H;
-      int wstart = pw * STRIDE_W - PAD_W;
-      int hend = min(hstart + KERNEL_H, height + PAD_H);
-      int wend = min(wstart + KERNEL_W, width + PAD_W);
-      int pool_size;
+  int index = get_global_id(0);
+  if (index >= nthreads)
+    return;
+
+  const int pw = index % pooled_width;
+  const int xx = index / pooled_width;
+  const int ph = xx % pooled_height;
+  const int ch = xx / pooled_height;
+  int hstart = ph * STRIDE_H - PAD_H;
+  int wstart = pw * STRIDE_W - PAD_W;
+  int hend = min(hstart + KERNEL_H, height + PAD_H);
+  int wend = min(wstart + KERNEL_W, width + PAD_W);
+  int pool_size;
 #ifdef AVE_POOL_PADDING_AREA
-      pool_size = (hend - hstart) * (wend - wstart);
-      hstart = max(hstart, (int)0);
-      wstart = max(wstart, (int)0);
-      hend = min(hend, height);
-      wend = min(wend, width);
+  pool_size = (hend - hstart) * (wend - wstart);
+  hstart = max(hstart, (int)0);
+  wstart = max(wstart, (int)0);
+  hend = min(hend, height);
+  wend = min(wend, width);
 #else
-      hstart = max(hstart, (int)0);
-      wstart = max(wstart, (int)0);
-      hend = min(hend, height);
-      wend = min(wend, width);
-      pool_size = (hend - hstart) * (wend - wstart);
+  hstart = max(hstart, (int)0);
+  wstart = max(wstart, (int)0);
+  hend = min(hend, height);
+  wend = min(wend, width);
+  pool_size = (hend - hstart) * (wend - wstart);
 #endif
-      Dtype aveval = 0;
-      __global const Dtype* bottom_slice = bottom_data
-          + (n * channels + c) * height * width;
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          aveval += bottom_slice[h * width + w];
-        }
-      }
-      top_data[index] = aveval / pool_size;
+  Dtype aveval = 0;
+  int in_offset = ch * height * width;
+  for (int h = hstart; h < hend; ++h)
+  {
+    for (int w = wstart; w < wend; ++w)
+    {
+      aveval += bottom_data[in_offset + h * width + w];
     }
   }
+  top_data[index] = aveval / pool_size;
 }
 
 #elif defined KERNEL_STO_POOL