Merge pull request #17885 from alalek:dnn_ocl_slice_update

author Alexander Alekhin <alexander.a.alekhin@gmail.com>

Mon, 3 Aug 2020 14:13:34 +0000 (17:13 +0300)

committer GitHub <noreply@github.com>

Mon, 3 Aug 2020 14:13:34 +0000 (14:13 +0000)
author Alexander Alekhin <alexander.a.alekhin@gmail.com>
Mon, 3 Aug 2020 14:13:34 +0000 (17:13 +0300)
committer GitHub <noreply@github.com>
Mon, 3 Aug 2020 14:13:34 +0000 (14:13 +0000)
diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp

index d7d541474e56c754ed972d0574e9acaa4e1479f3..9994677cb539331a2cddc63eb95d3cca787b5bd5 100644 (file)
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -160,6 +160,10 @@ public:
  
      void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
      {
+#ifdef HAVE_OPENCL
+        ocl_exec_cache.clear();
+#endif
+
          std::vector<Mat> inputs, outputs;
          inputs_arr.getMatVector(inputs);
          outputs_arr.getMatVector(outputs);
@@ -214,26 +218,33 @@ public:
      }
  
  #ifdef HAVE_OPENCL
-    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+    struct OpenCLExecInfo
      {
-        std::vector<UMat> inputs;
-        std::vector<UMat> outputs;
+        std::string kernel_name;
+        std::string build_opts;
+        size_t local_size[2];
+        size_t global_size[2];
  
-        inputs_.getUMatVector(inputs);
-        outputs_.getUMatVector(outputs);
+        OpenCLExecInfo()
+        {
+            local_size[0] = local_size[1] = 0;
+            global_size[0] = global_size[1] = 0;
+        }
+    };
+    std::vector<OpenCLExecInfo> ocl_exec_cache;
+
+    void ocl_prepare(const std::vector<UMat>& inputs, const std::vector<UMat>& outputs)
+    {
+        CV_TRACE_FUNCTION();
  
          CV_Assert(outputs.size() == finalSliceRanges.size());
+        ocl_exec_cache.resize(outputs.size());
  
          const UMat& input = inputs[0];
-        if (input.dims > 5)
-        {
-            CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << ". Fallback to CPU");
-            return false;
-        }
+        const int dims = input.dims;
  
          size_t WSZ = 128;
  
-        const int dims = input.dims;
          const int elemSize = (int)input.elemSize();
          String opts0 = cv::format(
                  "-DDIMS=%d -DELEMSIZE=%d",
@@ -243,10 +254,11 @@ public:
          {
              opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]);
          }
-        String kname = cv::format("slice_%d", dims);
          for (size_t i = 0; i < outputs.size(); i++)
          {
-            UMat& output = outputs[i];
+            OpenCLExecInfo& ocl = ocl_exec_cache[i];
+
+            const UMat& output = outputs[i];
              const std::vector<Range>& range = finalSliceRanges[i];
  
              String opts = opts0;
@@ -262,6 +274,8 @@ public:
                  CV_CheckEQ(range[d].size(), (int)output.size[d], "");
              }
  
+            const size_t param_LIMIT_BLOCK_SIZE_PER_WG = WSZ * 64;
+
              int block_dims = 0;
              size_t block_size = elemSize;
              for (int i = dims - 1; i >= 0; --i)
@@ -270,12 +284,14 @@ public:
                      break;
                  block_size *= output.size[i];
                  block_dims++;
+                if (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG)
+                    break;
              }
  
              const size_t total = output.total() * elemSize;
              size_t num_blocks = total / block_size;
  
-            if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64))
+            if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG))
              {
                  // use 1D copy mode
                  opts += cv::format(" -DUSE_COPY_1D=1");
@@ -345,23 +361,98 @@ public:
  
              opts += cv::format(" -DWSZ=%d", (int)WSZ);
  
-            size_t local[] = { WSZ, 1 };
-            size_t global[] = { WSZ, num_blocks };
+            std::ostringstream kernel_suffix;
+            kernel_suffix << dims << 'x' << elemSize << "_bsz" << block_size;
+            kernel_suffix << "__src_";
+            for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << input.size[dims - 1 - d] << '_';
+            }
+            kernel_suffix << '_';
+            /*for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << input.step[dims - 1 - d] << '_';
+            }
+            kernel_suffix << '_';*/
  
-            ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts);
+            kernel_suffix << "dst_";
+            for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << output.size[dims - 1 - d] << '_';
+            }
+            /*kernel_suffix << '_';
+            for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << output.step[dims - 1 - d] << '_';
+            }*/
+            kernel_suffix << "_slice_";
+            for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << range[dims - 1 - d].start << '_';
+            }
+            for (int d = 0; d < dims; d++)
+            {
+                kernel_suffix << '_' << range[dims - 1 - d].end;
+            }
+
+            std::string kernel_suffix_str = kernel_suffix.str();
+            opts += cv::format(" -DSLICE_KERNEL_SUFFIX=%s", kernel_suffix_str.c_str());
+
+            ocl.kernel_name = cv::format("slice_%s", kernel_suffix_str.c_str());
+            ocl.build_opts = opts;
+            ocl.local_size[0] = WSZ;
+            ocl.local_size[1] = 1;
+            ocl.global_size[0] = WSZ;
+            ocl.global_size[1] = num_blocks;
+        }  // for outputs.size()
+    }  // ocl_prepare
+
+    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+    {
+        CV_TRACE_FUNCTION();
+
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inputs_.getUMatVector(inputs);
+        outputs_.getUMatVector(outputs);
+
+        CV_Assert(outputs.size() == finalSliceRanges.size());
+
+        const UMat& input = inputs[0];
+        const int dims = input.dims;
+        if (dims > 5)
+        {
+            CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << dims << ". Fallback to CPU");
+            return false;
+        }
+
+        if (ocl_exec_cache.empty())
+        {
+            ocl_prepare(inputs, outputs);
+        }
+        CV_CheckEQ(ocl_exec_cache.size(), outputs.size(), "");
+
+        for (size_t i = 0; i < outputs.size(); i++)
+        {
+            const OpenCLExecInfo& ocl = ocl_exec_cache[i];
+
+            UMat& output = outputs[i];
+
+            ocl::Kernel kernel(ocl.kernel_name.c_str(), ocl::dnn::slice_oclsrc, ocl.build_opts);
              if (kernel.empty())
                  return false;
              bool ret = kernel.args(
                      ocl::KernelArg::PtrReadOnly(input),
                      ocl::KernelArg::PtrWriteOnly(output)
                  )
-                .run(2, global, local, false);
+                .run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
              if (!ret)
                  return false;
          }  // for outputs.size()
  
          return true;
-        }
+    }  // forward_ocl
  #endif
  
      void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
diff --git a/modules/dnn/src/opencl/slice.cl b/modules/dnn/src/opencl/slice.cl

index d468dbc16a499224628b0cef139ee37342091553..f32d66a9ca7da2be970d22752914e9f4d5187c74 100644 (file)
--- a/modules/dnn/src/opencl/slice.cl
+++ b/modules/dnn/src/opencl/slice.cl
@@ -48,19 +48,85 @@ global: <WSZ, number_of_copy_blocks, 1>
  #define BLOCK_COLS_X4 (BLOCK_COLS / 4)
  #define BLOCK_COLS_X16 (BLOCK_COLS / 16)
  
-#ifdef USE_COPY_1D
-
-static inline
-__attribute__((always_inline))
-void copy_block_1d(
+__attribute__((reqd_work_group_size(WSZ, 1, 1)))
+__kernel void
+CONCAT(slice_, SLICE_KERNEL_SUFFIX)(
      __global const uchar* src0,
-    const uint src_offset,
-    __global uchar* dst0,
-    const uint dst_offset
+    __global uchar* dst0
  )
  {
-    __global const uchar* src = src0 + src_offset;
-    __global uchar* dst = dst0 + dst_offset;
+    uint block_id = get_global_id(1);
+    uint dst_offset0 = block_id * BLOCK_SIZE;
+    uint src_offset0 = 0;
+
+    {  // calculate src_offset0
+
+#define CALC_SRC_INDEX(dim) \
+    { \
+    uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \
+    CONCAT(idx_, dim) = block_id / plane_sz; \
+    block_id = block_id - CONCAT(idx_, dim) * plane_sz; \
+    }
+#define UPDATE_SRC_OFFSET(dim) \
+    src_offset0 = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset0);
+/*
+    if (get_global_id(0) == 0 && get_global_id(1) == 0) \
+        printf("(%d, %d): @%d src_offset0=%d   idx_dim=%d   block_id=%d\n", \
+            get_global_id(0), get_global_id(1), \
+            dim, src_offset0, CONCAT(idx_, dim), block_id \
+        );
+*/
+
+#if DIMS > 5
+#error "invalid configuration"
+#endif
+#if DIMS > 4
+    uint idx_4 = 0;
+#if BLOCK_DIMS <= 4
+    CALC_SRC_INDEX(4)
+#endif
+    UPDATE_SRC_OFFSET(4)
+#endif
+#if DIMS > 3
+    uint idx_3 = 0;
+#if BLOCK_DIMS <= 3
+    CALC_SRC_INDEX(3)
+#endif
+    UPDATE_SRC_OFFSET(3)
+#endif
+#if DIMS > 2
+    uint idx_2 = 0;
+#if BLOCK_DIMS <= 2
+    CALC_SRC_INDEX(2)
+#endif
+    UPDATE_SRC_OFFSET(2)
+#endif
+#if DIMS > 1
+    uint idx_1 = 0;
+#if BLOCK_DIMS <= 1
+    CALC_SRC_INDEX(1)
+#endif
+    UPDATE_SRC_OFFSET(1)
+#endif
+#if DIMS > 0
+    uint idx_0 = 0;
+    UPDATE_SRC_OFFSET(0)
+#endif
+
+/*
+    if (get_global_id(0) == 0)
+        printf("(%d, %d): src_offset0=%d dst_offset0=%d\n",
+            get_global_id(0), get_global_id(1),
+            src_offset0, dst_offset0
+        );
+*/
+
+    }  // calculate src_offset0
+
+#ifdef USE_COPY_1D
+    {  // copy_block_1d
+    __global const uchar* src = src0 + src_offset0;
+    __global uchar* dst = dst0 + dst_offset0;
  
      uint processed = 0;
  
@@ -70,8 +136,9 @@ void copy_block_1d(
          uint i = get_local_id(0) * 16;  // uchar16
          while (i < BLOCK_COLS_X16 * 16)
          {
-            uint4 idx = (uint4)(i, i + 16 * WSZ, i + 32 * WSZ, i + 48 * WSZ);
-            idx = select((uint4)i, idx, idx < (BLOCK_COLS_X16 * 16));
+            uint4 idx0 = (uint4)i;
+            uint4 idx = idx0 + (uint4)(0, 16 * WSZ, 32 * WSZ, 48 * WSZ);
+            idx = select(idx0, idx, idx < (BLOCK_COLS_X16 * 16));
  
              uchar16 a0 = vload16(0, src + idx.s0);
              uchar16 a1 = vload16(0, src + idx.s1);
@@ -97,8 +164,9 @@ void copy_block_1d(
          uint i = get_local_id(0) * 4 + processed;  // uchar4
          while (i < BLOCK_COLS_X4 * 4)
          {
-            uint4 idx = (uint4)(i, i + 4 * WSZ, i + 8 * WSZ, i + 12 * WSZ);
-            idx = select((uint4)i, idx, idx < (BLOCK_COLS_X4 * 4));
+            uint4 idx0 = (uint4)i;
+            uint4 idx = idx0 + (uint4)(0, 4 * WSZ, 8 * WSZ, 12 * WSZ);
+            idx = select(idx0, idx, idx < (BLOCK_COLS_X4 * 4));
  
              uchar4 a0 = vload4(0, src + idx.s0);
              uchar4 a1 = vload4(0, src + idx.s1);
@@ -130,19 +198,11 @@ void copy_block_1d(
          }
      }
  #endif
-}
+    }  // copy_block_1d
  
-#else  // USE_COPY_1D
+#else
  
-static inline
-__attribute__((always_inline))
-void copy_block_2d(
-    __global const uchar* src0,
-    const uint src_offset0,
-    __global uchar* dst0,
-    const uint dst_offset0
-)
-{
+    {  // copy_block_2d
      __global const uchar* src = src0 + src_offset0;
      __global uchar* dst = dst0 + dst_offset0;
  
@@ -199,85 +259,6 @@ void copy_block_2d(
  #endif  // BLOCK_COLS_FILL_X4 != BLOCK_COLS
          i += WSZ * 4;
      }
-}
-
-#endif  // USE_COPY_1D
-
-__kernel void
-CONCAT(slice_, DIMS)(
-    __global const uchar* src,
-    __global uchar* dst
-)
-{
-    uint block_id = get_global_id(1);
-
-    uint dst_offset = block_id * BLOCK_SIZE;
-
-    uint src_offset = 0;
-
-#define CALC_SRC_INDEX(dim) \
-    { \
-    uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \
-    CONCAT(idx_, dim) = block_id / plane_sz; \
-    block_id = block_id - CONCAT(idx_, dim) * plane_sz; \
-    }
-#define UPDATE_SRC_OFFSET(dim) \
-    src_offset = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset);
-/*
-    if (get_global_id(0) == 0 && get_global_id(1) == 0) \
-        printf("(%d, %d): @%d src_offset=%d   idx_dim=%d   block_id=%d\n", \
-            get_global_id(0), get_global_id(1), \
-            dim, src_offset, CONCAT(idx_, dim), block_id \
-        );
-*/
-
-#if DIMS > 5
-#error "invalid configuration"
-#endif
-#if DIMS > 4
-    uint idx_4 = 0;
-#if BLOCK_DIMS <= 4
-    CALC_SRC_INDEX(4)
-#endif
-    UPDATE_SRC_OFFSET(4)
-#endif
-#if DIMS > 3
-    uint idx_3 = 0;
-#if BLOCK_DIMS <= 3
-    CALC_SRC_INDEX(3)
-#endif
-    UPDATE_SRC_OFFSET(3)
-#endif
-#if DIMS > 2
-    uint idx_2 = 0;
-#if BLOCK_DIMS <= 2
-    CALC_SRC_INDEX(2)
-#endif
-    UPDATE_SRC_OFFSET(2)
-#endif
-#if DIMS > 1
-    uint idx_1 = 0;
-#if BLOCK_DIMS <= 1
-    CALC_SRC_INDEX(1)
-#endif
-    UPDATE_SRC_OFFSET(1)
-#endif
-#if DIMS > 0
-    uint idx_0 = 0;
-    UPDATE_SRC_OFFSET(0)
-#endif
-
-/*
-    if (get_global_id(0) == 0)
-        printf("(%d, %d): src_offset=%d dst_offset=%d\n",
-            get_global_id(0), get_global_id(1),
-            src_offset, dst_offset
-        );
-*/
-
-#ifdef USE_COPY_1D
-    copy_block_1d(src, src_offset, dst, dst_offset);
-#else
-    copy_block_2d(src, src_offset, dst, dst_offset);
+    }  // copy_block_2d
  #endif
  }
author	Alexander Alekhin <alexander.a.alekhin@gmail.com>
	Mon, 3 Aug 2020 14:13:34 +0000 (17:13 +0300)
committer	GitHub <noreply@github.com>
	Mon, 3 Aug 2020 14:13:34 +0000 (14:13 +0000)
modules/dnn/src/layers/slice_layer.cpp		patch \| blob \| history
modules/dnn/src/opencl/slice.cl		patch \| blob \| history