enable Concat layer fusion for OCL target
author     Li Peng <peng.li@intel.com>
           Thu, 12 Jul 2018 07:16:32 +0000 (15:16 +0800)
committer  Li Peng <peng.li@intel.com>
           Tue, 17 Jul 2018 04:46:16 +0000 (12:46 +0800)
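
Fuse Concat layers on the OpenCL target when every input of the concat comes
from a Convolution layer. Instead of copying each convolution result into the
concatenated blob, the fused inputs' OpenCL backend wrappers are updated so
that each convolution writes directly into its channel slice of the shared
concat output. Because those slices are UMat views with a non-zero offset into
the parent buffer, ocl4dnn now accounts for the UMat offset when creating
sub-buffers and passes an explicit element offset to every convolution kernel,
which rebases its output pointer on entry.
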
Signed-off-by: Li Peng <peng.li@intel.com>
modules/dnn/src/dnn.cpp
modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
modules/dnn/src/opencl/conv_layer_spatial.cl

diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 011631f..db90593 100644
@@ -1492,7 +1492,8 @@ struct Net::Impl
             // TODO: OpenCL target support more fusion styles.
             if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
                  (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
-                 ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling")) )
+                 ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
+                 ld.layerInstance->type != "Concat")) )
                 continue;
 
             Ptr<Layer>& currLayer = ld.layerInstance;
@@ -1701,6 +1702,31 @@ struct Net::Impl
                 ld.outputBlobs.size() == 1 )
             {
                 Mat& output = ld.outputBlobs[0];
+                UMat umat_output;
+                if (!ld.outputBlobsWrappers.empty() &&
+                    (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
+                {
+                    size_t i, ninputs = ld.inputBlobsId.size();
+                    bool conv_layer = true;
+                    for( i = 0; i < ninputs; i++ )
+                    {
+                        LayerPin pin = ld.inputBlobsId[i];
+                        LayerData* inp_i_data = &layers[pin.lid];
+                        while(inp_i_data->skip &&
+                              inp_i_data->inputBlobsId.size() == 1 &&
+                              inp_i_data->consumers.size() == 1)
+                        {
+                            pin = inp_i_data->inputBlobsId[0];
+                            inp_i_data = &layers[pin.lid];
+                        }
+                        conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
+                    }
+                    if (!conv_layer)
+                        continue;
+                    std::vector<UMat> umat_outputBlobs;
+                    umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
+                    umat_output = umat_outputBlobs[0];
+                }
 
                 // TODO: in general, this optimization can always be done, but
                 // many layers currently check that the input/output blobs are
@@ -1737,6 +1763,14 @@ struct Net::Impl
                         // Allocate new memory to prevent collisions during memory
                         // reusing (see https://github.com/opencv/opencv/pull/10456).
                         output = output.clone();
+                        if (preferableBackend == DNN_BACKEND_OPENCV &&
+                            IS_DNN_OPENCL_TARGET(preferableTarget))
+                        {
+                            std::vector<UMat> umats(1);
+                            umat_output = umat_output.clone();
+                            umats[0] = umat_output;
+                            OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
+                        }
                         Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
                         int ofs = 0;
                         for( i = 0; i < ninputs; i++ )
@@ -1753,6 +1787,12 @@ struct Net::Impl
                             CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
                             Mat* oldPtr = &curr_output;
                             curr_output = output_slice;
+                            if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
+                            {
+                                std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
+                                umats[pin.oid] = umat_output(chrange);
+                                OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
+                            }
                             // Layers that refer old input Mat will refer to the
                             // new data but the same Mat object.
                             CV_Assert(curr_output.data == output_slice.data, oldPtr == &curr_output);
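
For review context, here is a minimal sketch (hypothetical shapes, not part of
the patch) of the property the hunks above rely on: a channel-range view of a
UMat shares the parent allocation and differs only in its offset field, and
that byte offset, divided by the element size, is the value the modified
kernels receive.

    // Minimal sketch (hypothetical shapes): a channel-range view of a UMat
    // shares the parent buffer; only UMat::offset differs.
    #include <opencv2/core.hpp>
    #include <iostream>

    int main()
    {
        // Concat output 1x8x4x4; imagine two 1x4x4x4 convolution outputs fused into it.
        int sz[] = {1, 8, 4, 4};
        cv::UMat output(4, sz, CV_32F, cv::Scalar(0));

        cv::Range chrange[] = { cv::Range::all(), cv::Range(4, 8),
                                cv::Range::all(), cv::Range::all() };
        cv::UMat slice = output(chrange);  // the second input's slice

        std::cout << "byte offset: " << slice.offset
                  << ", element offset: " << slice.offset / sizeof(float) << std::endl;
        return 0;
    }
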
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index a446f3b..c889c7d 100644
@@ -821,7 +821,7 @@ void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu
     cl_int err;
     size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
 
-    region.origin = offset * element_size;
+    region.origin = offset * element_size + buffer.offset;
     region.size = size * element_size;
     sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
                                 write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
@@ -853,6 +853,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
         return false;
 
     int32_t bias_offset;
+    int32_t element_size = use_half_ ? sizeof(short) : sizeof(float);
 
     if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
         if (!swizzleWeight(weight, config->workItem_output[2], false))
@@ -931,10 +932,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                     return false;
 
                 kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
+                kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
             }
             else
             {
                 kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+                kernel.set(argIdx++, (int)(top.offset / element_size));
             }
 
             kernel.set(argIdx++, (uint16_t)width_);
@@ -1024,10 +1027,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                     return false;
 
                 kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
+                kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
             }
             else
             {
                 kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+                kernel.set(argIdx++, (int)(top.offset / element_size));
             }
 
             kernel.set(argIdx++, (uint16_t)width_);
@@ -1079,6 +1084,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
         if (bias_term_)
             kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
         kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+        kernel.set(argIdx++, (int)(top.offset / element_size));
         kernel.set(argIdx++, (uint16_t)width_);
         kernel.set(argIdx++, (uint16_t)height_);
         kernel.set(argIdx++, (uint16_t)output_w_);
@@ -1126,6 +1132,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                     kernel.set(argIdx++, (void *)NULL);
                 kernel.set(argIdx++, bias_offset);
                 kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+                kernel.set(argIdx++, (int)(top.offset / element_size));
                 kernel.set(argIdx++, output_image_offset);
                 kernel.set(argIdx++, (uint16_t)width_);
                 kernel.set(argIdx++, (uint16_t)height_);
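
A hedged, standalone sketch of the sub-buffer computation fixed above;
createSubRegion is a hypothetical helper mirroring CreateSubBuffer. Once top
can be an offset view into a fused concat buffer, the cl_buffer_region origin
must include the parent UMat's own byte offset:

    // Hedged sketch; createSubRegion is a made-up name for illustration.
    #include <CL/cl.h>
    #include <cstddef>

    cl_mem createSubRegion(cl_mem parent, size_t parentByteOffset,
                           size_t elemOffset, size_t elemCount,
                           size_t elementSize, bool writeOnly, cl_int* err)
    {
        cl_buffer_region region;
        // The fix: without parentByteOffset, a sliced (fused) output would
        // alias the wrong part of the shared concat buffer.
        region.origin = elemOffset * elementSize + parentByteOffset;
        region.size   = elemCount * elementSize;
        return clCreateSubBuffer(parent,
                                 writeOnly ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
                                 CL_BUFFER_CREATE_TYPE_REGION, &region, err);
    }

Note that clCreateSubBuffer also requires region.origin to be aligned to the
device's CL_DEVICE_MEM_BASE_ADDR_ALIGN, which may be why the kernels
additionally take a plain element offset for outputs a sub-buffer cannot
express.
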
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index 2cc161d..adeb385 100644
@@ -136,7 +136,8 @@ __kernel void ConvolveBasic(
     int kernel_offset,
     __global Dtype* bias,
     const int bias_offset,
-    __global Dtype* convolved_image,
+    __global Dtype* convolved_image_base,
+    const int convolved_image_base_offset,
     const int convolved_image_offset,
     const ushort input_width,
     const ushort input_height,
@@ -146,6 +147,7 @@ __kernel void ConvolveBasic(
     const ushort pad_h
 )
 {
+    __global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset;
     const int outputX = get_global_id(0);
     const int outputY = get_global_id(1);
     const int kernelNum = get_global_id(2) * ZPAR;
@@ -220,12 +222,14 @@ convolve_simd(
     __global Dtype* inputs,
     __global Dtype* weights,
     BIAS_KERNEL_ARG
-    __global Dtype* outputs,
+    __global Dtype* outputs_base,
+    const int outputs_offset,
     const ushort input_width,
     const ushort input_height,
     const ushort output_width,
     const ushort output_height)
 {
+  __global Dtype* outputs = outputs_base + outputs_offset;
   unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
   unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
   unsigned int fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth
@@ -395,7 +399,8 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
     const __global Dtype *src0,   \
     const __global Dtype *src1,   \
     BIAS_KERNEL_ARG               \
-    __global Dtype *dst,          \
+    __global Dtype *dst_base,     \
+    const int dst_offset,         \
     const ushort input_width,     \
     const ushort input_height,    \
     const ushort output_width,    \
@@ -425,6 +430,7 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
 __attribute__((intel_reqd_sub_group_size(8)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    __global Dtype *dst = dst_base + dst_offset;
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -813,6 +819,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(8)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    __global Dtype *dst = dst_base + dst_offset;
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -1374,6 +1381,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(16)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    __global Dtype *dst = dst_base + dst_offset;
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -1559,6 +1567,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(16)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    __global Dtype *dst = dst_base + dst_offset;
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -1770,12 +1779,13 @@ __kernel void DWCONV(
     __global Dtype* image_data,
     __global Dtype* kernel_data,
     BIAS_KERNEL_ARG
-    __global Dtype* convolved_image,
+    __global Dtype* convolved_image_base,
+    const int convolved_image_offset,
     const ushort input_width,
     const ushort input_height,
     const ushort output_width,
     const ushort output_height) {
-
+  __global Dtype* convolved_image = convolved_image_base + convolved_image_offset;
   const int outputX = get_global_id(0);
   const int outputY = get_global_id(1);
   const int outputZ = get_global_id(2);
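
All kernel changes above follow one convention: each output argument becomes a
base pointer plus an element offset, rebased at kernel entry. Below is a
self-contained sketch of that host/kernel handshake using OpenCV's ocl
wrappers; the kernel name and sizes are invented for illustration.
KernelArg::PtrWriteOnly passes only the raw cl_mem handle, which is why the
offset has to travel as a separate argument.

    // Hedged sketch (invented kernel, toy sizes) of the base-pointer +
    // element-offset convention used by the kernels in this patch.
    #include <opencv2/core.hpp>
    #include <opencv2/core/ocl.hpp>

    static const char* kSrc =
    "__kernel void fill_from(__global float* dst_base, const int dst_offset,\n"
    "                        const float value)\n"
    "{\n"
    "    __global float* dst = dst_base + dst_offset;  // rebase, as in the patch\n"
    "    dst[get_global_id(0)] = value;\n"
    "}\n";

    int main()
    {
        if (!cv::ocl::useOpenCL())
            return 0;                              // no OpenCL device available

        cv::UMat buf(1, 16, CV_32F, cv::Scalar(0));
        cv::UMat slice = buf.colRange(8, 16);      // offset view, like a fused concat slice

        cv::ocl::Kernel k("fill_from", cv::ocl::ProgramSource(kSrc));
        if (k.empty())
            return 1;

        int argIdx = 0;
        k.set(argIdx++, cv::ocl::KernelArg::PtrWriteOnly(slice)); // raw cl_mem only
        k.set(argIdx++, (int)(slice.offset / sizeof(float)));     // element offset
        k.set(argIdx++, 1.0f);

        size_t globalSize = 8;
        return k.run(1, &globalSize, NULL, true) ? 0 : 1;
    }
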