add depthwise convolution kernel
authorLi Peng <peng.li@intel.com>
Tue, 19 Dec 2017 09:59:13 +0000 (17:59 +0800)
committerLi Peng <peng.li@intel.com>
Tue, 19 Dec 2017 09:59:13 +0000 (17:59 +0800)
Signed-off-by: Li Peng <peng.li@intel.com>
modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
modules/dnn/src/opencl/conv_layer_spatial.cl

index b137896..f9a74ae 100644 (file)
@@ -215,6 +215,9 @@ class OCL4DNNConvSpatial
         bool createGEMMLikeConvKernel(int32_t blockWidth,
                                       int32_t blockHeight,
                                       int32_t blockDepth);
+        bool createDWConvKernel(int32_t blockWidth,
+                                int32_t blockHeight,
+                                int32_t blockDepth);
         void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
                              int32_t offset, int32_t size, bool write_only);
         bool convolve(const UMat &bottom, UMat &top,
@@ -282,6 +285,8 @@ class OCL4DNNConvSpatial
         int32_t M_;
 
         bool tuned_;
+        bool dwconv_;
+
         std::string key_, key_sanitized_;
         std::string short_key_;
         std::string kernel_name_;
index 6a30555..ae188f7 100644 (file)
@@ -103,6 +103,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
     top_dim_ = num_output_ * output_w_ * output_h_;
 
     cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
+    dwconv_ = (num_output_ == channels_ && channels_ == group_);
 
     use_cache_path_ = false;
     if (!cache_path_.empty())
@@ -203,7 +204,8 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
 typedef enum {
     KERNEL_TYPE_INTEL_IDLF = 2,
     KERNEL_TYPE_BASIC = 4,
-    KERNEL_TYPE_GEMM_LIKE = 5
+    KERNEL_TYPE_GEMM_LIKE = 5,
+    KERNEL_TYPE_DWCONV = 6
 } ocl4dnnConvSpatialKernelType_t;
 
 template<typename Dtype>
@@ -313,6 +315,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         if (clOptionSupport("-cl-no-subgroup-ifp"))
             options_ << " -cl-no-subgroup-ifp ";
 
+        addDef("KERNEL_GEMM_LIKE");
         addDef("INPUT_DEPTH", channels_);
         addDef("WIDTH1", M_);
         addDef("OUT_PADDING_LEFT", 0);
@@ -329,6 +332,28 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         setFusionDefine(fused_activ_, fused_eltwise_);
         src_ = ocl::dnn::conv_layer_spatial_oclsrc;
     }
+    else if (kernelType == KERNEL_TYPE_DWCONV)
+    {
+        kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
+        kernel_name_ = "DWCONV_";
+        kernel_name_ += kernelUKey.c_str();
+
+        options_ << " -cl-fast-relaxed-math ";
+        if (clOptionSupport("-cl-no-subgroup-ifp"))
+            options_ << " -cl-no-subgroup-ifp ";
+
+        addDef("KERNEL_DWCONV");
+        addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
+        addDef("KERNEL_W", kernel_w_);
+        addDef("KERNEL_H", kernel_h_);
+        addDef("APPLY_BIAS", bias_term_);
+        addDef("OUTPUT_Z", num_output_ * num_);
+        addDef("CHANNELS", num_output_);
+        setFusionDefine(fused_activ_, fused_eltwise_);
+
+        options_ << " -D DWCONV=" << kernel_name_;
+        src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
+    }
 }
 
 template<typename Dtype>
@@ -906,6 +931,33 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                 return false;
             }
         }
+    } else if (config->kernelType == KERNEL_TYPE_DWCONV) {
+        ocl::Kernel kernel(config->kernelName.c_str(), program);
+        if (kernel.empty())
+            return false;
+
+        cl_uint argIdx = 0;
+        setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+        if (bias_term_)
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
+        kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+        kernel.set(argIdx++, (uint16_t)width_);
+        kernel.set(argIdx++, (uint16_t)height_);
+        kernel.set(argIdx++, (uint16_t)output_w_);
+        kernel.set(argIdx++, (uint16_t)output_h_);
+
+        size_t global_size[3];
+        global_size[0] = output_w_;
+        global_size[1] = output_h_;
+        global_size[2] = num_output_ * num_;
+
+        if (!kernel.run(3, global_size, NULL, false))
+        {
+            std::cout << "DWCONV kernel run failed." << std::endl;
+            return false;
+        }
     } else {
         for (int32_t n = 0; n < numImages; ++n) {
             for (int32_t g = 0; g < group_; ++g) {
@@ -1223,6 +1275,39 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
 }
 
 template<>
+bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
+                                                   int32_t blockHeight,
+                                                   int32_t blockDepth)
+{
+    if (!dwconv_)
+        return false;
+
+    int workItemOutput[3] = { 1, 1, 1 };
+    size_t local_size[3] = { 1, 1, 1 };
+    size_t global_size[3];
+    global_size[0] = divUp(output_w_, workItemOutput[0]);
+    global_size[1] = divUp(output_h_, workItemOutput[1]);
+    global_size[2] = divUp(M_ * num_, workItemOutput[2]);
+
+    kernelType_ = KERNEL_TYPE_DWCONV;
+    blockM_ = blockWidth;
+    blockK_ = blockHeight;
+    blockN_ = blockDepth;
+
+    setupKernel();
+
+    ocl::Program program = compileKernel();
+    if (program.ptr())
+    {
+        kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0],
+                              &workItemOutput[0], false, KERNEL_TYPE_DWCONV));
+        return true;
+    }
+    else
+        return false;
+}
+
+template<>
 bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
                                                         int32_t blockWidth,
                                                         int32_t blockHeight,
@@ -1238,6 +1323,8 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
         return createBasicKernel(blockWidth, blockHeight, blockDepth);
     else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
         return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
+    else if (kernelType == KERNEL_TYPE_DWCONV)
+        return createDWConvKernel(blockWidth, blockHeight, blockDepth);
     else
         CV_Assert(0 && "Internal error");
     return false;
@@ -1246,7 +1333,16 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
 template<>
 void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
 {
-    if (ocl::Device::getDefault().intelSubgroupsSupport()) {
+    if (ocl::Device::getDefault().intelSubgroupsSupport())
+    {
+        //depth_wise kernels
+        if (dwconv_)
+        {
+            tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
+            if (group_ > 8)
+                return;
+        }
+
         /* IDLF kernels are using Intel specific extension which make
            them intel only. */
         // Generates static key_
index 91066bd..2457cf7 100644 (file)
@@ -383,7 +383,7 @@ convolve_simd(
   }
 }
 
-#else // KERNEL_GEMM_LIKE
+#elif defined KERNEL_GEMM_LIKE
 
 #if APPLY_BIAS
 // Dtype bias[4];
@@ -1501,4 +1501,59 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
     INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
 }
 #endif
-#endif // KERNEL_BASIC/IDLF/GEMM_LIKE
+
+#elif defined KERNEL_DWCONV
+
+__kernel void DWCONV(
+    ELTWISE_DATA_ARG
+    NEGATIVE_SLOPE_ARG
+    __global Dtype* image_data,
+    __global Dtype* kernel_data,
+    BIAS_KERNEL_ARG
+    __global Dtype* convolved_image,
+    const ushort input_width,
+    const ushort input_height,
+    const ushort output_width,
+    const ushort output_height) {
+
+  const int outputX = get_global_id(0);
+  const int outputY = get_global_id(1);
+  const int outputZ = get_global_id(2);
+  if(outputX < output_width && outputY < output_height)
+  {
+    Dtype sum = 0.;
+
+    const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
+    const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
+    const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
+    const int biasIndex=outputZ%CHANNELS;
+    const int local_image_offset = org_y*input_width + org_x;
+    const int imageSize = input_width*input_height;
+
+    __global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
+    __global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
+
+    for(int y = 0; y < KERNEL_H; y++)
+    {
+      for(int x = 0; x < KERNEL_W; x++)
+      {
+        if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
+        {
+          continue;
+        }
+        sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
+      }
+      image_dataPtrFloat += input_width * DILATION_Y;
+      kernel_dataPtrFloat += KERNEL_W;
+    }
+
+    #if APPLY_BIAS
+    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
+    ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
+    #else
+    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
+    ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
+    #endif
+  }
+}
+#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV