[IE CLDNN] Grouped convolution kernel improvement (#2063)

author Mikołaj Życzyński <mikolaj.zyczynski@intel.com>

Mon, 7 Sep 2020 08:52:06 +0000 (10:52 +0200)

committer GitHub <noreply@github.com>

Mon, 7 Sep 2020 08:52:06 +0000 (11:52 +0300)
author Mikołaj Życzyński <mikolaj.zyczynski@intel.com>
Mon, 7 Sep 2020 08:52:06 +0000 (10:52 +0200)
committer GitHub <noreply@github.com>
Mon, 7 Sep 2020 08:52:06 +0000 (11:52 +0300)
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp

index dd193c2..bb4158f 100644 (file)
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp
@@ -124,6 +124,7 @@ JitConstants ConvolutionKernel_imad::GetJitConstants(const convolution_params& p
          MakeJitConstant("OWPAD", output.X().pad.Total()),
          MakeJitConstant("OHPAD", output.Y().pad.Total()),
          MakeJitConstant("SIMD_SIZE", SIMD_SIZE),
+        MakeJitConstant("FSV", in_fsv),
      });
  
      if (params.filterSize.x != 3 || params.filterSize.y != 3) {
@@ -193,7 +194,8 @@ bool ConvolutionKernel_imad::Validate(const Params& params, const optional_param
      }
  
      auto& newParams = static_cast<const convolution_params&>(params);
-    if (newParams.groups > 1 && newParams.weights.IFM().v % 4 != 0)
+    if (newParams.groups > 1 && newParams.weights.IFM().v % 4 != 0 &&
+        newParams.inputs[0].GetLayout() != DataLayout::b_fs_yx_fsv16)
          return false;
  
      size_t min_block_size_x = (newParams.weights.X().v - 1) * newParams.dilation.x + 1;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl

index f470f17..b4f39ea 100644 (file)
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl
@@ -98,6 +98,10 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
      int w[NUM_FILTERS];
      int in_addr;
  
+#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
+    int in_start_addr = INPUT0_GET_INDEX(batch, 0, input_y, input_x + sglid);
+#endif
+
  #ifdef BLOCK_LOAD_WEIGHTS
      int weight_addr = (ofmg * CEIL_DIV(FILTER_IFM_NUM, PACK) * FILTER_SIZE_Y * FILTER_SIZE_X * SIMD_SIZE) + (g * FILTER_GROUPS_PITCH / 4);
  #else
@@ -110,7 +114,11 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
      for(int kd = 0; kd < CEIL_DIV(FILTER_IFM_NUM, PACK); kd++)
      {
  #if INPUT0_LAYOUT_B_FS_YX_FSV16
+    #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
+        int feature_location = kd * PACK + g * FILTER_IFM_NUM;
+    #else
          in_addr = INPUT0_GET_INDEX(batch, (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK, input_y, input_x + sglid);
+    #endif
  #else
      #ifdef BLOCK_LOAD_INPUTS
          in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x;
@@ -119,10 +127,20 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
      #endif
          in_addr += batch * input_size;  // adjust for batching
  #endif
+
          for(uint reg = 0; reg < IN_BLOCK_HEIGHT; reg++) {
  #if INPUT0_LAYOUT_B_FS_YX_FSV16
+        #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
+            INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &in[reg];
+            in_addr = in_start_addr + reg * INPUT0_Y_PITCH * FSV;
+            for (uint v = 0; v < PACK; v++) {
+                int f_addr = ((feature_location + v) / FSV + INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * INPUT0_FEATURE_PITCH * FSV  + (feature_location + v) % FSV;
+                input_int8_arr[v] = conv_input[in_addr + f_addr];                        
+            }
+        #else 
              in[reg] = *(__global PACKED_TYPE*)(conv_input + in_addr);
              in_addr += (INPUT0_SIZE_X + IWPAD) * 16;
+         #endif       
  #else
      #ifdef BLOCK_LOAD_INPUTS
              in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read(&conv_input[in_addr]));
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp

index 5cc8c4b..04ca03f 100644 (file)
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
@@ -7168,6 +7168,15 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
                              // Input X size, Input Y size, Input Z size, Input features, Output features,
                              // Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
                              // Input data format, Implementation name
+                            // Format: b_fs_yx_fsv16
+                            TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+                            TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""),
  
                              // Format: b_fs_yx_fsv4
                              TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
@@ -7188,7 +7197,7 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
                              TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv16, ""),
                              TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv16, ""),
                              TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv16, ""),
-                            
+
                              // Format: b_fs_zyx_fsv16
                              TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 17, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, ""),
                              TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 16, 3, 3, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
author	Mikołaj Życzyński <mikolaj.zyczynski@intel.com>
	Mon, 7 Sep 2020 08:52:06 +0000 (10:52 +0200)
committer	GitHub <noreply@github.com>
	Mon, 7 Sep 2020 08:52:06 +0000 (11:52 +0300)
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp		patch | blob | history
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl		patch | blob | history
inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp		patch | blob | history