From 20ef9a94236be4f38f21cc84818d38fc04348891 Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov <Sergeishlyapnikov@gmail.com>
Date: Wed, 3 Jun 2020 13:42:15 +0300
Subject: [PATCH] [IE CLDNN] Improve kernel selection for b_fs_yx_fsv16 layout
 and optimize Convolution kernels (#730)

---
 .../convolution_kernel_b_fs_yx_fsv16_1x1.cpp       | 12 ++++++---
 .../core/cl_kernels/convolution_gpu_bfyx_f16.cl    | 29 ++++++++++++----------
 .../cl_kernels/convolution_gpu_bfyx_f16_1x1.cl     | 18 +++-----------
 3 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp
index b0284db..bcb6a1d 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp
@@ -78,6 +78,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa
     auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
     kd.cldnnStyle.blockWidth = autoTune.blockWidth;
 
+    const auto& input = params.inputs[0];
     const auto& out = params.output;
     auto x = out.X().v;
     auto y = out.Y().v;
@@ -92,11 +93,16 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa
     kd.lws1 = sub_group_size;
     kd.lws2 = 1;
 
+    auto bBlockSizeX = x % autoTune.blockWidth == 0;
+    auto bBlockSizeXY = out.X().pad.Total() + out.Y().pad.Total() == 0;
+    auto bInputPad = input.X().pad.Total() + input.Y().pad.Total() != 0;
+    
     if (b == 1) {
-        if (x <= 8)
+        if ((bBlockSizeX || bBlockSizeXY) && !bInputPad) {
             kd.efficiency = FORCE_PRIORITY_1;
-        else
-            kd.efficiency = FORCE_PRIORITY_2;
+        } else {
+            kd.efficiency = FORCE_PRIORITY_3;
+        }
     } else {
         kd.efficiency = FORCE_PRIORITY_7;
     }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl
index 0adfb29..6af3b27 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -161,9 +161,14 @@ KERNEL(convolution_bfyx_f16)(
     vec_t dst = INPUT0_VAL_ZERO;
 #endif  // BIAS_TERM
 
-#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD
-    for (uint g = group; g < group + groups_per_sub_group; g++) {
+#if MULTIPLE_GROUPS_INPUT_PRELOAD
+    const uint in_split_offset = f_block * input_fs_pitch;
+    const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
+    const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
+    const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
+#else
 #if GROUPED
+    for (uint g = group; g < group + groups_per_sub_group; g++) {
         const uint in_split_offset = g * input_fs_pitch * (FILTER_IFM_NUM / FEATURE_SLICE_SIZE);
         const uint filter_split_offset = g * FILTER_GROUPS_PITCH;
         const uint filter_offset = (f_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
@@ -173,11 +178,6 @@ KERNEL(convolution_bfyx_f16)(
         const uint filter_offset = f_block * filter_os_pitch;
 #endif  // GROUPED
         const uint grouped_filter_offset = filter_offset + filter_split_offset;
-#else
-        const uint in_split_offset = f_block * input_fs_pitch;
-        const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
-        const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
-        const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
 #endif  // MULTIPLE_GROUPS_INPUT_PRELOAD
 
         const uint grouped_input_offset = input_offset + in_split_offset;
@@ -248,7 +248,11 @@ KERNEL(convolution_bfyx_f16)(
                     vec_t src;
                     __attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE)))
                     for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
+#if FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
+                        src[i] = line_cache[i];
+#else
                         src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i];
+#endif  // FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
                     }
 #if MULTIPLE_GROUPS_INPUT_PRELOAD
                     typedef MAKE_VECTOR_TYPE(FILTER_TYPE, FILTER_IFM_NUM) ifm_vec_t;
@@ -345,9 +349,9 @@ KERNEL(convolution_bfyx_f16)(
                 }
             }
         }
-#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD
+#if GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
     }
-#endif  // MULTIPLE_GROUPS_INPUT_PRELOAD
+#endif  // GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
     dst = ACTIVATION(dst, ACTIVATION_PARAMS);
 
     typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) out_vec_t;
@@ -370,7 +374,7 @@ KERNEL(convolution_bfyx_f16)(
     else
 #endif  // OUTPUT_LEFTOVERS
     {
-        if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X) {
+        if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0) {
 #if HAS_FUSED_OPS
             FUSED_OPS_VEC;
             res = FUSED_OPS_RESULT_VEC;
@@ -390,8 +394,7 @@ KERNEL(convolution_bfyx_f16)(
 #   error convolution_gpu_bfyx_f16.cl: Unsupported output x block size.
 #endif
         } else {
-            const int x_tail = OUTPUT_SIZE_X - x;
-            for (int i = 0; i < x_tail; i++) {
+            for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) {
 #if HAS_FUSED_OPS
                 FUSED_OPS_SCALAR;
                 res[i] = FUSED_OPS_RESULT_SCALAR;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl
index 25a2b36..155ed59 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl
@@ -208,21 +208,10 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
 #endif
     {
 #if !PADDED_OUTPUT
-        if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y) {
-#if HAS_FUSED_OPS
-            FUSED_OPS_VEC;
-            dst = FUSED_OPS_RESULT_VEC;
-#endif
-#if X_BLOCK_SIZE == 8
-            UNIT_BLOCK_WRITE8(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#elif X_BLOCK_SIZE == 4
-            UNIT_BLOCK_WRITE4(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#elif X_BLOCK_SIZE == 2
-            UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#endif
-        } else {
+        if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) {
 #else
-        if (x * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X) {
+        if (x + X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % X_BLOCK_SIZE == 0) {
+#endif
 #if HAS_FUSED_OPS
             FUSED_OPS_VEC;
             dst = FUSED_OPS_RESULT_VEC;
@@ -235,7 +224,6 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
             UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
 #endif
         } else {
-#endif
             for (int i = 0; i < X_BLOCK_SIZE; i++) {
                 if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y)
                     return;
-- 
2.7.4