[IE CLDNN] Support IC={1,2,4} in the first convolution kernel (#1583)
author     Alexander Chaiko <alexander.chaiko@intel.com>
           Wed, 5 Aug 2020 15:32:32 +0000 (17:32 +0200)
committer  GitHub <noreply@github.com>
           Wed, 5 Aug 2020 15:32:32 +0000 (18:32 +0300)
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_to_b_fs_yx_fsv16.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl
inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp

diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_to_b_fs_yx_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_to_b_fs_yx_fsv16.cpp
index b41aba2..5df33de 100644
@@ -110,8 +110,8 @@ bool ConvolutionKernel_bfyx_to_bfyx_f16::Validate(const Params& p, const optiona
     const auto& input = params.inputs[0];
     const auto& output = params.output;
 
-    // TODO Add support for different input features number in kernel
-    if (input.Feature().v != 3) {
+    // Up to 4 input features allowed
+    if (input.Feature().v > 4) {
         return false;
     }
 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl
index 7c6b06d..400aa49 100644
@@ -18,6 +18,9 @@
 
 #define FEATURE_SLICE_SIZE 16
 
+// OUTPUT_X_BLOCK_SIZE is one of 2, 4, 8
+#define UNIT_BLOCK_WRITEN(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, OUTPUT_X_BLOCK_SIZE)(ptr, offset, val)
+
 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
 __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
 KERNEL(convolution_bfyx_to_bfyx_f16)(
@@ -106,8 +109,8 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
     vec_t dst = UNIT_VAL_ZERO;
 #endif
 
-    UNIT_TYPE line_cache[3 * INPUT_BLOCK_SIZE];
-    for (int ic = 0; ic < 3; ic++)
+    UNIT_TYPE line_cache[INPUT0_FEATURE_NUM * INPUT_BLOCK_SIZE];
+    for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++)
     {
         __attribute__((opencl_unroll_hint(INPUT_BLOCK_SIZE)))
         for (int i = 0; i < INPUT_BLOCK_SIZE; i++)
@@ -133,13 +136,12 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
         __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
         for (int kw = 0; kw < FILTER_SIZE_X; kw++)
         {
-            MAKE_VECTOR_TYPE(UNIT_TYPE, 2) wei = UNIT_BLOCK_READ2(weights, filter_offset +
-                                                                           kh * filter_y_pitch +
-                                                                           kw * filter_x_pitch);
-            UNIT_TYPE wei2 = UNIT_BLOCK_READ(weights, filter_offset +
-                                                      kh * filter_y_pitch +
-                                                      kw * filter_x_pitch +
-                                                      2 * filter_isv_pitch);
+            uint offset = filter_offset + kh * filter_y_pitch + kw * filter_x_pitch;
+
+            UNIT_TYPE wei[INPUT0_FEATURE_NUM];
+            __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
+            for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++)
+                wei[ic] = UNIT_BLOCK_READ(weights, offset + ic * filter_isv_pitch);
 
             __attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE)))
             for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++)
@@ -147,13 +149,12 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
                 const uint buf_offset = (kw*DILATION_SIZE_X + STRIDE_SIZE_X * i + (kh) * INPUT_LINE_SIZE) / SUB_GROUP_SIZE;
                 const uint buf_group  = (kw*DILATION_SIZE_X + STRIDE_SIZE_X * i + (kh) * INPUT_LINE_SIZE) % SUB_GROUP_SIZE;
 
-                UNIT_TYPE src0 = intel_sub_group_shuffle(line_cache[0 * INPUT_BLOCK_SIZE + buf_offset], buf_group);
-                UNIT_TYPE src1 = intel_sub_group_shuffle(line_cache[1 * INPUT_BLOCK_SIZE + buf_offset], buf_group);
-                UNIT_TYPE src2 = intel_sub_group_shuffle(line_cache[2 * INPUT_BLOCK_SIZE + buf_offset], buf_group);
-
-                dst[i] = mad(wei[0], src0, dst[i]);
-                dst[i] = mad(wei[1], src1, dst[i]);
-                dst[i] = mad(wei2, src2, dst[i]);
+                UNIT_TYPE src[INPUT0_FEATURE_NUM];
+                __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
+                for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++) {
+                    src[ic] = intel_sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
+                    dst[i] = mad(wei[ic], src[ic], dst[i]);
+                }
             }
         }
     }
@@ -179,18 +180,7 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
             FUSED_OPS_VEC;
             dst = FUSED_OPS_RESULT_VEC;
 #endif
-            // TODO Generalize for other block sizes
-#if OUTPUT_X_BLOCK_SIZE == 8
-            UNIT_BLOCK_WRITE8(output, output_offset, dst);
-#elif OUTPUT_X_BLOCK_SIZE == 4
-            UNIT_BLOCK_WRITE4(output, output_offset, dst);
-#elif OUTPUT_X_BLOCK_SIZE == 2
-            UNIT_BLOCK_WRITE2(output, output_offset, dst);
-#elif OUTPUT_X_BLOCK_SIZE == 1
-            UNIT_BLOCK_WRITE(output, output_offset, dst);
-#else
-#   error convolution_gpu_bfyx_to_bfyx_f16.cl: Unsupported output x block size.
-#endif
+            UNIT_BLOCK_WRITEN(output, output_offset, dst);
         } else {
             const int x_tail = OUTPUT_SIZE_X - x;
             for (int i = 0; i < x_tail; i++) {
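
Note: the new UNIT_BLOCK_WRITEN macro replaces the removed #if ladder by pasting OUTPUT_X_BLOCK_SIZE onto UNIT_BLOCK_WRITE via CAT. A minimal C++ sketch of that two-level token-pasting pattern, assuming a CAT helper like the one clDNN already provides; the names here are illustrative, not the clDNN macros:

    #include <cstdio>

    // Two-level concatenation: CAT_ pastes, CAT first expands its arguments,
    // so CAT(WRITE, BLOCK_SIZE) becomes WRITE4 rather than WRITEBLOCK_SIZE.
    #define CAT_(a, b) a##b
    #define CAT(a, b)  CAT_(a, b)

    #define BLOCK_SIZE 4
    #define WRITE4(val) std::printf("write4: %d\n", (val))  // stand-in for UNIT_BLOCK_WRITE4

    // Dispatches to WRITE<BLOCK_SIZE> at preprocessing time, like UNIT_BLOCK_WRITEN.
    #define WRITEN(val) CAT(WRITE, BLOCK_SIZE)(val)

    int main() {
        WRITEN(42);   // expands to WRITE4(42) and prints "write4: 42"
    }
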
diff --git a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
index c7494d1..db5cac2 100644
@@ -209,14 +209,17 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
         fmt_prev == format::bfyx &&
         ((fmt_next == format::fs_b_yx_fsv32 && next.as<convolution>().get_primitive()->groups == 1) ||
         (fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
-        (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
-        (prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8)))) ||
         (fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3) ||
         (fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3 &&
         (next_output_layout.data_type != data_types::i8 && next_output_layout.data_type != data_types::u8))))
         return true;
 
     if (next.is_type<convolution>() &&
+        fmt_prev == format::bfyx &&
+        fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] <= 4)
+        return true;
+
+    if (next.is_type<convolution>() &&
         fmt_prev == format::b_fs_yx_fsv4 &&
         ((fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
         (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
@@ -397,7 +400,7 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
     auto required_feature_num = weak_restrictions ? feature_block_size / 2 : feature_block_size;
     auto correct_in_feature = (input_layout.size.feature[0] >= required_feature_num &&
                                weights_layout.size.batch[0] * weights_layout.size.group[0] >= required_feature_num);
-    if (!correct_in_feature && input_layout.size.feature[0] == 3 && weights_layout.size.batch[0] >= feature_block_size)
+    if (!correct_in_feature && input_layout.size.feature[0] <= 4 && weights_layout.size.batch[0] >= feature_block_size)
         correct_in_feature = true;
     auto depthwise = conv->groups == static_cast<uint32_t>(input_layout.size.feature[0]);  // depthwise conv
     auto out_features_per_group = weights_layout.size.batch[0];
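
Note: the convolution_b_fs_yx_fsv16_opt change extends the existing first-convolution exception: an input with fewer than 16 features normally disqualifies the fsv16 layout, but it is now allowed for IC <= 4 (instead of exactly 3) when the output features fill a full 16-wide block. A condensed, hypothetical sketch of that fallback, ignoring the other checks the real function performs:

    #include <iostream>

    // Hypothetical condensed view of the in-feature check; the real function
    // also looks at groups, data types and weights/group sizes.
    static bool fsv16_in_feature_ok(int in_features, int out_features,
                                    int required_features, int feature_block_size) {
        bool ok = in_features >= required_features;
        // First-convolution exception, now for IC <= 4 (previously IC == 3 only):
        if (!ok && in_features <= 4 && out_features >= feature_block_size)
            ok = true;
        return ok;
    }

    int main() {
        std::cout << fsv16_in_feature_ok(1, 64, 16, 16) << '\n';   // 1: IC=1 accepted via the exception
        std::cout << fsv16_in_feature_ok(4, 64, 16, 16) << '\n';   // 1: IC=4 accepted via the exception
        std::cout << fsv16_in_feature_ok(5, 64, 16, 16) << '\n';   // 0: not covered
    }
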