From 20ef9a94236be4f38f21cc84818d38fc04348891 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Wed, 3 Jun 2020 13:42:15 +0300 Subject: [PATCH] [IE CLDNN] Improve kernel selection for b_fs_yx_fsv16 layout and optimize Convolution kernels (#730) --- .../convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 12 ++++++--- .../core/cl_kernels/convolution_gpu_bfyx_f16.cl | 29 ++++++++++++---------- .../cl_kernels/convolution_gpu_bfyx_f16_1x1.cl | 18 +++----------- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index b0284db..bcb6a1d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -78,6 +78,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa auto autoTune = GetAutoTuneOptions(params, autoTuneIndex); kd.cldnnStyle.blockWidth = autoTune.blockWidth; + const auto& input = params.inputs[0]; const auto& out = params.output; auto x = out.X().v; auto y = out.Y().v; @@ -92,11 +93,16 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa kd.lws1 = sub_group_size; kd.lws2 = 1; + auto bBlockSizeX = x % autoTune.blockWidth == 0; + auto bBlockSizeXY = out.X().pad.Total() + out.Y().pad.Total() == 0; + auto bInputPad = input.X().pad.Total() + input.Y().pad.Total() != 0; + if (b == 1) { - if (x <= 8) + if ((bBlockSizeX || bBlockSizeXY) && !bInputPad) { kd.efficiency = FORCE_PRIORITY_1; - else - kd.efficiency = FORCE_PRIORITY_2; + } else { + kd.efficiency = FORCE_PRIORITY_3; + } } else { kd.efficiency = FORCE_PRIORITY_7; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl index 0adfb29..6af3b27 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2016-2019 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -161,9 +161,14 @@ KERNEL(convolution_bfyx_f16)( vec_t dst = INPUT0_VAL_ZERO; #endif // BIAS_TERM -#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD - for (uint g = group; g < group + groups_per_sub_group; g++) { +#if MULTIPLE_GROUPS_INPUT_PRELOAD + const uint in_split_offset = f_block * input_fs_pitch; + const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group); + const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group); + const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH; +#else #if GROUPED + for (uint g = group; g < group + groups_per_sub_group; g++) { const uint in_split_offset = g * input_fs_pitch * (FILTER_IFM_NUM / FEATURE_SLICE_SIZE); const uint filter_split_offset = g * FILTER_GROUPS_PITCH; const uint filter_offset = (f_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch; @@ -173,11 +178,6 @@ KERNEL(convolution_bfyx_f16)( const uint filter_offset = f_block * filter_os_pitch; #endif // GROUPED const uint grouped_filter_offset = filter_offset + filter_split_offset; -#else - const uint in_split_offset = f_block * input_fs_pitch; - const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group); - const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group); - const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH; #endif // MULTIPLE_GROUPS_INPUT_PRELOAD const uint grouped_input_offset = input_offset + in_split_offset; @@ -248,7 +248,11 @@ KERNEL(convolution_bfyx_f16)( vec_t src; __attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE))) for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) { +#if FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1 + src[i] = line_cache[i]; +#else src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i]; +#endif // FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1 } #if MULTIPLE_GROUPS_INPUT_PRELOAD typedef MAKE_VECTOR_TYPE(FILTER_TYPE, FILTER_IFM_NUM) ifm_vec_t; @@ -345,9 +349,9 @@ KERNEL(convolution_bfyx_f16)( } } } -#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD +#if GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD } -#endif // MULTIPLE_GROUPS_INPUT_PRELOAD +#endif // GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD dst = ACTIVATION(dst, ACTIVATION_PARAMS); typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) out_vec_t; @@ -370,7 +374,7 @@ KERNEL(convolution_bfyx_f16)( else #endif // OUTPUT_LEFTOVERS { - if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X) { + if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0) { #if HAS_FUSED_OPS FUSED_OPS_VEC; res = FUSED_OPS_RESULT_VEC; @@ -390,8 +394,7 @@ KERNEL(convolution_bfyx_f16)( # error convolution_gpu_bfyx_f16.cl: Unsupported output x block size. #endif } else { - const int x_tail = OUTPUT_SIZE_X - x; - for (int i = 0; i < x_tail; i++) { + for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) { #if HAS_FUSED_OPS FUSED_OPS_SCALAR; res[i] = FUSED_OPS_RESULT_SCALAR; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl index 25a2b36..155ed59 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl @@ -208,21 +208,10 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)( #endif { #if !PADDED_OUTPUT - if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y) { -#if HAS_FUSED_OPS - FUSED_OPS_VEC; - dst = FUSED_OPS_RESULT_VEC; -#endif -#if X_BLOCK_SIZE == 8 - UNIT_BLOCK_WRITE8(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst); -#elif X_BLOCK_SIZE == 4 - UNIT_BLOCK_WRITE4(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst); -#elif X_BLOCK_SIZE == 2 - UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst); -#endif - } else { + if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) { #else - if (x * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X) { + if (x + X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % X_BLOCK_SIZE == 0) { +#endif #if HAS_FUSED_OPS FUSED_OPS_VEC; dst = FUSED_OPS_RESULT_VEC; @@ -235,7 +224,6 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)( UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst); #endif } else { -#endif for (int i = 0; i < X_BLOCK_SIZE; i++) { if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y) return; -- 2.7.4